promptfoo 0.121.4 → 0.121.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/{ListApp-DQkFNqE9.js → ListApp-DLmM02JS.js} +1 -1
- package/dist/src/{accounts-DhMYUUbu.js → accounts-Ca7WIoPY.js} +12 -7
- package/dist/src/{accounts-F9d_5sMC.js → accounts-CjFnOPmb.js} +14 -9
- package/dist/src/{accounts-Dy17bs4D.cjs → accounts-CmWzeD2d.cjs} +16 -10
- package/dist/src/{accounts-DdJ2pHMI.js → accounts-DanM1wq_.js} +13 -8
- package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-CJ0j3fBi.js} +3 -3
- package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-DDEGRV9v.js} +3 -3
- package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-DvPWSUpb.cjs} +8 -7
- package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-TxUEMPYS.js} +2 -2
- package/dist/src/{agents-BahDpe5G.cjs → agents-B4sRuXg3.cjs} +7 -6
- package/dist/src/{agents-pQeBEXMm.js → agents-B8q7h_ek.js} +5 -5
- package/dist/src/{agents-CgaMXvLM.js → agents-CBgJvRkB.js} +21 -10
- package/dist/src/{agents-C-R_jfzI.js → agents-CYn2n3QP.js} +4 -4
- package/dist/src/{agents-8FDnTriG.js → agents-D-vDNFx4.js} +21 -10
- package/dist/src/{agents-aYPQLf8W.js → agents-LrHuQqr1.js} +20 -9
- package/dist/src/{agents-DJ35I3Nt.js → agents-QGg76OF-.js} +5 -5
- package/dist/src/{agents-D7-HGxUj.cjs → agents-eHZ9nlgA.cjs} +21 -10
- package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-CJEbQ0o6.js} +7 -7
- package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-D5HXzZ0s.js} +6 -6
- package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-T6HGNxNe.cjs} +7 -7
- package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-eYv3a_DK.js} +7 -7
- package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/app/assets/Report-BNHJKN35.js +1 -0
- package/dist/src/app/assets/index-BnT6P6sF.js +388 -0
- package/dist/src/app/assets/index-yhM8y1PP.css +1 -0
- package/dist/src/app/assets/{scroll-timeline-D9IT_e8Z.js → scroll-timeline-RpeTwOvs.js} +1 -1
- package/dist/src/app/assets/sync-5gq6fmG4.js +4 -0
- package/dist/src/app/assets/vendor-charts-BL9OMNU7.js +36 -0
- package/dist/src/app/assets/{vendor-markdown-Ch00wnNI.js → vendor-markdown-BYsQqn7Z.js} +10 -10
- package/dist/src/app/assets/{vendor-react-CVvmk1UB.js → vendor-react-CqWgVW6T.js} +2 -2
- package/dist/src/app/assets/{vendor-utils-BnEYbx2Q.js → vendor-utils-BHPO71pu.js} +1 -1
- package/dist/src/app/index.html +31 -6
- package/dist/src/{audio-COrn8rM6.js → audio-BqnRvcWG.js} +3 -3
- package/dist/src/{audio-DcVKoInv.js → audio-CPMtV1yR.js} +4 -4
- package/dist/src/{audio-B7izf48x.js → audio-DyiebVB3.js} +4 -4
- package/dist/src/{audio-BQtNuYBj.cjs → audio-FnxbEnSE.cjs} +4 -4
- package/dist/src/authoritativeMarkupInjection-BZIywVjG.js +74 -0
- package/dist/src/authoritativeMarkupInjection-DyAXAsSr.js +75 -0
- package/dist/src/authoritativeMarkupInjection-F2gBw0lN.cjs +74 -0
- package/dist/src/authoritativeMarkupInjection-QEQmFS83.js +74 -0
- package/dist/src/{base-PYJvBE1i.js → base-CKLo890h.js} +4 -3
- package/dist/src/{base-fZ9wgg50.js → base-Co80MMCi.js} +5 -4
- package/dist/src/{base-D-670DX8.cjs → base-DGJW48uz.cjs} +5 -4
- package/dist/src/{base-yrI1Yal4.js → base-E9I8zXjz.js} +5 -4
- package/dist/src/bestOfN-B3wNzjSB.js +137 -0
- package/dist/src/bestOfN-BBsO41z4.js +136 -0
- package/dist/src/bestOfN-CAwmg5UL.cjs +140 -0
- package/dist/src/bestOfN-_kTi8Bxe.js +136 -0
- package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B0977K1O.cjs} +7 -6
- package/dist/src/{blobs-BCZavS8s.js → blobs-CeFdPn_T.js} +3 -3
- package/dist/src/{blobs-BQWqnnvL.js → blobs-DODuTK-a.js} +3 -3
- package/dist/src/{blobs-C-F78Kfn.js → blobs-Dwef1Ao1.js} +2 -2
- package/dist/src/{cache-BIyPcp5v.cjs → cache-CPGUA4Yl.cjs} +135 -25
- package/dist/src/cache-Cf7b4pWE.js +3 -0
- package/dist/src/{cache-D5NZmMiT.js → cache-DIXbtkNO.js} +125 -10
- package/dist/src/{cache-mb7c8hbp.js → cache-DpPWrkTE.js} +128 -12
- package/dist/src/{cache-C4Xb-hNb.js → cache-roFAE0cI.js} +126 -11
- package/dist/src/{chat-I9izLm49.js → chat-CUCorGiL.js} +12 -12
- package/dist/src/{chat-BPXSW8Bv.cjs → chat-DG1wG4w0.cjs} +6 -6
- package/dist/src/{chat-BfPaS15_.js → chat-Dabu84Br.js} +12 -12
- package/dist/src/{chat-Dr3DUQ0D.js → chat-DqUFcWI0.js} +12 -12
- package/dist/src/{chat-CclRbxGf.cjs → chat-DxTDQ83C.cjs} +14 -13
- package/dist/src/{chat-MKxMnZJZ.js → chat-GmlolEwo.js} +4 -4
- package/dist/src/{chat-0bwXjVP0.js → chat-TP8Qifkh.js} +6 -6
- package/dist/src/{chat-mW0ORo8G.js → chat-iwaM5UTQ.js} +6 -6
- package/dist/src/{chatkit-zUIVoDos.js → chatkit-B6DWi70Q.js} +4 -4
- package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-BYveR48_.cjs} +6 -5
- package/dist/src/{chatkit-Cv6AhukM.js → chatkit-fARZwEfV.js} +3 -3
- package/dist/src/{chatkit-CJnHRRMM.js → chatkit-lb6FK02w.js} +4 -4
- package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BQNp_y-F.js} +212 -67
- package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-D5Jl0SDh.js} +212 -67
- package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-DH416NBD.cjs} +218 -72
- package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-x1XJ1-pU.js} +212 -67
- package/dist/src/{cloud-DQZ5sVjW.js → cloud-D3DiFqH6.js} +3 -3
- package/dist/src/cloud-p96PA4MH.js +3 -0
- package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-B6NVI3ax.js} +4 -4
- package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-CEAW-xQa.cjs} +6 -6
- package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-RFSojyXG.js} +6 -6
- package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-r4tbYmWU.js} +6 -6
- package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-BCkLouto.js} +5 -5
- package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-BaZ4insB.js} +3 -3
- package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-CF-Vb-2Z.js} +5 -5
- package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-TJMLBj6I.cjs} +5 -5
- package/dist/src/codex-app-server-B8KHEiF4.js +1915 -0
- package/dist/src/codex-app-server-CnrLBCeA.cjs +1921 -0
- package/dist/src/codex-app-server-DIXZ230V.js +1915 -0
- package/dist/src/codex-app-server-Dd22dC_N.js +1916 -0
- package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-B6Wah8Pa.js} +6 -6
- package/dist/src/codex-sdk-BGjVAk23.js +3 -0
- package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-CFF6gUyi.cjs} +18 -10
- package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-CmQABzV3.js} +6 -6
- package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-D2d54RL8.js} +5 -5
- package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-Bu9B8NUY.js} +8 -8
- package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CtzNCHKu.js} +7 -7
- package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-DHCDlQUI.cjs} +8 -8
- package/dist/src/{cometapi-BUlt_ELa.js → cometapi-OBILPLlu.js} +8 -8
- package/dist/src/{completion-HUe8wDhZ.js → completion-CO2e1_62.js} +6 -6
- package/dist/src/{completion-BozdoXba.cjs → completion-CSYfl2cd.cjs} +6 -6
- package/dist/src/{completion-x0a_c2y1.js → completion-DZNxcyfG.js} +6 -6
- package/dist/src/{completion-Dnxn7E-j.js → completion-sNvCLTAP.js} +5 -5
- package/dist/src/constants-BjJV0cRr.js +6 -0
- package/dist/src/constants-DH5XYLKZ.js +7 -0
- package/dist/src/constants-DZGEFLsu.js +6 -0
- package/dist/src/constants-a2kYssQk.cjs +11 -0
- package/dist/src/{createHash-4gFQpDDv.js → createHash-BtbSX3mj.js} +1 -1
- package/dist/src/{createHash-CwDVU5xr.js → createHash-CGVzWdjj.js} +1 -1
- package/dist/src/{createHash-B7KvgoOD.cjs → createHash-CSiqnK5P.cjs} +2 -2
- package/dist/src/{createHash-ChI45QR1.js → createHash-CgRvs4Fn.js} +1 -1
- package/dist/src/crescendo-BXEJK_bi.cjs +704 -0
- package/dist/src/crescendo-CU_Y2i-m.js +702 -0
- package/dist/src/crescendo-J1Xx4_zb.js +703 -0
- package/dist/src/crescendo-QiaSLW0d.js +701 -0
- package/dist/src/custom-BJfP00Bh.js +619 -0
- package/dist/src/custom-CZVn-1-r.js +620 -0
- package/dist/src/custom-Cqia7M0D.cjs +621 -0
- package/dist/src/custom-notggYVl.js +618 -0
- package/dist/src/{docker-DCgsveLD.js → docker-4D1eL6Gq.js} +6 -6
- package/dist/src/{docker-ClnmCf1Z.js → docker-BBv1WUDu.js} +5 -5
- package/dist/src/{docker-DS4_Osau.cjs → docker-D06JUoe2.cjs} +6 -6
- package/dist/src/{docker-CQmlA2NU.js → docker-DdJQBxK9.js} +6 -6
- package/dist/src/{embedding-D3xTseo7.js → embedding--UZVe4_7.js} +6 -6
- package/dist/src/{embedding-I45KG3o7.cjs → embedding-BbrwopfX.cjs} +6 -6
- package/dist/src/{embedding-nFbumxcv.js → embedding-Bi3rxrZF.js} +5 -5
- package/dist/src/{embedding-DD9wa3ae.js → embedding-C251p1-8.js} +6 -6
- package/dist/src/{errors-Cw810C93.js → errors-9PcUL8BC.js} +1 -1
- package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
- package/dist/src/{esm-CtEPLdAj.cjs → esm-BIKakvNa.cjs} +8 -7
- package/dist/src/{esm-C7PnfdF8.js → esm-BTK1W7lG.js} +1 -1
- package/dist/src/{esm-tVgYPY-f.js → esm-Bexx2PFc.js} +2 -2
- package/dist/src/{eval-u4UVafl6.js → eval-0VRANImH.js} +21 -21
- package/dist/src/{eval-CzJFfFO9.js → eval-DscR5iOM.js} +1 -1
- package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-2RRJvFyB.js} +41 -16
- package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-CvtS8h8u.cjs} +51 -15
- package/dist/src/evalResult-DqzsS6_W.js +3 -0
- package/dist/src/{evalResult-D3hVYFis.js → evalResult-eUkJv9Ko.js} +40 -15
- package/dist/src/evaluator-DNdJF1Gv.js +3 -0
- package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-DRoiYB2q.js} +1060 -187
- package/dist/src/evaluatorHelpers-BsYP_muT.js +511 -0
- package/dist/src/evaluatorHelpers-CRqTvSux.cjs +537 -0
- package/dist/src/evaluatorHelpers-DuqFFfq7.js +510 -0
- package/dist/src/{extractor-CAfTSraf.js → extractor-BR7XAzAL.js} +6 -6
- package/dist/src/{extractor-WVPOrH43.cjs → extractor-BdxEtt3J.cjs} +6 -6
- package/dist/src/{extractor-DNSeBVOJ.js → extractor-CIW3iN-b.js} +6 -6
- package/dist/src/{extractor-Dk6bRWkv.js → extractor-CxRtnaHl.js} +5 -5
- package/dist/src/{fetch-B0Z3Oe4k.js → fetch-BufrQtvR.js} +93 -40
- package/dist/src/{fetch-BEWnXrrG.js → fetch-DXUnXkVU.js} +89 -40
- package/dist/src/{fetch-CJU5ELPa.cjs → fetch-Dw4XZHjj.cjs} +330 -270
- package/dist/src/{fetch-Di00EQrc.js → fetch-It34O8Ur.js} +305 -252
- package/dist/src/fetch-_YgGd2qv.js +3 -0
- package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-BhdwzYaD.cjs} +24 -1
- package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-CXRfY3Ss.js} +12 -2
- package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-D4GCJ67J.js} +12 -2
- package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-CMVNrYvE.js} +1 -1
- package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-14nugQQx.cjs} +14 -2
- package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-BPVvltoW.js} +2 -2
- package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-D18lYzhB.js} +2 -2
- package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-jJKYsnjc.js} +2 -2
- package/dist/src/goat-Ckd3q3AY.js +467 -0
- package/dist/src/goat-Qgurm-NP.js +466 -0
- package/dist/src/goat-ghadEDdy.js +465 -0
- package/dist/src/goat-una6pZGP.cjs +469 -0
- package/dist/src/graders-BDT7dif6.js +3 -0
- package/dist/src/{graders-eIHhRqoC.js → graders-BGP99PdK.js} +2416 -2224
- package/dist/src/{graders-Zy3x0zqX.js → graders-BX0f2tvS.js} +2423 -2226
- package/dist/src/{graders-pvbReLLn.js → graders-C0nXU_ZP.js} +1806 -1609
- package/dist/src/{graders--zknU_uk.cjs → graders-ClrU2fnd.cjs} +2219 -1949
- package/dist/src/hydra-BSNZZm2M.js +543 -0
- package/dist/src/hydra-BxdG4nkg.js +541 -0
- package/dist/src/hydra-DE4xWwyc.js +542 -0
- package/dist/src/hydra-DrJttnvw.cjs +542 -0
- package/dist/src/image-B4oBtu6J.js +443 -0
- package/dist/src/{image-dnoUgPrC.js → image-BN-hjLL9.js} +4 -4
- package/dist/src/{image-9302QVqR.js → image-B_fPIwdg.js} +3 -3
- package/dist/src/image-BvUAW344.js +442 -0
- package/dist/src/image-Cvjwx1uY.js +442 -0
- package/dist/src/{image-De2FBmYV.cjs → image-DfVCGPbI.cjs} +4 -4
- package/dist/src/{image-u7-rKnYU.js → image-QzmydkiG.js} +4 -4
- package/dist/src/image-X0oY4350.cjs +465 -0
- package/dist/src/index.cjs +1689 -558
- package/dist/src/index.d.cts +3270 -1624
- package/dist/src/index.d.ts +3270 -1624
- package/dist/src/index.js +1553 -438
- package/dist/src/indirectWebPwn-02ZIghCS.js +259 -0
- package/dist/src/indirectWebPwn-BJ22AbQa.cjs +397 -0
- package/dist/src/indirectWebPwn-CbjUG0rh.js +385 -0
- package/dist/src/indirectWebPwn-CfQJt3gk.cjs +260 -0
- package/dist/src/indirectWebPwn-DBQhOjoD.js +260 -0
- package/dist/src/indirectWebPwn-OsXnKejv.js +259 -0
- package/dist/src/indirectWebPwn-tNx9OZ35.js +385 -0
- package/dist/src/indirectWebPwn-uyWdHx04.js +386 -0
- package/dist/src/inputVariables-B0qUChbV.js +467 -0
- package/dist/src/inputVariables-DUGMb9Ka.js +464 -0
- package/dist/src/inputVariables-DXFdi7AI.js +468 -0
- package/dist/src/inputVariables-Dq9W-Z3a.cjs +475 -0
- package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-C4QlIuoR.js} +2 -2
- package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
- package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
- package/dist/src/{invariant-kfQ8Bu82.cjs → invariant-QtnLD03y.cjs} +1 -1
- package/dist/src/iterative-CpU6i2As.js +490 -0
- package/dist/src/iterative-DJQEQpG3.js +491 -0
- package/dist/src/iterative-DQBuWM-j.cjs +493 -0
- package/dist/src/iterative-FTS4Bz67.js +492 -0
- package/dist/src/iterativeImage-BUABMVOA.js +413 -0
- package/dist/src/iterativeImage-ByFWkxax.cjs +415 -0
- package/dist/src/iterativeImage-BzUapOUi.js +414 -0
- package/dist/src/iterativeImage-Doz8mgxF.js +413 -0
- package/dist/src/iterativeMeta-B3YiAOc8.js +386 -0
- package/dist/src/iterativeMeta-C7APE_P1.js +385 -0
- package/dist/src/iterativeMeta-CSS8M6Ds.cjs +385 -0
- package/dist/src/iterativeMeta-DgoQ7bLh.js +384 -0
- package/dist/src/iterativeTree-B5zxBBSW.js +769 -0
- package/dist/src/iterativeTree-CNyIk0Yn.js +768 -0
- package/dist/src/iterativeTree-CPMF10ve.cjs +771 -0
- package/dist/src/iterativeTree-DvZ7GBwt.js +770 -0
- package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-BadkINlJ.js} +24 -10
- package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-Bi_8sV-H.js} +25 -11
- package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CkMljjdg.js} +25 -11
- package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-DUh34xba.cjs} +25 -11
- package/dist/src/{litellm-DRjpcSa7.js → litellm-BKBo0jpC.js} +5 -5
- package/dist/src/{litellm-C2kqjxqp.js → litellm-BXyn5kZK.js} +5 -5
- package/dist/src/{litellm-p37R1dzQ.js → litellm-CNcfbCfa.js} +4 -4
- package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CtAr7bKG.cjs} +5 -5
- package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
- package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
- package/dist/src/{logger-COuQb2xB.cjs → logger-cfNpzI4o.cjs} +13 -55
- package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-BMX1iEB6.js} +5 -5
- package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CR5TSpp4.js} +5 -5
- package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-D3FUc2K3.cjs} +9 -8
- package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-OEMmS1RB.js} +6 -6
- package/dist/src/main.js +909 -369
- package/dist/src/memoryPoisoning-CM83NWYl.js +107 -0
- package/dist/src/memoryPoisoning-D8h9gXJF.js +106 -0
- package/dist/src/memoryPoisoning-Dp-btinn.cjs +106 -0
- package/dist/src/memoryPoisoning-cLuCoTuJ.js +106 -0
- package/dist/src/{messages-BTQz42fn.js → messages-BabO-cX8.js} +273 -17
- package/dist/src/{messages-811uVVW5.cjs → messages-DBPir0TQ.cjs} +278 -18
- package/dist/src/{messages-zWbkLLHz.js → messages-DGUlSNU7.js} +273 -17
- package/dist/src/{messages-MYTQ2TWp.js → messages-vsE_-Lv0.js} +273 -17
- package/dist/src/{meteor-DHdzY1Ss.js → meteor--TZYICTI.js} +2 -2
- package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-CR226f7Z.cjs} +2 -2
- package/dist/src/{meteor-CU5UAE-H.js → meteor-Cl_yd7rJ.js} +2 -2
- package/dist/src/{meteor-DuAFv6gF.js → meteor-Dce-_zGQ.js} +1 -1
- package/dist/src/mischievousUser-0l8GD7Dp.js +46 -0
- package/dist/src/mischievousUser-BUOP9W5r.js +46 -0
- package/dist/src/mischievousUser-frFYKxu6.js +47 -0
- package/dist/src/mischievousUser-olGgHIVR.cjs +46 -0
- package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-CNV5bMSk.cjs} +7 -7
- package/dist/src/{modelslab-D0erNWKe.js → modelslab-Cogmu4mG.js} +6 -6
- package/dist/src/{modelslab-DIq-6y7x.js → modelslab-Dzst7VTU.js} +6 -6
- package/dist/src/{modelslab-wu9yi5GE.js → modelslab-EyDczZ5A.js} +7 -7
- package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-BGPNBOMS.js} +6 -6
- package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-B_5NKFu1.js} +5 -5
- package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-C4eUJGse.js} +5 -5
- package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-CjJRxI1X.cjs} +9 -8
- package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BNGmgfFz.js} +4 -4
- package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-ChPlh5na.js} +4 -4
- package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CrV0iaY_.js} +3 -3
- package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-DuOG9Aun.cjs} +5 -4
- package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
- package/dist/src/{openai-URNyItar.cjs → openai-C3uXv8wS.cjs} +2 -2
- package/dist/src/{openai-Qsvz25mV.js → openai-CJrsh9n4.js} +2 -2
- package/dist/src/{openai-iYtrXzOX.js → openai-zgwBb4Ff.js} +1 -1
- package/dist/src/{openclaw-CnQ363Wi.js → openclaw-BIHlu_36.js} +10 -8
- package/dist/src/{openclaw-CwzlQSQX.js → openclaw-CF7fMido.js} +9 -7
- package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-Dphc01BY.cjs} +18 -15
- package/dist/src/{openclaw-CLWrW03k.js → openclaw-zIJAsz3P.js} +10 -8
- package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-B3vlPLsp.js} +40 -5
- package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-D05JSgMQ.js} +40 -5
- package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-DoY6GbWw.cjs} +46 -10
- package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-sRKYHGoI.js} +39 -4
- package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver--gTpSagc.js} +120 -4
- package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-B2eaKC8C.js} +120 -4
- package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-BXjcRqAM.js} +119 -3
- package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-CvJdBGSc.cjs} +125 -7
- package/dist/src/packageParser--MWTSrPW.js +36 -0
- package/dist/src/packageParser-CgE-ziRo.js +35 -0
- package/dist/src/packageParser-QoCS1FMl.cjs +54 -0
- package/dist/src/packageParser-hwwSGnAZ.js +35 -0
- package/dist/src/processShim-BBxt7LKO.js +95 -0
- package/dist/src/processShim-BcGzU8fY.js +94 -0
- package/dist/src/processShim-C_z3aRvF.js +94 -0
- package/dist/src/processShim-DSY9BV2T.cjs +98 -0
- package/dist/src/promptLength-0qIHyhA5.js +71 -0
- package/dist/src/promptLength-4X-Wd8PG.js +72 -0
- package/dist/src/promptLength-B9nZEfO6.js +71 -0
- package/dist/src/promptLength-BbBbDHNj.cjs +94 -0
- package/dist/src/promptfoo-BDrfT30-.js +180 -0
- package/dist/src/promptfoo-Cm4hiy1Y.js +180 -0
- package/dist/src/promptfoo-Rjp-MeBb.js +181 -0
- package/dist/src/promptfoo-b-baRMj-.cjs +205 -0
- package/dist/src/prompts-BYMtqPCw.js +259 -0
- package/dist/src/prompts-C-bqE1Yp.js +260 -0
- package/dist/src/prompts-Cp_Qx5Ml.js +270 -0
- package/dist/src/prompts-DHhQsANy.js +259 -0
- package/dist/src/prompts-D_QpZ2Dm.js +271 -0
- package/dist/src/prompts-hNvWBD3z.cjs +284 -0
- package/dist/src/prompts-huDVH2CI.js +270 -0
- package/dist/src/prompts-p78Hul5i.cjs +289 -0
- package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
- package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-CZO_w7ue.cjs} +2 -2
- package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DHcFiVWX.js} +1 -1
- package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-ReCd0sFa.js} +2 -2
- package/dist/src/{providers-BV_KMZje.js → providers-B9KzWxAX.js} +10558 -21587
- package/dist/src/{providers-DruaQfwu.js → providers-BCCz6_IX.js} +1228 -12196
- package/dist/src/{providers-1eKkXBKp.cjs → providers-BDVVIQM6.cjs} +10649 -21843
- package/dist/src/{providers-iUt5fbAN.js → providers-BYAn82cf.js} +1 -1
- package/dist/src/{providers-Domz_llv.js → providers-DVYRZP4E.js} +10589 -21570
- package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-CLCgQ9tt.js} +3 -3
- package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-CgYxeSmO.js} +3 -3
- package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-Cokhluq3.cjs} +8 -7
- package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-D0BYebvX.js} +3 -3
- package/dist/src/{quiverai-DFotyafY.cjs → quiverai-BAp6iTZD.cjs} +4 -4
- package/dist/src/{quiverai-aPPvXOgn.js → quiverai-BvIhI_0l.js} +4 -4
- package/dist/src/{quiverai-DR0SnIQV.js → quiverai-CdTWPe-A.js} +3 -3
- package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Cv7rJKDz.js} +4 -4
- package/dist/src/registry-BUJrgjwv.js +124 -0
- package/dist/src/registry-DXm1t_x0.js +125 -0
- package/dist/src/registry-Dp5EqoXc.js +124 -0
- package/dist/src/registry-KCVF1CFC.cjs +124 -0
- package/dist/src/{server-D6Il2Sob.js → remoteGeneration-B1_XsKXU.js} +16 -108
- package/dist/src/{server-BSB45Nt9.js → remoteGeneration-COpWcmWd.js} +15 -146
- package/dist/src/{server-Dx2TyCH2.cjs → remoteGeneration-DS9N3pgB.cjs} +30 -119
- package/dist/src/remoteGeneration-DsaSwmG2.js +217 -0
- package/dist/src/render-BNTrbmBw.cjs +384 -0
- package/dist/src/render-CSP99NLm.js +348 -0
- package/dist/src/render-DFfDeYUK.js +347 -0
- package/dist/src/{render-CgVDrJmM.js → render-DznWrxGO.js} +2 -2
- package/dist/src/render-_6ur1fhE.js +347 -0
- package/dist/src/resourceAttributes-D1jP3kL5.js +17 -0
- package/dist/src/resourceAttributes-DQbBB--2.js +16 -0
- package/dist/src/resourceAttributes-ephgOvdR.cjs +27 -0
- package/dist/src/resourceAttributes-v6-I67fn.js +16 -0
- package/dist/src/{responses-Bi9vBuW_.cjs → responses-1UFFF9N_.cjs} +51 -16
- package/dist/src/{responses-DL9m8CyY.js → responses-B3W2JvOQ.js} +49 -15
- package/dist/src/{responses--OsX2aYW.js → responses-B6ktc3Ra.js} +49 -15
- package/dist/src/{responses-C-flexAY.js → responses-URRzV8qE.js} +49 -15
- package/dist/src/rolldown-runtime-D_mwlA32.cjs +43 -0
- package/dist/src/rubyUtils-BYVlQ94c.js +3 -0
- package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CXlFM2rR.js} +3 -3
- package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-CnlW8AYb.js} +3 -3
- package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-CqUWBZAt.cjs} +18 -27
- package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DdGojpfv.js} +3 -3
- package/dist/src/runtimeTransform-BJOpL9Yc.js +142 -0
- package/dist/src/runtimeTransform-Dgh_D7DU.js +143 -0
- package/dist/src/runtimeTransform-DigbjU1r.js +142 -0
- package/dist/src/runtimeTransform-ON3YYILw.cjs +147 -0
- package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-CujrzP1a.js} +62 -51
- package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-DzffAqo_.cjs} +65 -53
- package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-vhtSV7JI.js} +62 -51
- package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-yr1QKeBs.js} +61 -50
- package/dist/src/{scanner-1DqWi1Ej.js → scanner-DS0109SS.js} +7 -7
- package/dist/src/server/index.js +5105 -605
- package/dist/src/server-B8rqV126.cjs +126 -0
- package/dist/src/server-BaLytskk.js +3 -0
- package/dist/src/server-CMJD10J4.js +107 -0
- package/dist/src/server-Ddp8GNMp.js +146 -0
- package/dist/src/server-DhMHosWj.js +182 -0
- package/dist/src/shared-7pmVZLNO.js +1334 -0
- package/dist/src/shared-9WHQ1oNE.js +1335 -0
- package/dist/src/{fileExtensions-BArZuxsI.js → shared-BoG7qLMv.js} +12 -2
- package/dist/src/shared-D6IjElRI.js +1334 -0
- package/dist/src/shared-WkgnDkcg.cjs +1436 -0
- package/dist/src/{signal-CE5G3a7x.js → signal-CSurUUyV.js} +3 -3
- package/dist/src/simulatedUser-C9aQObBI.js +222 -0
- package/dist/src/simulatedUser-Cu601Dd4.cjs +227 -0
- package/dist/src/simulatedUser-U_qAHnuB.js +222 -0
- package/dist/src/simulatedUser-p3tACcmw.js +223 -0
- package/dist/src/{slack-DDUe-5MC.js → slack-Bapo-7_8.js} +2 -2
- package/dist/src/{slack-1Rhq0EoV.cjs → slack-DMC1QVEg.cjs} +3 -2
- package/dist/src/{slack-D5Wpy8LM.js → slack-DTEFhrMn.js} +2 -2
- package/dist/src/{slack-acRb0IqQ.js → slack-k-_CP84Q.js} +1 -1
- package/dist/src/storage-BU4qcnOb.js +875 -0
- package/dist/src/storage-CA-v9V2v.cjs +911 -0
- package/dist/src/storage-CD-GWAdx.js +822 -0
- package/dist/src/storage-QdU-SmvD.js +834 -0
- package/dist/src/{store-DAAyxcy6.cjs → store-B2NDDooM.cjs} +60 -24
- package/dist/src/{store-CYEy5J2D.js → store-DKd5592Q.js} +51 -20
- package/dist/src/{store-M0b1WfYb.js → store-HpopRVzl.js} +50 -19
- package/dist/src/store-IbiRIF3k.js +3 -0
- package/dist/src/strategies-7CS3Alao.cjs +2360 -0
- package/dist/src/strategies-CiSeroPH.js +2331 -0
- package/dist/src/strategies-DRJjGTIY.js +2333 -0
- package/dist/src/{tables-DQ4WU5tX.js → tables-CRSXQ2Ke.js} +2 -2
- package/dist/src/{tables-CsWou1Bx.js → tables-CxjU7bBd.js} +3 -3
- package/dist/src/{tables-DUfh1F7Z.cjs → tables-DBIJU0WE.cjs} +6 -5
- package/dist/src/{tables-C4CH3zRr.js → tables-DafUHOeh.js} +3 -3
- package/dist/src/{telemetry-CQPez_Jp.js → telemetry-00ezXr_t.js} +5 -4
- package/dist/src/telemetry-ByPqDcKC.js +3 -0
- package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CJ7FnCsc.cjs} +18 -11
- package/dist/src/{telemetry-dbaJ0E98.js → telemetry-DmXYcJNV.js} +5 -4
- package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-DwX9XUN5.js} +4 -3
- package/dist/src/{text-KvuD2Iko.js → text-Db-Wt2u2.js} +1 -1
- package/dist/src/{text-DHxdyQqT.js → text-DwYK5EBn.js} +1 -1
- package/dist/src/{text-BVi-cLPJ.cjs → text-nywWsRBM.cjs} +1 -1
- package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-BjVkdk18.js} +1 -1
- package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-CDet74yk.js} +1 -1
- package/dist/src/tokenUsageUtils-CmnQ0G2m.js +142 -0
- package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-_B-P8IAi.cjs} +1 -1
- package/dist/src/toolAttributes-BAjwcBf0.cjs +103 -0
- package/dist/src/toolAttributes-COVgDrBG.js +87 -0
- package/dist/src/toolAttributes-DJ9ZEKXD.js +86 -0
- package/dist/src/tracingOptions-BnwKCkSB.js +221 -0
- package/dist/src/tracingOptions-Chi74lOD.js +219 -0
- package/dist/src/tracingOptions-DrbSFaKy.cjs +249 -0
- package/dist/src/tracingOptions-ji2OuXbT.js +220 -0
- package/dist/src/{transcription-DuWDupG7.js → transcription-B8uIgCYX.js} +5 -5
- package/dist/src/{transcription-CJspiD2c.js → transcription-CfU5loSq.js} +6 -6
- package/dist/src/{transcription-V2HaAmy2.js → transcription-Dkd22_4K.js} +6 -6
- package/dist/src/{transcription-BvjmiYB1.cjs → transcription-mzuf18Mq.cjs} +9 -8
- package/dist/src/{transform-lQrDE1BQ.js → transform-BIMynQsA.js} +9 -9
- package/dist/src/transform-BnSTnFlp.js +187 -0
- package/dist/src/transform-BnSXWmU_2.cjs +221 -0
- package/dist/src/transform-CGt7Kt3y2.js +186 -0
- package/dist/src/transform-CrPGTsij.js +186 -0
- package/dist/src/{transform-CTeuTR3S.cjs → transform-DhNkAUs8.cjs} +13 -12
- package/dist/src/{transform-CG0ehZNG.js → transform-DmvYBRll.js} +9 -9
- package/dist/src/{transform-zDhMmzwX.js → transform-EtD4jAWi.js} +9 -9
- package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-0ThtPved.js} +1 -1
- package/dist/src/transformersAvailability-BYydDE5U.js +35 -0
- package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-BvyU9vDD.js} +1 -1
- package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-BytPvKUW.cjs} +1 -1
- package/dist/src/{types-Dm9JM6Vb.js → types-BFevViUY.js} +115 -19
- package/dist/src/{types-Bgh5SOn6.js → types-BJQBBPTP.js} +115 -19
- package/dist/src/{types-CeaeaZdP.cjs → types-CxJvaY2S.cjs} +357 -172
- package/dist/src/{types-BGQDAP8i.js → types-D6glLbdF.js} +271 -170
- package/dist/src/{util-BYvQUPp7.js → util--WMgw7wM.js} +28 -8
- package/dist/src/{util-C9J8ahRn.js → util-5WnCSb0h.js} +72 -48
- package/dist/src/{util-CN3SrLT4.cjs → util-BSIuSLVK.cjs} +74 -49
- package/dist/src/{util-C8e5uydV.js → util-Bx677_k2.js} +154 -147
- package/dist/src/util-CN8om2rz.cjs +386 -0
- package/dist/src/{util-DDs-7g6-.js → util-CoQWM76y.js} +28 -8
- package/dist/src/util-DNl96nNs.js +327 -0
- package/dist/src/{util-DxWpWjhc.js → util-DURocbYR.js} +667 -507
- package/dist/src/util-Df8YMvS1.js +327 -0
- package/dist/src/{util-DvU2Pw8c.js → util-DiQ3QvBB.js} +28 -8
- package/dist/src/{util-oGMLA7vc.js → util-I-Rf-KaD.js} +862 -577
- package/dist/src/{util-olYL5C6N.cjs → util-IYzs5Y04.cjs} +33 -7
- package/dist/src/{util-D9TisOyk.js → util-LKTmNsMQ.js} +71 -47
- package/dist/src/{util-Bxn8emtE.cjs → util-SPsvFONY.cjs} +738 -582
- package/dist/src/{util-D3q0WQ-0.js → util-efByNxcr.js} +72 -48
- package/dist/src/util-kDURhgJW.js +328 -0
- package/dist/src/{utils-DJfvjyMj.js → utils-B0lzitHZ.js} +3 -3
- package/dist/src/{utils-BLJKfv0y.js → utils-BFOh20Gb.js} +3 -3
- package/dist/src/{utils-hXtCYanr.js → utils-BGY69tk_.js} +2 -2
- package/dist/src/{utils-B05gLxER.cjs → utils-Ve6kuJsa.cjs} +3 -3
- package/dist/src/version-BK20a4sw.js +16 -0
- package/dist/src/version-BWCSaByA.cjs +27 -0
- package/dist/src/version-eRkNuGv8.js +17 -0
- package/dist/src/version-lpHV_53E.js +16 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +56 -28
- package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
- package/dist/src/app/assets/index-BXGkeMwh.css +0 -1
- package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
- package/dist/src/app/assets/sync-IjzpWrOE.js +0 -4
- package/dist/src/app/assets/vendor-charts-BNdH8TCw.js +0 -36
- package/dist/src/cache-Cr9oLMUa.js +0 -3
- package/dist/src/cache-DbLsVWB2.cjs +0 -3
- package/dist/src/cloud-Hphvo8kr.js +0 -3
- package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
- package/dist/src/codex-sdk-CWEnH70W.cjs +0 -2
- package/dist/src/evalResult-D8MT9p0s.js +0 -3
- package/dist/src/evalResult-DElBuddX.js +0 -2
- package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
- package/dist/src/evaluator-CVessDWe.js +0 -3
- package/dist/src/fetch-C7bGKDlQ.js +0 -3
- package/dist/src/graders-BOAzQEUe.cjs +0 -2
- package/dist/src/graders-D4BTsZdG2.js +0 -3
- package/dist/src/graders-DOJK1XpV.js +0 -2
- package/dist/src/graders-NAv9LcBn.js +0 -2
- package/dist/src/image-B5Mv-Z3h.js +0 -257
- package/dist/src/image-DVz2RiMF.js +0 -258
- package/dist/src/image-qUpPvmNZ.js +0 -257
- package/dist/src/image-x6KqLQl4.cjs +0 -280
- package/dist/src/providers-Bp4S-FvO.js +0 -2
- package/dist/src/providers-DV3ax9e_.cjs +0 -3
- package/dist/src/providers-u9Enmfok.js +0 -2
- package/dist/src/render-CH-62LbA.js +0 -135
- package/dist/src/render-CMEpfLaO.js +0 -136
- package/dist/src/render-DHIZ6_k8.js +0 -135
- package/dist/src/render-DfQSFxGE.cjs +0 -165
- package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
- package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
- package/dist/src/server-BNYztJkh.js +0 -385
- package/dist/src/server-DCtHUqlp.js +0 -3
- package/dist/src/server-DaA2eR26.cjs +0 -2
- package/dist/src/store-CWOSz6D_.cjs +0 -2
- package/dist/src/store-DCDBhv7B.js +0 -3
- package/dist/src/store-Dn9HUkdW.js +0 -240
- package/dist/src/telemetry-C1IqxcdW.js +0 -3
- package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
- package/dist/src/transform-Bbg6A8Jk.js +0 -216
- package/dist/src/transform-CUnzlsbn.cjs +0 -228
- package/dist/src/transform-DYX1_Xnh.js +0 -216
- package/dist/src/transform-DgKlRr73.cjs +0 -2
- package/dist/src/transform-M6ITAESf.js +0 -3
- package/dist/src/transform-UN5UGu8U.js +0 -213
package/dist/src/index.cjs
CHANGED
|
@@ -2,77 +2,93 @@ Object.defineProperties(exports, {
|
|
|
2
2
|
__esModule: { value: true },
|
|
3
3
|
[Symbol.toStringTag]: { value: "Module" }
|
|
4
4
|
});
|
|
5
|
-
const
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const require_render = require("./render-
|
|
15
|
-
const
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
-
const
|
|
19
|
-
const
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
const
|
|
23
|
-
const
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
const
|
|
29
|
-
const
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
const
|
|
33
|
-
const
|
|
34
|
-
const
|
|
5
|
+
const require_rolldown_runtime = require("./rolldown-runtime-D_mwlA32.cjs");
|
|
6
|
+
const require_logger = require("./logger-cfNpzI4o.cjs");
|
|
7
|
+
const require_invariant = require("./invariant-QtnLD03y.cjs");
|
|
8
|
+
const require_types = require("./types-CxJvaY2S.cjs");
|
|
9
|
+
const require_fetch = require("./fetch-Dw4XZHjj.cjs");
|
|
10
|
+
const require_version = require("./version-BWCSaByA.cjs");
|
|
11
|
+
const require_fileExtensions = require("./fileExtensions-BhdwzYaD.cjs");
|
|
12
|
+
const require_accounts = require("./accounts-CmWzeD2d.cjs");
|
|
13
|
+
const require_esm = require("./esm-BIKakvNa.cjs");
|
|
14
|
+
const require_render = require("./render-BNTrbmBw.cjs");
|
|
15
|
+
const require_providerRegistry = require("./providerRegistry-CZO_w7ue.cjs");
|
|
16
|
+
const require_remoteGeneration = require("./remoteGeneration-DS9N3pgB.cjs");
|
|
17
|
+
const require_server = require("./server-B8rqV126.cjs");
|
|
18
|
+
const require_storage = require("./storage-CA-v9V2v.cjs");
|
|
19
|
+
const require_pythonUtils = require("./pythonUtils-Cokhluq3.cjs");
|
|
20
|
+
const require_util = require("./util-SPsvFONY.cjs");
|
|
21
|
+
const require_createHash = require("./createHash-CSiqnK5P.cjs");
|
|
22
|
+
const require_cache = require("./cache-CPGUA4Yl.cjs");
|
|
23
|
+
const require_chat = require("./chat-DxTDQ83C.cjs");
|
|
24
|
+
const require_transform = require("./transform-DhNkAUs8.cjs");
|
|
25
|
+
const require_util$1 = require("./util-BSIuSLVK.cjs");
|
|
26
|
+
const require_providers = require("./providers-BDVVIQM6.cjs");
|
|
27
|
+
const require_tokenUsageUtils = require("./tokenUsageUtils-_B-P8IAi.cjs");
|
|
28
|
+
const require_text = require("./text-nywWsRBM.cjs");
|
|
29
|
+
const require_telemetry = require("./telemetry-CJ7FnCsc.cjs");
|
|
30
|
+
const require_tables = require("./tables-DBIJU0WE.cjs");
|
|
31
|
+
const require_blobs = require("./blobs-B0977K1O.cjs");
|
|
32
|
+
const require_processShim = require("./processShim-DSY9BV2T.cjs");
|
|
33
|
+
const require_packageParser = require("./packageParser-QoCS1FMl.cjs");
|
|
34
|
+
const require_rubyUtils = require("./rubyUtils-CqUWBZAt.cjs");
|
|
35
|
+
const require_inputVariables = require("./inputVariables-Dq9W-Z3a.cjs");
|
|
36
|
+
const require_util$2 = require("./util-CN8om2rz.cjs");
|
|
37
|
+
const require_promptfoo = require("./promptfoo-b-baRMj-.cjs");
|
|
38
|
+
const require_graders = require("./graders-ClrU2fnd.cjs");
|
|
39
|
+
const require_shared = require("./shared-WkgnDkcg.cjs");
|
|
40
|
+
const require_utils = require("./utils-Ve6kuJsa.cjs");
|
|
41
|
+
const require_transform$1 = require("./transform-BnSXWmU_2.cjs");
|
|
42
|
+
const require_store = require("./store-B2NDDooM.cjs");
|
|
43
|
+
const require_extractor = require("./extractor-BdxEtt3J.cjs");
|
|
44
|
+
const require_promptLength = require("./promptLength-BbBbDHNj.cjs");
|
|
45
|
+
const require_indirectWebPwn = require("./indirectWebPwn-BJ22AbQa.cjs");
|
|
46
|
+
const require_toolAttributes = require("./toolAttributes-BAjwcBf0.cjs");
|
|
47
|
+
const require_evaluatorHelpers = require("./evaluatorHelpers-CRqTvSux.cjs");
|
|
48
|
+
const require_evalResult = require("./evalResult-CvtS8h8u.cjs");
|
|
49
|
+
const require_strategies = require("./strategies-7CS3Alao.cjs");
|
|
35
50
|
let fs = require("fs");
|
|
36
|
-
fs =
|
|
51
|
+
fs = require_rolldown_runtime.__toESM(fs, 1);
|
|
37
52
|
let path = require("path");
|
|
38
|
-
path =
|
|
53
|
+
path = require_rolldown_runtime.__toESM(path, 1);
|
|
39
54
|
let async = require("async");
|
|
40
|
-
async =
|
|
55
|
+
async = require_rolldown_runtime.__toESM(async, 1);
|
|
41
56
|
let js_yaml = require("js-yaml");
|
|
42
|
-
js_yaml =
|
|
57
|
+
js_yaml = require_rolldown_runtime.__toESM(js_yaml, 1);
|
|
43
58
|
let node_async_hooks = require("node:async_hooks");
|
|
44
|
-
require("node:path");
|
|
45
|
-
require("node:url");
|
|
46
59
|
let chalk = require("chalk");
|
|
47
|
-
chalk =
|
|
60
|
+
chalk = require_rolldown_runtime.__toESM(chalk, 1);
|
|
48
61
|
let os = require("os");
|
|
49
|
-
os =
|
|
50
|
-
let util = require("util");
|
|
51
|
-
util = require_logger.__toESM(util);
|
|
62
|
+
os = require_rolldown_runtime.__toESM(os, 1);
|
|
52
63
|
let dedent = require("dedent");
|
|
53
|
-
dedent =
|
|
54
|
-
let fs_promises = require("fs/promises");
|
|
55
|
-
fs_promises = require_logger.__toESM(fs_promises);
|
|
56
|
-
let glob = require("glob");
|
|
64
|
+
dedent = require_rolldown_runtime.__toESM(dedent, 1);
|
|
57
65
|
let zod = require("zod");
|
|
58
|
-
let
|
|
59
|
-
|
|
66
|
+
let fs_promises = require("fs/promises");
|
|
67
|
+
fs_promises = require_rolldown_runtime.__toESM(fs_promises, 1);
|
|
68
|
+
let util = require("util");
|
|
69
|
+
util = require_rolldown_runtime.__toESM(util, 1);
|
|
70
|
+
let _inquirer_input = require("@inquirer/input");
|
|
71
|
+
_inquirer_input = require_rolldown_runtime.__toESM(_inquirer_input, 1);
|
|
72
|
+
require("node:path");
|
|
73
|
+
require("node:url");
|
|
60
74
|
let crypto$1 = require("crypto");
|
|
61
|
-
crypto$1 =
|
|
75
|
+
crypto$1 = require_rolldown_runtime.__toESM(crypto$1, 1);
|
|
62
76
|
let _opentelemetry_api = require("@opentelemetry/api");
|
|
63
|
-
let _inquirer_input = require("@inquirer/input");
|
|
64
|
-
_inquirer_input = require_logger.__toESM(_inquirer_input);
|
|
65
77
|
let readline = require("readline");
|
|
66
|
-
readline =
|
|
78
|
+
readline = require_rolldown_runtime.__toESM(readline, 1);
|
|
79
|
+
let csv_parse_sync = require("csv-parse/sync");
|
|
80
|
+
let glob = require("glob");
|
|
81
|
+
let fast_xml_parser = require("fast-xml-parser");
|
|
67
82
|
let drizzle_orm = require("drizzle-orm");
|
|
68
|
-
let cli_progress = require("cli-progress");
|
|
69
|
-
cli_progress = require_logger.__toESM(cli_progress);
|
|
70
83
|
let url = require("url");
|
|
71
|
-
let
|
|
84
|
+
let parse5 = require("parse5");
|
|
72
85
|
let fastest_levenshtein = require("fastest-levenshtein");
|
|
86
|
+
let cli_progress = require("cli-progress");
|
|
87
|
+
cli_progress = require_rolldown_runtime.__toESM(cli_progress, 1);
|
|
73
88
|
let js_rouge = require("js-rouge");
|
|
74
|
-
js_rouge =
|
|
89
|
+
js_rouge = require_rolldown_runtime.__toESM(js_rouge, 1);
|
|
75
90
|
let node_util = require("node:util");
|
|
91
|
+
let lru_cache = require("lru-cache");
|
|
76
92
|
require("debounce");
|
|
77
93
|
let _opentelemetry_core = require("@opentelemetry/core");
|
|
78
94
|
let _opentelemetry_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
|
|
@@ -81,15 +97,15 @@ let _opentelemetry_sdk_trace_node = require("@opentelemetry/sdk-trace-node");
|
|
|
81
97
|
let _opentelemetry_semantic_conventions = require("@opentelemetry/semantic-conventions");
|
|
82
98
|
let drizzle_orm_better_sqlite3_migrator = require("drizzle-orm/better-sqlite3/migrator");
|
|
83
99
|
let process$1 = require("process");
|
|
84
|
-
process$1 =
|
|
100
|
+
process$1 = require_rolldown_runtime.__toESM(process$1, 1);
|
|
85
101
|
let _apidevtools_json_schema_ref_parser = require("@apidevtools/json-schema-ref-parser");
|
|
86
|
-
_apidevtools_json_schema_ref_parser =
|
|
102
|
+
_apidevtools_json_schema_ref_parser = require_rolldown_runtime.__toESM(_apidevtools_json_schema_ref_parser, 1);
|
|
87
103
|
let cli_table3 = require("cli-table3");
|
|
88
|
-
cli_table3 =
|
|
104
|
+
cli_table3 = require_rolldown_runtime.__toESM(cli_table3, 1);
|
|
89
105
|
let chokidar = require("chokidar");
|
|
90
|
-
chokidar =
|
|
106
|
+
chokidar = require_rolldown_runtime.__toESM(chokidar, 1);
|
|
91
107
|
let ora = require("ora");
|
|
92
|
-
ora =
|
|
108
|
+
ora = require_rolldown_runtime.__toESM(ora, 1);
|
|
93
109
|
require("@inquirer/confirm");
|
|
94
110
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
95
111
|
var ConversationRelevancyTemplate = class {
|
|
@@ -262,6 +278,505 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
262
278
|
};
|
|
263
279
|
};
|
|
264
280
|
//#endregion
|
|
281
|
+
//#region src/matchers/classification.ts
|
|
282
|
+
/**
|
|
283
|
+
*
|
|
284
|
+
* @param expected Expected classification. If undefined, matches any classification.
|
|
285
|
+
* @param output Text to classify.
|
|
286
|
+
* @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
|
|
287
|
+
* @param grading
|
|
288
|
+
* @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
|
|
289
|
+
*/
|
|
290
|
+
async function matchesClassification(expected, output, threshold, grading) {
|
|
291
|
+
const resp = await (await require_graders.getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
|
|
292
|
+
if (!resp.classification) return require_graders.fail(resp.error || "Unknown error fetching classification");
|
|
293
|
+
let score;
|
|
294
|
+
if (expected === void 0) {
|
|
295
|
+
const scores = Object.values(resp.classification);
|
|
296
|
+
if (scores.length === 0) return {
|
|
297
|
+
pass: false,
|
|
298
|
+
score: 0,
|
|
299
|
+
reason: "No classification scores returned"
|
|
300
|
+
};
|
|
301
|
+
score = Math.max(...scores);
|
|
302
|
+
} else score = resp.classification[expected] || 0;
|
|
303
|
+
if (score >= threshold - Number.EPSILON) {
|
|
304
|
+
const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
|
|
305
|
+
return {
|
|
306
|
+
pass: true,
|
|
307
|
+
score,
|
|
308
|
+
reason
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
return {
|
|
312
|
+
pass: false,
|
|
313
|
+
score,
|
|
314
|
+
reason: expected === void 0 ? `Maximum classification score ${score.toFixed(2)} < ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
|
|
315
|
+
};
|
|
316
|
+
}
|
|
317
|
+
//#endregion
|
|
318
|
+
//#region src/matchers/comparison.ts
|
|
319
|
+
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
|
|
320
|
+
require_invariant.invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");
|
|
321
|
+
const resp = await require_graders.callProviderWithContext(await require_graders.getAndCheckProvider("text", grading?.provider, (await require_graders.getDefaultProviders()).gradingProvider, "select-best check"), await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.SELECT_BEST_PROMPT), {
|
|
322
|
+
criteria,
|
|
323
|
+
outputs: outputs.map((o) => require_graders.tryParse(o)),
|
|
324
|
+
...vars || {}
|
|
325
|
+
}), "select-best", {
|
|
326
|
+
criteria,
|
|
327
|
+
outputs: outputs.map((o) => require_graders.tryParse(o)),
|
|
328
|
+
...vars || {}
|
|
329
|
+
}, providerCallContext);
|
|
330
|
+
if (resp.error || !resp.output) return Array.from({ length: outputs.length }, () => require_graders.fail(resp.error || "No output", resp.tokenUsage));
|
|
331
|
+
require_invariant.invariant(typeof resp.output === "string", "select-best produced malformed response");
|
|
332
|
+
const firstIntegerMatch = resp.output.trim().match(/\d+/);
|
|
333
|
+
const verdict = firstIntegerMatch ? Number.parseInt(firstIntegerMatch[0], 10) : NaN;
|
|
334
|
+
if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) return Array.from({ length: outputs.length }, () => require_graders.fail(`Invalid select-best verdict: ${verdict}`, resp.tokenUsage));
|
|
335
|
+
const tokensUsed = require_graders.normalizeMatcherTokenUsage(resp.tokenUsage);
|
|
336
|
+
return outputs.map((_output, index) => {
|
|
337
|
+
if (index === verdict) return {
|
|
338
|
+
pass: true,
|
|
339
|
+
score: 1,
|
|
340
|
+
reason: `Output selected as the best: ${criteria}`,
|
|
341
|
+
tokensUsed
|
|
342
|
+
};
|
|
343
|
+
else return {
|
|
344
|
+
pass: false,
|
|
345
|
+
score: 0,
|
|
346
|
+
reason: `Output not selected: ${criteria}`,
|
|
347
|
+
tokensUsed
|
|
348
|
+
};
|
|
349
|
+
});
|
|
350
|
+
}
|
|
351
|
+
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
|
|
352
|
+
require_invariant.invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
|
|
353
|
+
const value = assertion.value || {};
|
|
354
|
+
const options = {
|
|
355
|
+
method: typeof value === "object" && "method" in value ? value.method : "average",
|
|
356
|
+
weights: typeof value === "object" && "weights" in value ? value.weights : {},
|
|
357
|
+
threshold: typeof value === "object" && "threshold" in value ? value.threshold : void 0
|
|
358
|
+
};
|
|
359
|
+
const scores = resultsWithGradingResults.map((result, index) => {
|
|
360
|
+
const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
|
|
361
|
+
if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
|
|
362
|
+
let totalWeightedScore = 0;
|
|
363
|
+
let totalWeight = 0;
|
|
364
|
+
relevantResults.forEach((componentResult) => {
|
|
365
|
+
const assertionType = componentResult.assertion?.type || "unknown";
|
|
366
|
+
const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
|
|
367
|
+
const score = componentResult.score || 0;
|
|
368
|
+
totalWeightedScore += score * weight;
|
|
369
|
+
totalWeight += weight;
|
|
370
|
+
});
|
|
371
|
+
let aggregateScore;
|
|
372
|
+
if (options.method === "sum") aggregateScore = totalWeightedScore;
|
|
373
|
+
else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
|
|
374
|
+
return {
|
|
375
|
+
index,
|
|
376
|
+
score: aggregateScore,
|
|
377
|
+
componentCount: relevantResults.length,
|
|
378
|
+
totalWeight
|
|
379
|
+
};
|
|
380
|
+
});
|
|
381
|
+
let maxScore = -Infinity;
|
|
382
|
+
let winnerIndex = 0;
|
|
383
|
+
for (let i = 0; i < scores.length; i++) if (scores[i].score > maxScore) {
|
|
384
|
+
maxScore = scores[i].score;
|
|
385
|
+
winnerIndex = i;
|
|
386
|
+
}
|
|
387
|
+
const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
|
|
388
|
+
return scores.map(({ index, score, componentCount, totalWeight }) => {
|
|
389
|
+
const isWinner = index === winnerIndex && meetsThreshold;
|
|
390
|
+
return {
|
|
391
|
+
pass: isWinner,
|
|
392
|
+
score: isWinner ? 1 : 0,
|
|
393
|
+
reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
|
|
394
|
+
namedScores: {
|
|
395
|
+
maxScore: score,
|
|
396
|
+
assertionCount: componentCount,
|
|
397
|
+
totalWeight
|
|
398
|
+
}
|
|
399
|
+
};
|
|
400
|
+
});
|
|
401
|
+
}
|
|
402
|
+
//#endregion
|
|
403
|
+
//#region src/matchers/moderation.ts
|
|
404
|
+
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
|
|
405
|
+
if (!assistantResponse) return {
|
|
406
|
+
pass: true,
|
|
407
|
+
score: 1,
|
|
408
|
+
reason: "No output to moderate"
|
|
409
|
+
};
|
|
410
|
+
const defaultProviders = await require_graders.getDefaultProviders();
|
|
411
|
+
const defaultModerationProvider = !require_logger.getEnvString("OPENAI_API_KEY") && (require_logger.getEnvString("REPLICATE_API_KEY") || require_logger.getEnvString("REPLICATE_API_TOKEN")) ? await require_providers.loadApiProvider(require_types.LLAMA_GUARD_REPLICATE_PROVIDER) : defaultProviders.moderationProvider;
|
|
412
|
+
const moderationProvider = await require_graders.getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
|
|
413
|
+
require_invariant.invariant(moderationProvider, "Moderation provider must be defined");
|
|
414
|
+
const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
|
|
415
|
+
if (resp.error) return {
|
|
416
|
+
pass: false,
|
|
417
|
+
score: 0,
|
|
418
|
+
reason: `Moderation API error: ${resp.error}`
|
|
419
|
+
};
|
|
420
|
+
const { flags } = resp;
|
|
421
|
+
if (!flags || flags.length === 0) return {
|
|
422
|
+
pass: true,
|
|
423
|
+
score: 1,
|
|
424
|
+
reason: "No moderation flags detected"
|
|
425
|
+
};
|
|
426
|
+
const filteredFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
|
|
427
|
+
if (filteredFlags.length > 0) return {
|
|
428
|
+
pass: false,
|
|
429
|
+
score: 0,
|
|
430
|
+
reason: `Moderation flags detected: ${filteredFlags.map((flag) => flag.description).join(", ")}`
|
|
431
|
+
};
|
|
432
|
+
return {
|
|
433
|
+
pass: true,
|
|
434
|
+
score: 1,
|
|
435
|
+
reason: "No relevant moderation flags detected"
|
|
436
|
+
};
|
|
437
|
+
}
|
|
438
|
+
//#endregion
|
|
439
|
+
//#region src/assertions/contextUtils.ts
|
|
440
|
+
/**
|
|
441
|
+
* Resolves the context value for context-based assertions.
|
|
442
|
+
* Supports extracting context from test variables or transforming from output.
|
|
443
|
+
* Can return either a single context string or an array of context chunks.
|
|
444
|
+
*
|
|
445
|
+
* @param assertion - The assertion configuration
|
|
446
|
+
* @param test - The test case
|
|
447
|
+
* @param output - The provider output (after provider transform, before test transform)
|
|
448
|
+
* @param prompt - The prompt text
|
|
449
|
+
* @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
|
|
450
|
+
* @param providerResponse - Optional full provider response for contextTransform
|
|
451
|
+
* @returns The resolved context string or array of strings
|
|
452
|
+
* @throws Error if context cannot be resolved or transform fails
|
|
453
|
+
*/
|
|
454
|
+
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
|
|
455
|
+
let contextValue;
|
|
456
|
+
if (test.vars?.context) {
|
|
457
|
+
if (typeof test.vars.context === "string") contextValue = test.vars.context;
|
|
458
|
+
else if (Array.isArray(test.vars.context)) {
|
|
459
|
+
const invalidEntry = [...test.vars.context.entries()].find(([, v]) => typeof v !== "string");
|
|
460
|
+
if (invalidEntry) {
|
|
461
|
+
const [idx, val] = invalidEntry;
|
|
462
|
+
require_invariant.invariant(false, `Invalid context: expected an array of strings, but found ${typeof val} at index ${idx}`);
|
|
463
|
+
}
|
|
464
|
+
contextValue = test.vars.context;
|
|
465
|
+
}
|
|
466
|
+
} else if (fallbackContext) contextValue = fallbackContext;
|
|
467
|
+
if (assertion.contextTransform) {
|
|
468
|
+
const getLabel = () => require_transform$1.getTransformLabel(assertion.contextTransform);
|
|
469
|
+
try {
|
|
470
|
+
const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
|
|
471
|
+
const transformed = await require_transform$1.transform(assertion.contextTransform, outputForTransform, {
|
|
472
|
+
vars: test.vars,
|
|
473
|
+
prompt: { label: prompt },
|
|
474
|
+
...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
|
|
475
|
+
});
|
|
476
|
+
require_invariant.invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), () => `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${getLabel()}`);
|
|
477
|
+
contextValue = transformed;
|
|
478
|
+
} catch (error) {
|
|
479
|
+
throw new Error(`Failed to transform context using expression '${getLabel()}': ${require_transform$1.getTransformErrorMessage(error)}`);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
require_invariant.invariant(typeof contextValue === "string" && contextValue.length > 0 || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0), "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
|
|
483
|
+
return contextValue;
|
|
484
|
+
}
|
|
485
|
+
/**
|
|
486
|
+
* Serializes context (string or string[]) to a single string for prompts.
|
|
487
|
+
* Joins chunks with double newlines to preserve separation.
|
|
488
|
+
*/
|
|
489
|
+
function serializeContext(context) {
|
|
490
|
+
return Array.isArray(context) ? context.join("\n\n") : context;
|
|
491
|
+
}
|
|
492
|
+
//#endregion
|
|
493
|
+
//#region src/matchers/rag.ts
|
|
494
|
+
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
|
|
495
|
+
const defaults = await require_graders.getDefaultProviders();
|
|
496
|
+
const embeddingProvider = await require_graders.getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "answer relevancy check");
|
|
497
|
+
const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "answer relevancy check");
|
|
498
|
+
const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);
|
|
499
|
+
const rubricPrompt = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.ANSWER_RELEVANCY_GENERATE);
|
|
500
|
+
const parsedOutput = require_graders.tryParse(output);
|
|
501
|
+
const promptText = await require_graders.renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });
|
|
502
|
+
const candidateQuestions = [];
|
|
503
|
+
for (let i = 0; i < 3; i++) {
|
|
504
|
+
const resp = await require_graders.callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
|
|
505
|
+
require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, resp.tokenUsage);
|
|
506
|
+
if (resp.error || !resp.output) return require_graders.fail(resp.error || "No output", tokensUsed);
|
|
507
|
+
require_invariant.invariant(typeof resp.output === "string", "answer relevancy check produced malformed response");
|
|
508
|
+
candidateQuestions.push(resp.output);
|
|
509
|
+
}
|
|
510
|
+
require_invariant.invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
|
|
511
|
+
const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
|
|
512
|
+
require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
|
|
513
|
+
if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) return require_graders.fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
|
|
514
|
+
const inputEmbedding = inputEmbeddingResp.embedding;
|
|
515
|
+
const similarities = [];
|
|
516
|
+
const questionsWithScores = [];
|
|
517
|
+
for (const question of candidateQuestions) {
|
|
518
|
+
const resp = await embeddingProvider.callEmbeddingApi(question);
|
|
519
|
+
require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, resp.tokenUsage);
|
|
520
|
+
if (resp.error || !resp.embedding) return require_graders.fail(resp.error || "No embedding", tokensUsed);
|
|
521
|
+
const questionSimilarity = require_graders.cosineSimilarity(inputEmbedding, resp.embedding);
|
|
522
|
+
similarities.push(questionSimilarity);
|
|
523
|
+
questionsWithScores.push({
|
|
524
|
+
question,
|
|
525
|
+
similarity: questionSimilarity
|
|
526
|
+
});
|
|
527
|
+
}
|
|
528
|
+
const similarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
|
|
529
|
+
const pass = similarity >= threshold - Number.EPSILON;
|
|
530
|
+
const greaterThanReason = `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`;
|
|
531
|
+
const lessThanReason = `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
|
|
532
|
+
const metadata = {
|
|
533
|
+
generatedQuestions: questionsWithScores,
|
|
534
|
+
averageSimilarity: similarity,
|
|
535
|
+
threshold
|
|
536
|
+
};
|
|
537
|
+
if (pass) return {
|
|
538
|
+
pass: true,
|
|
539
|
+
score: similarity,
|
|
540
|
+
reason: greaterThanReason,
|
|
541
|
+
tokensUsed,
|
|
542
|
+
metadata
|
|
543
|
+
};
|
|
544
|
+
return {
|
|
545
|
+
pass: false,
|
|
546
|
+
score: similarity,
|
|
547
|
+
reason: lessThanReason,
|
|
548
|
+
tokensUsed,
|
|
549
|
+
metadata
|
|
550
|
+
};
|
|
551
|
+
}
|
|
552
|
+
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
|
|
553
|
+
const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, (await require_graders.getDefaultProviders()).gradingProvider, "context recall check");
|
|
554
|
+
const contextString = serializeContext(context);
|
|
555
|
+
const resp = await require_graders.callProviderWithContext(textProvider, await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RECALL), {
|
|
556
|
+
context: contextString,
|
|
557
|
+
groundTruth,
|
|
558
|
+
...vars || {}
|
|
559
|
+
}), "context-recall", {
|
|
560
|
+
context: contextString,
|
|
561
|
+
groundTruth,
|
|
562
|
+
...vars || {}
|
|
563
|
+
}, providerCallContext);
|
|
564
|
+
if (resp.error || !resp.output) return require_graders.fail(resp.error || "No output", resp.tokenUsage);
|
|
565
|
+
require_invariant.invariant(typeof resp.output === "string", "context-recall produced malformed response");
|
|
566
|
+
const attributedTokenLower = require_graders.CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
|
|
567
|
+
const notAttributedTokenLower = require_graders.CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
|
|
568
|
+
const sentences = require_graders.splitIntoSentences(resp.output).filter((line) => {
|
|
569
|
+
const lowerLine = line.toLowerCase();
|
|
570
|
+
return lowerLine.includes(attributedTokenLower) || lowerLine.includes(notAttributedTokenLower);
|
|
571
|
+
});
|
|
572
|
+
const sentenceAttributions = [];
|
|
573
|
+
let numerator = 0;
|
|
574
|
+
for (const sentence of sentences) {
|
|
575
|
+
const lowerSentence = sentence.toLowerCase();
|
|
576
|
+
const isAttributed = !lowerSentence.includes(notAttributedTokenLower) && lowerSentence.includes(attributedTokenLower);
|
|
577
|
+
if (isAttributed) numerator++;
|
|
578
|
+
const sentenceMatch = sentence.match(/^\d+\.\s*([^\.]+\.)/);
|
|
579
|
+
const cleanSentence = sentenceMatch ? sentenceMatch[1].trim() : sentence.split(".")[0].trim();
|
|
580
|
+
sentenceAttributions.push({
|
|
581
|
+
sentence: cleanSentence,
|
|
582
|
+
attributed: isAttributed
|
|
583
|
+
});
|
|
584
|
+
}
|
|
585
|
+
const score = sentences.length > 0 ? numerator / sentences.length : 0;
|
|
586
|
+
const pass = score >= threshold - Number.EPSILON;
|
|
587
|
+
const metadata = {
|
|
588
|
+
sentenceAttributions,
|
|
589
|
+
totalSentences: sentences.length,
|
|
590
|
+
attributedSentences: numerator,
|
|
591
|
+
score
|
|
592
|
+
};
|
|
593
|
+
return {
|
|
594
|
+
pass,
|
|
595
|
+
score,
|
|
596
|
+
reason: pass ? `Recall ${score.toFixed(2)} is >= ${threshold}` : `Recall ${score.toFixed(2)} is < ${threshold}`,
|
|
597
|
+
tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
|
|
598
|
+
metadata
|
|
599
|
+
};
|
|
600
|
+
}
|
|
601
|
+
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
|
|
602
|
+
const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, (await require_graders.getDefaultProviders()).gradingProvider, "context relevance check");
|
|
603
|
+
const contextString = serializeContext(context);
|
|
604
|
+
const resp = await require_graders.callProviderWithContext(textProvider, await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RELEVANCE), {
|
|
605
|
+
context: contextString,
|
|
606
|
+
query: question
|
|
607
|
+
}), "context-relevance", {
|
|
608
|
+
context: contextString,
|
|
609
|
+
query: question
|
|
610
|
+
}, providerCallContext);
|
|
611
|
+
if (resp.error || !resp.output) return require_graders.fail(resp.error || "No output", resp.tokenUsage);
|
|
612
|
+
require_invariant.invariant(typeof resp.output === "string", "context-relevance produced malformed response");
|
|
613
|
+
const contextUnits = Array.isArray(context) ? context.filter((chunk) => chunk.trim().length > 0) : require_graders.splitIntoSentences(context);
|
|
614
|
+
const totalContextUnits = contextUnits.length;
|
|
615
|
+
const extractedSentences = require_graders.splitIntoSentences(resp.output);
|
|
616
|
+
const relevantSentences = [];
|
|
617
|
+
const insufficientInformation = resp.output.includes(require_graders.CONTEXT_RELEVANCE_BAD);
|
|
618
|
+
let numerator = 0;
|
|
619
|
+
if (insufficientInformation) numerator = 0;
|
|
620
|
+
else {
|
|
621
|
+
const uniqueRelevantSentences = [...new Set(extractedSentences)];
|
|
622
|
+
numerator = Math.min(uniqueRelevantSentences.length, totalContextUnits);
|
|
623
|
+
relevantSentences.push(...uniqueRelevantSentences);
|
|
624
|
+
}
|
|
625
|
+
const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
|
|
626
|
+
const pass = score >= threshold - Number.EPSILON;
|
|
627
|
+
const metadata = {
|
|
628
|
+
extractedSentences: relevantSentences,
|
|
629
|
+
totalContextUnits,
|
|
630
|
+
totalContextSentences: totalContextUnits,
|
|
631
|
+
contextUnits,
|
|
632
|
+
relevantSentenceCount: numerator,
|
|
633
|
+
insufficientInformation,
|
|
634
|
+
score
|
|
635
|
+
};
|
|
636
|
+
return {
|
|
637
|
+
pass,
|
|
638
|
+
score,
|
|
639
|
+
reason: pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`,
|
|
640
|
+
tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
|
|
641
|
+
metadata
|
|
642
|
+
};
|
|
643
|
+
}
|
|
644
|
+
/**
 * Grade how faithful `output` is to `context` using two grading-provider calls.
 *
 * The first call ("context-faithfulness-longform") receives the question and
 * answer; its output is split into sentences that are treated as statements.
 * The second call ("context-faithfulness-nli") receives the serialized context
 * and those statements, and its verdict text is parsed into a score of
 * `1 - unsupported / statements.length`, clamped to [0, 1].
 *
 * @param query Question passed to the longform prompt as `question`.
 * @param output Answer under test (passed through `tryParse`).
 * @param context Context handed to `serializeContext`.
 * @param threshold Minimum score required to pass.
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra template variables; they override the built-in ones.
 * @param providerCallContext Forwarded unchanged to the provider calls.
 * @returns `{ pass, score, reason, tokensUsed }`, or the result of `fail(...)`
 *   when a provider call reports an error or produces no output.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
	const { gradingProvider } = await require_graders.getDefaultProviders();
	const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, gradingProvider, "faithfulness check");
	const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);

	const rubricPrompt = grading?.rubricPrompt;
	if (rubricPrompt) require_invariant.invariant(Array.isArray(rubricPrompt), "rubricPrompt must be an array");
	// A rubric entry is either a plain string or an object carrying `content`.
	const rawPromptAt = (index) => {
		const entry = rubricPrompt?.[index];
		return typeof entry === "string" ? entry : entry?.content;
	};
	const longformPrompt = await require_graders.loadRubricPrompt(rawPromptAt(0), require_graders.CONTEXT_FAITHFULNESS_LONGFORM);
	const nliPrompt = await require_graders.loadRubricPrompt(rawPromptAt(1), require_graders.CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

	// Step 1: ask the provider for the longform breakdown of the answer.
	// A fresh variables object is built for every consumer, as before.
	const buildLongformVars = () => ({
		question: query,
		answer: require_graders.tryParse(output),
		...vars || {}
	});
	const longformText = await require_graders.renderLlmRubricPrompt(longformPrompt, buildLongformVars());
	const longformResp = await require_graders.callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
	if (longformResp.error || !longformResp.output) return require_graders.fail(longformResp.error || "No output", tokensUsed);
	require_invariant.invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

	// Step 2: ask for a verdict on every statement against the context.
	const contextString = serializeContext(context);
	const statements = require_graders.splitIntoSentences(longformResp.output);
	const buildNliVars = () => ({
		context: contextString,
		statements,
		...vars || {}
	});
	const nliText = await require_graders.renderLlmRubricPrompt(nliPrompt, buildNliVars());
	const nliResp = await require_graders.callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
	if (nliResp.error || !nliResp.output) return require_graders.fail(nliResp.error || "No output", tokensUsed);
	require_invariant.invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

	// Step 3: turn the verdict text into a score.
	const marker = "Final verdict for each statement in order:".toLowerCase();
	const verdictText = nliResp.output.toLowerCase().trim();
	let score = 0;
	if (statements.length > 0) {
		const markerAt = verdictText.indexOf(marker);
		if (markerAt !== -1) {
			// Summary form: period-separated verdicts after the marker line.
			const parsedVerdicts = verdictText.slice(markerAt + marker.length).split(".").filter((answer) => answer.trim() !== "");
			if (parsedVerdicts.length > 0) {
				const unsupported = parsedVerdicts.filter((answer) => !answer.includes("yes")).length;
				score = 1 - unsupported / statements.length;
			}
		} else {
			// Fallback form: count inline "verdict: yes" / "verdict: no" markers.
			const countOccurrences = (needle) => verdictText.split(needle).length - 1;
			const noVerdictCount = countOccurrences("verdict: no");
			const yesVerdictCount = countOccurrences("verdict: yes");
			if (noVerdictCount + yesVerdictCount > 0) score = 1 - noVerdictCount / statements.length;
		}
	}
	score = Math.min(1, Math.max(0, score));

	const pass = score >= threshold - Number.EPSILON;
	const comparator = pass ? ">=" : "<";
	return {
		pass,
		score,
		reason: `Faithfulness ${score.toFixed(2)} is ${comparator} ${threshold}`,
		tokensUsed
	};
}
|
|
701
|
+
//#endregion
|
|
702
|
+
//#region src/matchers/similarity.ts
|
|
703
|
+
/**
 * Compare two embeddings with the requested metric.
 *
 * @param expectedEmbedding Embedding of the expected text.
 * @param outputEmbedding Embedding of the produced text.
 * @param metric One of "cosine", "dot_product" or "euclidean".
 * @param tokensUsed Token usage attached to the failure result for unknown metrics.
 * @returns The value computed by the matching helper, or the result of
 *   `fail(...)` when the metric is not supported.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
	if (metric === "cosine") return require_graders.cosineSimilarity(expectedEmbedding, outputEmbedding);
	if (metric === "dot_product") return require_graders.dotProduct(expectedEmbedding, outputEmbedding);
	if (metric === "euclidean") return require_graders.euclideanDistance(expectedEmbedding, outputEmbedding);
	return require_graders.fail(`Unsupported metric: ${metric}`, tokensUsed);
}
|
|
711
|
+
/**
 * Convert a raw similarity (or distance) value into a grading result.
 *
 * For "euclidean" the value is a distance: lower is better, and the reported
 * score is `1 / (1 + distance)`. For every other metric the value is used as
 * the score directly. `inverse` flips both the pass condition and the score.
 * Comparisons allow a `Number.EPSILON` tolerance around the threshold.
 *
 * @param similarity Numeric similarity or distance.
 * @param threshold Threshold to compare against.
 * @param inverse Whether the assertion is negated.
 * @param metric Metric name; only "euclidean" is treated specially.
 * @param tokensUsed Token usage to attach to the result.
 * @returns `{ pass, score, reason, tokensUsed }`.
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
	const shown = similarity.toFixed(2);
	let pass;
	let score;
	let reason;

	if (metric === "euclidean") {
		const normalizedScore = 1 / (1 + similarity);
		const withinReason = `Distance ${shown} is less than or equal to threshold ${threshold}`;
		const beyondReason = `Distance ${shown} is greater than threshold ${threshold}`;
		if (inverse) {
			pass = similarity >= threshold - Number.EPSILON;
			score = 1 - normalizedScore;
			reason = pass ? beyondReason : withinReason;
		} else {
			pass = similarity <= threshold + Number.EPSILON;
			score = normalizedScore;
			reason = pass ? withinReason : beyondReason;
		}
		return { pass, score, reason, tokensUsed };
	}

	const atLeastReason = `Similarity ${shown} is greater than or equal to threshold ${threshold}`;
	const belowReason = `Similarity ${shown} is less than threshold ${threshold}`;
	if (inverse) {
		pass = similarity <= threshold + Number.EPSILON;
		score = 1 - similarity;
		reason = pass ? belowReason : atLeastReason;
	} else {
		pass = similarity >= threshold - Number.EPSILON;
		score = similarity;
		reason = pass ? atLeastReason : belowReason;
	}
	return { pass, score, reason, tokensUsed };
}
|
|
737
|
+
/**
 * Obtain a similarity value for `expected` vs `output` from a provider.
 *
 * When the metric is "cosine" and the provider exposes `callSimilarityApi`,
 * that API is used directly. Otherwise both texts are embedded in parallel
 * via `callEmbeddingApi` and the embeddings are compared with
 * `calculateSimilarityScore`.
 *
 * @param finalProvider Provider object to query.
 * @param expected Expected text.
 * @param output Produced text.
 * @param metric Similarity metric name.
 * @param tokensUsed Accumulator updated with the token usage of every call.
 * @returns A number on success, or a `fail(...)` result describing the problem.
 * @throws Error when the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
	const hasSimilarityApi = "callSimilarityApi" in finalProvider;

	if (metric === "cosine" && hasSimilarityApi) {
		const { error, similarity, tokenUsage } = await finalProvider.callSimilarityApi(expected, output);
		require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, tokenUsage);
		if (error) return require_graders.fail(error, tokensUsed);
		if (similarity == null) return require_graders.fail("Unknown error fetching similarity", tokensUsed);
		if (!Number.isFinite(similarity)) return require_graders.fail(`Invalid similarity score: ${similarity}`, tokensUsed);
		return similarity;
	}

	const embed = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
	if (typeof embed !== "function") {
		// A similarity-only provider cannot serve non-cosine metrics.
		if (hasSimilarityApi) return require_graders.fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
		throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
	}

	const [expectedResult, outputResult] = await Promise.all([expected, output].map((text) => embed.call(finalProvider, text)));

	// Merge the usage of both embedding calls first, then fold it into the caller's accumulator.
	const mergedUsage = require_graders.normalizeMatcherTokenUsage(void 0);
	for (const { tokenUsage } of [expectedResult, outputResult]) {
		require_tokenUsageUtils.accumulateTokenUsage(mergedUsage, tokenUsage);
	}
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, mergedUsage);

	const embeddingError = expectedResult.error || outputResult.error;
	if (embeddingError) return require_graders.fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
	if (!expectedResult.embedding || !outputResult.embedding) return require_graders.fail("Embedding not found", tokensUsed);
	return calculateSimilarityScore(expectedResult.embedding, outputResult.embedding, metric, tokensUsed);
}
|
|
760
|
+
/**
 * Grade `output` against `expected` by embedding similarity.
 *
 * Cosine comparisons are delegated to remote grading when a redteam config is
 * present and `shouldGenerateRemote({ requireEmbeddingProvider: true })` is
 * truthy; any error from that remote call is turned into a failure result.
 * In all other cases the similarity is computed through the resolved
 * embedding provider.
 *
 * @param expected Expected text.
 * @param output Produced text.
 * @param threshold Pass threshold.
 * @param inverse Whether the assertion is negated (default `false`).
 * @param grading Optional grading config; `grading.provider` overrides the default.
 * @param metric Similarity metric (default "cosine").
 * @returns A grading result object.
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
	const useRemoteGrading = metric === "cosine" && require_logger.state.config?.redteam && require_remoteGeneration.shouldGenerateRemote({ requireEmbeddingProvider: true });
	if (useRemoteGrading) {
		try {
			const remoteRequest = {
				task: "similar",
				expected,
				output,
				threshold,
				inverse
			};
			return await require_graders.doRemoteGrading(remoteRequest);
		} catch (error) {
			return require_graders.fail(`Could not perform remote grading: ${error}`);
		}
	}

	const { embeddingProvider } = await require_graders.getDefaultProviders();
	const finalProvider = await require_graders.getAndCheckProvider("embedding", grading?.provider, embeddingProvider, "similarity check");
	const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);
	const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
	// Anything that is not a number is already a failure result; pass it through untouched.
	return typeof similarity === "number" ? buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) : similarity;
}
|
|
779
|
+
//#endregion
|
|
265
780
|
//#region src/tracing/evaluatorTracing.ts
|
|
266
781
|
let otlpReceiverStarted = false;
|
|
267
782
|
const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
|
|
@@ -305,7 +820,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
305
820
|
require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
|
|
306
821
|
try {
|
|
307
822
|
require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
308
|
-
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
823
|
+
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CvJdBGSc.cjs"));
|
|
309
824
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
310
825
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
311
826
|
const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
|
|
@@ -329,7 +844,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
329
844
|
async function stopOtlpReceiverIfNeeded() {
|
|
330
845
|
if (otlpReceiverStarted) try {
|
|
331
846
|
require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
332
|
-
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
847
|
+
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CvJdBGSc.cjs"));
|
|
333
848
|
await stopOTLPReceiver();
|
|
334
849
|
otlpReceiverStarted = false;
|
|
335
850
|
require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -364,7 +879,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
364
879
|
}
|
|
365
880
|
if (!tracingEnabled) return null;
|
|
366
881
|
require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
|
|
367
|
-
const { getTraceStore } = await Promise.resolve().then(() => require("./store-
|
|
882
|
+
const { getTraceStore } = await Promise.resolve().then(() => require("./store-B2NDDooM.cjs")).then((n) => n.store_exports);
|
|
368
883
|
const traceStore = getTraceStore();
|
|
369
884
|
const traceId = generateTraceId();
|
|
370
885
|
const spanId = generateSpanId();
|
|
@@ -406,7 +921,7 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
406
921
|
require_invariant.invariant(prompt, "answer-relevance assertion type must have a prompt");
|
|
407
922
|
return {
|
|
408
923
|
assertion,
|
|
409
|
-
...await
|
|
924
|
+
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
410
925
|
};
|
|
411
926
|
};
|
|
412
927
|
//#endregion
|
|
@@ -662,7 +1177,7 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
662
1177
|
//#region src/assertions/classifier.ts
|
|
663
1178
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
664
1179
|
require_invariant.invariant(typeof renderedValue === "string" || typeof renderedValue === "undefined", "\"classifier\" assertion type must have a string value or be undefined");
|
|
665
|
-
const classificationResult = await
|
|
1180
|
+
const classificationResult = await matchesClassification(renderedValue, outputString, assertion.threshold ?? 1, test.options);
|
|
666
1181
|
if (inverse) {
|
|
667
1182
|
classificationResult.pass = !classificationResult.pass;
|
|
668
1183
|
classificationResult.score = 1 - classificationResult.score;
|
|
@@ -674,38 +1189,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
674
1189
|
}
|
|
675
1190
|
//#endregion
|
|
676
1191
|
//#region src/assertions/contains.ts
|
|
1192
|
+
/**
 * Move past any run of whitespace and comma separators.
 *
 * Contains-any values tolerate whitespace around comma delimiters, and
 * repeated commas are skipped instead of yielding empty fields.
 *
 * @param value String being parsed.
 * @param startIndex Index to start scanning from.
 * @returns Index of the first character that is neither whitespace nor a
 *   comma (or an index at/after the end of `value`).
 */
function skipWhitespaceAndCommas(value, startIndex) {
	let cursor = startIndex;
	while (cursor < value.length) {
		cursor = skipWhitespace(value, cursor);
		const atComma = value[cursor] === ",";
		if (!atComma) return cursor;
		cursor += 1;
	}
	return cursor;
}
|
|
1207
|
+
/**
 * Move past consecutive whitespace characters only; commas are left for the
 * caller to handle.
 *
 * @param value String being parsed.
 * @param startIndex Index to start scanning from.
 * @returns Index of the first non-whitespace character at or after
 *   `startIndex`, or the position where scanning ran off the end.
 */
function skipWhitespace(value, startIndex) {
	let cursor = startIndex;
	for (; cursor < value.length; cursor += 1) {
		if (!/\s/.test(value[cursor])) break;
	}
	return cursor;
}
|
|
1215
|
+
/**
 * Read one double-quoted field using CSV-like escape rules.
 *
 * Recognised escapes are a backslash followed by a quote or backslash, and a
 * doubled quote (`""`). A field that never reaches its closing quote is
 * rejected through the invariant so malformed values do not silently pass.
 *
 * @param value String being parsed.
 * @param startIndex Index of the opening quote.
 * @returns `{ field, nextIndex }` where `nextIndex` is just past the closing quote.
 */
function parseQuotedField(value, startIndex) {
	const QUOTE = "\"";
	const BACKSLASH = "\\";
	const pieces = [];
	let cursor = startIndex + 1;
	let closed = false;

	while (cursor < value.length && !closed) {
		const current = value[cursor];
		const following = cursor + 1 < value.length ? value[cursor + 1] : void 0;
		if (current === BACKSLASH && (following === QUOTE || following === BACKSLASH)) {
			// Backslash escape: keep only the escaped character.
			pieces.push(following);
			cursor += 2;
		} else if (current === QUOTE && following === QUOTE) {
			// Doubled quote stands for one literal quote.
			pieces.push(QUOTE);
			cursor += 2;
		} else if (current === QUOTE) {
			cursor += 1;
			closed = true;
		} else {
			pieces.push(current);
			cursor += 1;
		}
	}

	require_invariant.invariant(closed, "Unterminated quoted field in contains assertion value");
	return {
		field: pieces.join(""),
		nextIndex: cursor
	};
}
|
|
1245
|
+
/**
 * Read one unquoted field: everything up to the next comma, with surrounding
 * whitespace trimmed.
 *
 * @param value String being parsed.
 * @param startIndex Index where the field begins.
 * @returns `{ field, nextIndex }` where `nextIndex` is the comma's index, or
 *   the end of the scan when no comma follows.
 */
function parseUnquotedField(value, startIndex) {
	const commaAt = value.indexOf(",", startIndex);
	// Without a comma the field runs to the end of the string; a start index
	// already past the end stays where it is.
	const stopAt = commaAt === -1 ? Math.max(startIndex, value.length) : commaAt;
	const field = value.substring(startIndex, stopAt).trim();
	return {
		field,
		nextIndex: stopAt
	};
}
|
|
1256
|
+
/**
 * Break a contains-any string into its fields, keeping commas that appear
 * inside double-quoted fields.
 *
 * Separators (whitespace and any number of commas) are skipped before each
 * field. A quoted field must be followed, after optional whitespace, by a
 * comma or the end of the string; otherwise the invariant rejects the value.
 *
 * @param value Raw assertion value.
 * @returns Array of parsed field strings.
 */
function parseCommaSeparatedValues(value) {
	const fields = [];
	for (let cursor = 0; cursor < value.length;) {
		cursor = skipWhitespaceAndCommas(value, cursor);
		if (cursor >= value.length) break;

		const quoted = value[cursor] === "\"";
		const readField = quoted ? parseQuotedField : parseUnquotedField;
		const { field, nextIndex } = readField(value, cursor);
		fields.push(field);

		// Only quoted fields may carry trailing whitespace before the delimiter.
		cursor = quoted ? skipWhitespace(value, nextIndex) : nextIndex;
		const endsAtDelimiter = !quoted || cursor >= value.length || value[cursor] === ",";
		require_invariant.invariant(endsAtDelimiter, "Expected comma after quoted field in contains assertion value");
	}
	return fields;
}
|
|
@@ -803,10 +1364,10 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
803
1364
|
require_invariant.invariant(test.vars, "context-faithfulness assertion requires a test with variables");
|
|
804
1365
|
require_invariant.invariant(typeof test.vars.query === "string", "context-faithfulness assertion requires a \"query\" variable with the user question");
|
|
805
1366
|
require_invariant.invariant(typeof output === "string", "context-faithfulness assertion requires string output from the provider");
|
|
806
|
-
const context = await
|
|
1367
|
+
const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
|
|
807
1368
|
return {
|
|
808
1369
|
assertion,
|
|
809
|
-
...await
|
|
1370
|
+
...await matchesContextFaithfulness(test.vars.query, output, context, assertion.threshold ?? 0, test.options, test.vars, providerCallContext),
|
|
810
1371
|
metadata: { context }
|
|
811
1372
|
};
|
|
812
1373
|
}
|
|
@@ -825,8 +1386,8 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
825
1386
|
const handleContextRecall = async ({ assertion, renderedValue, prompt, test, output, providerResponse, providerCallContext }) => {
|
|
826
1387
|
require_invariant.invariant(typeof renderedValue === "string", "context-recall assertion requires a string value (expected answer or fact to verify)");
|
|
827
1388
|
require_invariant.invariant(prompt, "context-recall assertion requires a prompt");
|
|
828
|
-
const context = await
|
|
829
|
-
const result = await
|
|
1389
|
+
const context = await resolveContext(assertion, test, output, prompt, prompt, providerResponse);
|
|
1390
|
+
const result = await matchesContextRecall(context, renderedValue, assertion.threshold ?? 0, test.options, test.vars, providerCallContext);
|
|
830
1391
|
return {
|
|
831
1392
|
assertion,
|
|
832
1393
|
...result,
|
|
@@ -851,8 +1412,8 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
851
1412
|
const handleContextRelevance = async ({ assertion, test, output, prompt, providerResponse, providerCallContext }) => {
|
|
852
1413
|
require_invariant.invariant(test.vars, "context-relevance assertion requires a test with variables");
|
|
853
1414
|
require_invariant.invariant(typeof test.vars.query === "string", "context-relevance assertion requires a \"query\" variable with the user question");
|
|
854
|
-
const context = await
|
|
855
|
-
const result = await
|
|
1415
|
+
const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
|
|
1416
|
+
const result = await matchesContextRelevance(test.vars.query, context, assertion.threshold ?? 0, test.options, providerCallContext);
|
|
856
1417
|
return {
|
|
857
1418
|
assertion,
|
|
858
1419
|
...result,
|
|
@@ -930,7 +1491,7 @@ function handleFinishReason({ assertion, inverse = false, renderedValue, provide
|
|
|
930
1491
|
//#region src/assertions/functionToolCall.ts
|
|
931
1492
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
932
1493
|
try {
|
|
933
|
-
if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform
|
|
1494
|
+
if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform.validateFunctionCall(output, provider.config?.tools, test.vars);
|
|
934
1495
|
else if (provider instanceof require_chat.OpenAiChatCompletionProvider) require_util$1.validateFunctionCall(output, provider.config.functions, test.vars);
|
|
935
1496
|
else throw new Error(`Provider does not have functionality for checking function call.`);
|
|
936
1497
|
return {
|
|
@@ -950,27 +1511,67 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
950
1511
|
};
|
|
951
1512
|
//#endregion
|
|
952
1513
|
//#region src/assertions/geval.ts
|
|
953
|
-
/**
 * Handle a G-Eval assertion whose value is one criterion string or an array
 * of criterion strings.
 *
 * Array values are graded one criterion at a time. Grading stops at the first
 * grader failure, which is reported as a failed assertion naming that
 * criterion. Otherwise the scores are averaged and the reasons joined.
 * `inverse` flips both the pass decision and the reported score, but it is
 * not applied to grader failures or to an empty criteria array.
 *
 * @param params.assertion The assertion being evaluated (`threshold` defaults to 0.7).
 * @param params.inverse Whether the assertion is negated. Defaults to `false`
 *   so that an omitted flag cannot turn `score >= threshold !== inverse` into
 *   an unconditional pass.
 * @param params.renderedValue Criterion string or array of criterion strings.
 * @param params.prompt Prompt text (an empty string is used when missing).
 * @param params.outputString Output to grade.
 * @param params.test Test case; `test.options` is forwarded to the grader.
 * @param params.providerCallContext Forwarded unchanged to the grader.
 * @returns The grading result for this assertion.
 */
const handleGEval = async ({ assertion, inverse = false, renderedValue, prompt, outputString, test, providerCallContext }) => {
	require_invariant.invariant(typeof renderedValue === "string" || Array.isArray(renderedValue) && renderedValue.every((value) => typeof value === "string"), "G-Eval assertion type must have a string or array of strings value");
	const threshold = assertion.threshold ?? .7;

	if (Array.isArray(renderedValue)) {
		if (renderedValue.length === 0) return {
			assertion,
			pass: false,
			score: 0,
			reason: "G-Eval assertion requires at least one criterion string in the value array."
		};

		const responses = [];
		let failure;
		for (const [index, value] of renderedValue.entries()) {
			const resp = await require_graders.matchesGEval(value, prompt || "", outputString, threshold, test.options, providerCallContext);
			responses.push(resp);
			if (require_graders.isGraderFailure(resp)) {
				// Stop at the first grader failure; later criteria are not evaluated.
				failure = {
					index,
					resp
				};
				break;
			}
		}

		// Token usage covers every call made, including a failing one.
		const tokensUsed = require_tokenUsageUtils.createEmptyTokenUsage();
		for (const r of responses) require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, r.tokensUsed);

		if (failure) {
			const criterion = renderedValue[failure.index];
			return {
				assertion,
				pass: false,
				score: 0,
				reason: `G-Eval criterion ${failure.index + 1}/${renderedValue.length} (${JSON.stringify(criterion)}) failed: ${failure.resp.reason}`,
				tokensUsed,
				metadata: failure.resp.metadata
			};
		}

		const averageScore = responses.reduce((acc, r) => acc + r.score, 0) / responses.length;
		const combinedReason = responses.map((r) => r.reason).join("\n\n");
		return {
			assertion,
			pass: averageScore >= threshold !== inverse,
			score: inverse ? 1 - averageScore : averageScore,
			reason: combinedReason,
			tokensUsed
		};
	}

	const resp = await require_graders.matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext);
	if (require_graders.isGraderFailure(resp)) return {
		assertion,
		pass: false,
		score: 0,
		reason: resp.reason,
		tokensUsed: resp.tokensUsed,
		metadata: resp.metadata
	};
	const passed = resp.score >= threshold !== inverse;
	return {
		assertion,
		...resp,
		pass: passed,
		score: inverse ? 1 - resp.score : resp.score
	};
};
|
|
976
1577
|
//#endregion
|
|
@@ -1110,6 +1711,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1110
1711
|
};
|
|
1111
1712
|
//#endregion
|
|
1112
1713
|
//#region src/assertions/html.ts
|
|
1714
|
+
/**
 * Patterns that detect a literal `<html`, `<head` or `<body` opening tag in
 * lowercased input: the tag name must be followed by whitespace, `>` or `/`.
 */
const LITERAL_WRAPPER_PATTERNS = {
	// Each lookahead keeps longer tag names (e.g. `<header`) from matching.
	html: /<html(?=[\s>/])/,
	head: /<head(?=[\s>/])/,
	body: /<body(?=[\s>/])/
};
|
|
1719
|
+
/**
 * Tell whether `tagName` is exactly one of the document wrapper tags
 * "html", "head" or "body".
 */
function isWrapperTagName(tagName) {
	return ["html", "head", "body"].includes(tagName);
}
|
|
1722
|
+
/**
 * Tell whether a parsed node is a text node, i.e. its `nodeName` is "#text".
 */
function isTextNode(node) {
	const { nodeName } = node;
	return nodeName === "#text";
}
|
|
1725
|
+
/**
 * Tell whether a parsed node is an element, identified by the presence of a
 * `tagName` property (own or inherited).
 */
function isElementNode(node) {
	return Reflect.has(node, "tagName");
}
|
|
1728
|
+
/**
 * Tell whether an element carries a usable `sourceCodeLocation`: the property
 * exists and is neither `null` nor `undefined`.
 */
function hasSourceCodeLocation(element) {
	if (!("sourceCodeLocation" in element)) return false;
	return element.sourceCodeLocation != null;
}
|
|
1731
|
+
/**
 * Return a node's `childNodes`, or an empty array for nodes that have no
 * such property.
 */
function getChildNodes(node) {
	if ("childNodes" in node) return node.childNodes;
	return [];
}
|
|
1734
|
+
/**
 * Find the first element under `root` (including `root` itself) that
 * satisfies `predicate`, visiting nodes depth-first in document order.
 *
 * Uses an explicit stack rather than recursion.
 *
 * @param root Node to start from.
 * @param predicate Called with each element node; return truthy to select it.
 * @returns The matching element, or `undefined` when none matches.
 */
function findFirstElement(root, predicate) {
	const pending = [root];
	while (pending.length > 0) {
		const node = pending.pop();
		if (isElementNode(node) && predicate(node)) return node;
		// Push children last-to-first so the first child is popped next.
		const children = getChildNodes(node);
		let index = children.length;
		while (index-- > 0) pending.push(children[index]);
	}
}
|
|
1743
|
+
/**
 * Tell whether any direct child of `parentNode` is a text node containing
 * something other than whitespace.
 */
function hasTopLevelText(parentNode) {
	const isMeaningfulText = (node) => isTextNode(node) && node.value.trim().length > 0;
	return parentNode.childNodes.some(isMeaningfulText);
}
|
|
1746
|
+
/**
 * Decide whether a parsed element counts as HTML that was present in the input.
 *
 * Wrapper tags (html/head/body) count only when the lowercased input matches
 * the tag's literal pattern and the element has a source location. Any other
 * tag counts when it is a known HTML element or its name contains a hyphen.
 *
 * @param element Parsed element with a `tagName`.
 * @param inputLowercase The lowercased source text.
 * @returns `true` when the element should be treated as user-provided.
 */
function isUserProvidedElement(element, inputLowercase) {
	const tagName = element.tagName.toLowerCase();
	if (!isWrapperTagName(tagName)) {
		return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
	}
	const writtenLiterally = LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase);
	return writtenLiterally && hasSourceCodeLocation(element);
}
|
|
1113
1751
|
const HTML_PATTERNS = {
|
|
1114
1752
|
openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
|
|
1115
1753
|
closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
|
|
@@ -1265,37 +1903,21 @@ function validateHtml(htmlString) {
|
|
|
1265
1903
|
isValid: false,
|
|
1266
1904
|
reason: "Output appears to be XML, not HTML"
|
|
1267
1905
|
};
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
|
|
1284
|
-
return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
|
|
1285
|
-
})) return {
|
|
1286
|
-
isValid: false,
|
|
1287
|
-
reason: "Output does not contain recognized HTML elements"
|
|
1288
|
-
};
|
|
1289
|
-
return {
|
|
1290
|
-
isValid: true,
|
|
1291
|
-
reason: "Output is valid HTML"
|
|
1292
|
-
};
|
|
1293
|
-
} catch (error) {
|
|
1294
|
-
return {
|
|
1295
|
-
isValid: false,
|
|
1296
|
-
reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
1297
|
-
};
|
|
1298
|
-
}
|
|
1906
|
+
const document = (0, parse5.parse)(trimmed, { sourceCodeLocationInfo: true });
|
|
1907
|
+
const inputLowercase = trimmed.toLowerCase();
|
|
1908
|
+
const body = findFirstElement(document, (element) => element.tagName === "body");
|
|
1909
|
+
if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
|
|
1910
|
+
isValid: false,
|
|
1911
|
+
reason: "Output must be wrapped in HTML tags"
|
|
1912
|
+
};
|
|
1913
|
+
if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
|
|
1914
|
+
isValid: false,
|
|
1915
|
+
reason: "Output does not contain recognized HTML elements"
|
|
1916
|
+
};
|
|
1917
|
+
return {
|
|
1918
|
+
isValid: true,
|
|
1919
|
+
reason: "Output is valid HTML"
|
|
1920
|
+
};
|
|
1299
1921
|
}
|
|
1300
1922
|
const handleContainsHtml = ({ assertion, outputString, inverse }) => {
|
|
1301
1923
|
const pass = containsHtml(outputString) !== inverse;
|
|
@@ -1460,7 +2082,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
1460
2082
|
let result;
|
|
1461
2083
|
if (typeof valueFromScript === "undefined") {
|
|
1462
2084
|
const functionBody = renderedValue.includes("\n") ? renderedValue : buildFunctionBody(renderedValue);
|
|
1463
|
-
result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext,
|
|
2085
|
+
result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext, require_processShim.getProcessShim()));
|
|
1464
2086
|
} else {
|
|
1465
2087
|
require_invariant.invariant(typeof valueFromScript === "boolean" || typeof valueFromScript === "number" || typeof valueFromScript === "object", `Javascript assertion script must return a boolean, number, or object (${assertion.value})`);
|
|
1466
2088
|
result = await validateResult(valueFromScript);
|
|
@@ -1667,7 +2289,7 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1667
2289
|
const parsedPrompt = require_fetch.parseChatPrompt(promptToModerate, null);
|
|
1668
2290
|
if (parsedPrompt && parsedPrompt.length > 0) promptToModerate = getLastModerationPrompt(parsedPrompt) ?? promptToModerate;
|
|
1669
2291
|
} catch {}
|
|
1670
|
-
const moderationResult = await
|
|
2292
|
+
const moderationResult = await matchesModeration({
|
|
1671
2293
|
userPrompt: promptToModerate,
|
|
1672
2294
|
assistantResponse: outputString,
|
|
1673
2295
|
categories: Array.isArray(assertion.value) ? assertion.value : []
|
|
@@ -1952,45 +2574,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
1952
2574
|
}
|
|
1953
2575
|
//#endregion
|
|
1954
2576
|
//#region src/assertions/trajectoryUtils.ts
|
|
1955
|
-
const TOOL_ATTRIBUTE_KEYS = [
|
|
1956
|
-
"tool.name",
|
|
1957
|
-
"tool_name",
|
|
1958
|
-
"tool",
|
|
1959
|
-
"function.name",
|
|
1960
|
-
"function_name",
|
|
1961
|
-
"gen_ai.tool.name",
|
|
1962
|
-
"codex.mcp.tool",
|
|
1963
|
-
"agent.tool",
|
|
1964
|
-
"agent.tool_name",
|
|
1965
|
-
"agent.toolName"
|
|
1966
|
-
];
|
|
1967
|
-
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
|
|
1968
|
-
"tool.arguments",
|
|
1969
|
-
"tool.args",
|
|
1970
|
-
"tool.input",
|
|
1971
|
-
"tool_arguments",
|
|
1972
|
-
"tool_args",
|
|
1973
|
-
"tool_input",
|
|
1974
|
-
"function.arguments",
|
|
1975
|
-
"function.args",
|
|
1976
|
-
"function.input",
|
|
1977
|
-
"function_arguments",
|
|
1978
|
-
"function_args",
|
|
1979
|
-
"gen_ai.tool.arguments",
|
|
1980
|
-
"gen_ai.tool.args",
|
|
1981
|
-
"gen_ai.tool.input",
|
|
1982
|
-
"gen_ai.tool.call.arguments",
|
|
1983
|
-
"gen_ai.tool.call.args",
|
|
1984
|
-
"agent.tool.arguments",
|
|
1985
|
-
"agent.tool.args",
|
|
1986
|
-
"agent.tool.input",
|
|
1987
|
-
"codex.mcp.arguments",
|
|
1988
|
-
"codex.mcp.args",
|
|
1989
|
-
"codex.mcp.input",
|
|
1990
|
-
"arguments",
|
|
1991
|
-
"args",
|
|
1992
|
-
"input"
|
|
1993
|
-
];
|
|
1994
2577
|
const COMMAND_ATTRIBUTE_KEYS = [
|
|
1995
2578
|
"codex.command",
|
|
1996
2579
|
"command",
|
|
@@ -2003,16 +2586,15 @@ const SEARCH_ATTRIBUTE_KEYS = [
|
|
|
2003
2586
|
"search_query"
|
|
2004
2587
|
];
|
|
2005
2588
|
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
|
|
2589
|
+
/**
 * Lowercase tool names checked by `isCommandToolName`.
 */
const COMMAND_TOOL_NAMES = new Set(["exec_command", "local_shell", "shell"]);
|
|
2006
2594
|
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
|
|
2007
2595
|
const MAX_JUDGE_SUMMARY_STEPS = 24;
|
|
2008
2596
|
const JUDGE_SUMMARY_HEAD_STEPS = 12;
|
|
2009
2597
|
const JUDGE_SUMMARY_TAIL_STEPS = 12;
|
|
2010
|
-
function getStringAttribute(attributes, keys) {
|
|
2011
|
-
for (const key of keys) {
|
|
2012
|
-
const value = attributes[key];
|
|
2013
|
-
if (typeof value === "string" && value.trim()) return value.trim();
|
|
2014
|
-
}
|
|
2015
|
-
}
|
|
2016
2598
|
function normalizeStructuredAttribute(value) {
|
|
2017
2599
|
if (value === void 0 || value === null) return;
|
|
2018
2600
|
if (typeof value === "string") {
|
|
@@ -2044,9 +2626,12 @@ function getTrajectoryStepStatus(step) {
|
|
|
2044
2626
|
function getCommandExecutable(command) {
|
|
2045
2627
|
return command.trim().split(/\s+/)[0] || void 0;
|
|
2046
2628
|
}
|
|
2629
|
+
function isCommandToolName(toolName) {
|
|
2630
|
+
return !!toolName && COMMAND_TOOL_NAMES.has(toolName.trim().toLowerCase());
|
|
2631
|
+
}
|
|
2047
2632
|
function extractToolName(span) {
|
|
2048
2633
|
const attributes = span.attributes || {};
|
|
2049
|
-
const directMatch =
|
|
2634
|
+
const directMatch = require_toolAttributes.getToolNameFromAttributes(attributes);
|
|
2050
2635
|
if (directMatch) return directMatch;
|
|
2051
2636
|
for (const [key, value] of Object.entries(attributes)) {
|
|
2052
2637
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
@@ -2060,7 +2645,7 @@ function extractToolName(span) {
|
|
|
2060
2645
|
}
|
|
2061
2646
|
function extractToolArgs(span) {
|
|
2062
2647
|
const attributes = span.attributes || {};
|
|
2063
|
-
for (const key of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
|
|
2648
|
+
for (const key of require_toolAttributes.TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
|
|
2064
2649
|
const value = normalizeStructuredAttribute(attributes[key]);
|
|
2065
2650
|
if (value !== void 0) return value;
|
|
2066
2651
|
}
|
|
@@ -2071,21 +2656,31 @@ function extractToolArgs(span) {
|
|
|
2071
2656
|
if (value !== void 0) return value;
|
|
2072
2657
|
}
|
|
2073
2658
|
}
|
|
2074
|
-
function extractCommand(span) {
|
|
2659
|
+
function extractCommand(span, toolName = extractToolName(span), getToolArgs = () => extractToolArgs(span)) {
|
|
2075
2660
|
const attributes = span.attributes || {};
|
|
2076
|
-
const directMatch =
|
|
2661
|
+
const directMatch = require_toolAttributes.getFirstStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
|
|
2077
2662
|
if (directMatch) return directMatch;
|
|
2078
2663
|
for (const [key, value] of Object.entries(attributes)) {
|
|
2079
2664
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
2080
2665
|
if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
|
|
2081
2666
|
}
|
|
2667
|
+
const toolArgs = getToolArgs();
|
|
2668
|
+
if (isCommandToolName(toolName) && toolArgs && typeof toolArgs === "object") {
|
|
2669
|
+
const args = toolArgs;
|
|
2670
|
+
const command = args.cmd ?? args.command;
|
|
2671
|
+
if (typeof command === "string" && command.trim()) return command.trim();
|
|
2672
|
+
if (Array.isArray(command)) {
|
|
2673
|
+
const joined = command.map((part) => String(part).trim()).filter(Boolean).join(" ");
|
|
2674
|
+
if (joined) return joined;
|
|
2675
|
+
}
|
|
2676
|
+
}
|
|
2082
2677
|
if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
|
|
2083
2678
|
}
|
|
2084
2679
|
function extractSearchQuery(span) {
|
|
2085
2680
|
const attributes = span.attributes || {};
|
|
2086
|
-
const directMatch =
|
|
2681
|
+
const directMatch = require_toolAttributes.getFirstStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
|
|
2087
2682
|
if (directMatch) return directMatch;
|
|
2088
|
-
const genericQuery =
|
|
2683
|
+
const genericQuery = require_toolAttributes.getFirstStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
|
|
2089
2684
|
if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
|
|
2090
2685
|
if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
|
|
2091
2686
|
}
|
|
@@ -2109,17 +2704,34 @@ function extractTrajectorySteps(trace) {
|
|
|
2109
2704
|
return left.index - right.index;
|
|
2110
2705
|
}).map(({ span }) => {
|
|
2111
2706
|
const toolName = extractToolName(span);
|
|
2112
|
-
|
|
2707
|
+
let toolArgs;
|
|
2708
|
+
let hasExtractedToolArgs = false;
|
|
2709
|
+
const getToolArgs = () => {
|
|
2710
|
+
if (!hasExtractedToolArgs) {
|
|
2711
|
+
toolArgs = extractToolArgs(span);
|
|
2712
|
+
hasExtractedToolArgs = true;
|
|
2713
|
+
}
|
|
2714
|
+
return toolArgs;
|
|
2715
|
+
};
|
|
2716
|
+
const command = extractCommand(span, toolName, getToolArgs);
|
|
2113
2717
|
const searchQuery = extractSearchQuery(span);
|
|
2114
2718
|
let type = "span";
|
|
2115
2719
|
let name = span.name;
|
|
2116
2720
|
const aliases = new Set([span.name]);
|
|
2117
2721
|
let args;
|
|
2118
|
-
if (toolName) {
|
|
2722
|
+
if (command && isCommandToolName(toolName)) {
|
|
2723
|
+
type = "command";
|
|
2724
|
+
name = command;
|
|
2725
|
+
aliases.add(command);
|
|
2726
|
+
args = getToolArgs();
|
|
2727
|
+
if (toolName) aliases.add(toolName);
|
|
2728
|
+
const executable = getCommandExecutable(command);
|
|
2729
|
+
if (executable) aliases.add(executable);
|
|
2730
|
+
} else if (toolName) {
|
|
2119
2731
|
type = "tool";
|
|
2120
2732
|
name = toolName;
|
|
2121
2733
|
aliases.add(toolName);
|
|
2122
|
-
args =
|
|
2734
|
+
args = getToolArgs();
|
|
2123
2735
|
} else if (command) {
|
|
2124
2736
|
type = "command";
|
|
2125
2737
|
name = command;
|
|
@@ -2295,7 +2907,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
2295
2907
|
if (match) evalId = match[1];
|
|
2296
2908
|
}
|
|
2297
2909
|
}
|
|
2298
|
-
const tracking = await
|
|
2910
|
+
const tracking = await require_indirectWebPwn.checkExfilTracking(webPageUuid, evalId);
|
|
2299
2911
|
if (tracking) gradingContext = {
|
|
2300
2912
|
...gradingContext,
|
|
2301
2913
|
wasExfiltrated: tracking.wasExfiltrated,
|
|
@@ -2361,7 +2973,7 @@ function handleIsRefusal(params) {
|
|
|
2361
2973
|
assertion
|
|
2362
2974
|
};
|
|
2363
2975
|
}
|
|
2364
|
-
const pass =
|
|
2976
|
+
const pass = require_util$2.isBasicRefusal(output) !== inverse;
|
|
2365
2977
|
return {
|
|
2366
2978
|
pass,
|
|
2367
2979
|
score: pass ? 1 : 0,
|
|
@@ -2400,11 +3012,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
2400
3012
|
const rougeMethod = js_rouge[baseType[baseType.length - 1]];
|
|
2401
3013
|
const score = rougeMethod(outputString, renderedValue, {});
|
|
2402
3014
|
const threshold = assertion.threshold ?? .75;
|
|
2403
|
-
const pass = score >= threshold != inverse;
|
|
2404
3015
|
return {
|
|
2405
|
-
pass,
|
|
3016
|
+
pass: score >= threshold !== inverse,
|
|
2406
3017
|
score: inverse ? 1 - score : score,
|
|
2407
|
-
reason:
|
|
3018
|
+
reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
|
|
2408
3019
|
assertion
|
|
2409
3020
|
};
|
|
2410
3021
|
}
|
|
@@ -2466,10 +3077,196 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
|
|
|
2466
3077
|
}
|
|
2467
3078
|
};
|
|
2468
3079
|
//#endregion
|
|
3080
|
+
//#region src/providers/webSearchUtils.ts
|
|
3081
|
+
function hasTool(provider, predicate) {
|
|
3082
|
+
return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
|
|
3083
|
+
}
|
|
3084
|
+
function getProviderId(provider) {
|
|
3085
|
+
if (typeof provider.id !== "function") return null;
|
|
3086
|
+
try {
|
|
3087
|
+
return provider.id();
|
|
3088
|
+
} catch (err) {
|
|
3089
|
+
require_logger.logger.debug(`Failed to read provider id: ${err}`);
|
|
3090
|
+
return null;
|
|
3091
|
+
}
|
|
3092
|
+
}
|
|
3093
|
+
function isOpenAiResponsesProvider(provider, id) {
|
|
3094
|
+
return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
|
|
3095
|
+
}
|
|
3096
|
+
/**
|
|
3097
|
+
* Check if a provider has web search capabilities
|
|
3098
|
+
* @param provider The provider to check
|
|
3099
|
+
* @returns true if the provider supports web search
|
|
3100
|
+
*/
|
|
3101
|
+
function hasWebSearchCapability(provider) {
|
|
3102
|
+
if (!provider) return false;
|
|
3103
|
+
const id = getProviderId(provider);
|
|
3104
|
+
if (!id) return false;
|
|
3105
|
+
if (id.includes("perplexity")) return true;
|
|
3106
|
+
if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
|
|
3107
|
+
if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
|
|
3108
|
+
if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
|
|
3109
|
+
if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
|
|
3110
|
+
if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
|
|
3111
|
+
return false;
|
|
3112
|
+
}
|
|
3113
|
+
/**
|
|
3114
|
+
* Load a provider with web search capabilities.
|
|
3115
|
+
* Tries multiple providers in order of preference until one succeeds.
|
|
3116
|
+
* Uses the latest and most capable models from each provider with specific checkpoint IDs.
|
|
3117
|
+
*
|
|
3118
|
+
* @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
|
|
3119
|
+
* @returns A provider with web search capabilities or null
|
|
3120
|
+
*/
|
|
3121
|
+
async function loadWebSearchProvider(preferAnthropic = false) {
|
|
3122
|
+
const loadAnthropicWebSearch = async () => {
|
|
3123
|
+
try {
|
|
3124
|
+
return await require_providers.loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
|
|
3125
|
+
type: "web_search_20250305",
|
|
3126
|
+
name: "web_search",
|
|
3127
|
+
max_uses: 5
|
|
3128
|
+
}] } } });
|
|
3129
|
+
} catch (err) {
|
|
3130
|
+
require_logger.logger.debug(`Failed to load Anthropic web search provider: ${err}`);
|
|
3131
|
+
return null;
|
|
3132
|
+
}
|
|
3133
|
+
};
|
|
3134
|
+
const loadOpenAIWebSearch = async () => {
|
|
3135
|
+
try {
|
|
3136
|
+
return await require_providers.loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
|
|
3137
|
+
} catch (err) {
|
|
3138
|
+
require_logger.logger.debug(`Failed to load OpenAI web search provider: ${err}`);
|
|
3139
|
+
return null;
|
|
3140
|
+
}
|
|
3141
|
+
};
|
|
3142
|
+
const loadPerplexity = async () => {
|
|
3143
|
+
try {
|
|
3144
|
+
return await require_providers.loadApiProvider("perplexity:sonar-pro");
|
|
3145
|
+
} catch (err) {
|
|
3146
|
+
require_logger.logger.debug(`Failed to load Perplexity provider: ${err}`);
|
|
3147
|
+
return null;
|
|
3148
|
+
}
|
|
3149
|
+
};
|
|
3150
|
+
const loadGoogleWebSearch = async () => {
|
|
3151
|
+
try {
|
|
3152
|
+
return await require_providers.loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3153
|
+
} catch (err) {
|
|
3154
|
+
require_logger.logger.debug(`Failed to load Google web search provider: ${err}`);
|
|
3155
|
+
return null;
|
|
3156
|
+
}
|
|
3157
|
+
};
|
|
3158
|
+
const loadVertexWebSearch = async () => {
|
|
3159
|
+
try {
|
|
3160
|
+
return await require_providers.loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3161
|
+
} catch (err) {
|
|
3162
|
+
require_logger.logger.debug(`Failed to load Vertex web search provider: ${err}`);
|
|
3163
|
+
return null;
|
|
3164
|
+
}
|
|
3165
|
+
};
|
|
3166
|
+
const loadXaiWebSearch = async () => {
|
|
3167
|
+
try {
|
|
3168
|
+
return await require_providers.loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
|
|
3169
|
+
} catch (err) {
|
|
3170
|
+
require_logger.logger.debug(`Failed to load xAI web search provider: ${err}`);
|
|
3171
|
+
return null;
|
|
3172
|
+
}
|
|
3173
|
+
};
|
|
3174
|
+
const providers = preferAnthropic ? [
|
|
3175
|
+
loadAnthropicWebSearch,
|
|
3176
|
+
loadOpenAIWebSearch,
|
|
3177
|
+
loadPerplexity,
|
|
3178
|
+
loadGoogleWebSearch,
|
|
3179
|
+
loadVertexWebSearch,
|
|
3180
|
+
loadXaiWebSearch
|
|
3181
|
+
] : [
|
|
3182
|
+
loadOpenAIWebSearch,
|
|
3183
|
+
loadAnthropicWebSearch,
|
|
3184
|
+
loadPerplexity,
|
|
3185
|
+
loadGoogleWebSearch,
|
|
3186
|
+
loadVertexWebSearch,
|
|
3187
|
+
loadXaiWebSearch
|
|
3188
|
+
];
|
|
3189
|
+
for (const getProvider of providers) {
|
|
3190
|
+
const provider = await getProvider();
|
|
3191
|
+
if (provider && hasWebSearchCapability(provider)) {
|
|
3192
|
+
require_logger.logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
|
|
3193
|
+
return provider;
|
|
3194
|
+
}
|
|
3195
|
+
if (provider) require_logger.logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
|
|
3196
|
+
}
|
|
3197
|
+
return null;
|
|
3198
|
+
}
|
|
3199
|
+
//#endregion
|
|
3200
|
+
//#region src/matchers/search.ts
|
|
3201
|
+
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
|
|
3202
|
+
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
3203
|
+
const defaultProviders = await require_graders.getDefaultProviders();
|
|
3204
|
+
const defaultSearchProviders = [
|
|
3205
|
+
defaultProviders.webSearchProvider,
|
|
3206
|
+
defaultProviders.llmRubricProvider,
|
|
3207
|
+
defaultProviders.gradingProvider
|
|
3208
|
+
];
|
|
3209
|
+
let searchProvider = (grading.provider ? await require_graders.getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
|
|
3210
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3211
|
+
const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
|
|
3212
|
+
if (webSearchDefault) searchProvider = webSearchDefault;
|
|
3213
|
+
}
|
|
3214
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3215
|
+
const webSearchProvider = await loadWebSearchProvider(true);
|
|
3216
|
+
if (webSearchProvider) searchProvider = webSearchProvider;
|
|
3217
|
+
}
|
|
3218
|
+
if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${require_graders.DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
|
|
3219
|
+
const prompt = await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.DEFAULT_WEB_SEARCH_PROMPT), {
|
|
3220
|
+
output: require_graders.tryParse(llmOutput),
|
|
3221
|
+
rubric,
|
|
3222
|
+
...vars || {}
|
|
3223
|
+
});
|
|
3224
|
+
const resp = await require_graders.callProviderWithContext(searchProvider, prompt, "search-rubric", {
|
|
3225
|
+
output: require_graders.tryParse(llmOutput),
|
|
3226
|
+
rubric,
|
|
3227
|
+
...vars || {}
|
|
3228
|
+
}, providerCallContext);
|
|
3229
|
+
if (resp.error || !resp.output) return {
|
|
3230
|
+
pass: false,
|
|
3231
|
+
score: 0,
|
|
3232
|
+
reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
|
|
3233
|
+
tokensUsed: resp.tokenUsage,
|
|
3234
|
+
assertion
|
|
3235
|
+
};
|
|
3236
|
+
try {
|
|
3237
|
+
const result = require_logger.extractFirstJsonObject(String(resp.output));
|
|
3238
|
+
let pass = result.pass ?? false;
|
|
3239
|
+
const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
|
|
3240
|
+
if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
|
|
3241
|
+
return {
|
|
3242
|
+
pass,
|
|
3243
|
+
score,
|
|
3244
|
+
reason: result.reason || "No reason provided",
|
|
3245
|
+
tokensUsed: resp.tokenUsage,
|
|
3246
|
+
assertion,
|
|
3247
|
+
metadata: {
|
|
3248
|
+
searchResults: result.searchResults || [],
|
|
3249
|
+
searchProvider: searchProvider.id()
|
|
3250
|
+
}
|
|
3251
|
+
};
|
|
3252
|
+
} catch (err) {
|
|
3253
|
+
require_logger.logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
|
|
3254
|
+
const outputLower = String(resp.output).toLowerCase();
|
|
3255
|
+
const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
|
|
3256
|
+
return {
|
|
3257
|
+
pass,
|
|
3258
|
+
score: pass ? 1 : 0,
|
|
3259
|
+
reason: resp.output,
|
|
3260
|
+
tokensUsed: resp.tokenUsage,
|
|
3261
|
+
assertion
|
|
3262
|
+
};
|
|
3263
|
+
}
|
|
3264
|
+
}
|
|
3265
|
+
//#endregion
|
|
2469
3266
|
//#region src/assertions/searchRubric.ts
|
|
2470
3267
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
2471
3268
|
if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
|
|
2472
|
-
const result = await
|
|
3269
|
+
const result = await matchesSearchRubric(String(renderedValue), providerResponse.output, test.options, test.vars, assertion, provider, providerCallContext);
|
|
2473
3270
|
if (inverse) {
|
|
2474
3271
|
result.pass = !result.pass;
|
|
2475
3272
|
result.reason = result.pass ? `Output does not require web search verification: ${result.reason}` : `Output requires web search verification: ${result.reason}`;
|
|
@@ -2500,7 +3297,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2500
3297
|
if (Array.isArray(renderedValue)) {
|
|
2501
3298
|
let minScore = Number.POSITIVE_INFINITY;
|
|
2502
3299
|
for (const value of renderedValue) {
|
|
2503
|
-
const result = await
|
|
3300
|
+
const result = await matchesSimilarity(value, outputString, threshold, inverse, test.options, metric);
|
|
2504
3301
|
if (result.pass) return {
|
|
2505
3302
|
assertion,
|
|
2506
3303
|
...result
|
|
@@ -2515,7 +3312,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2515
3312
|
};
|
|
2516
3313
|
} else return {
|
|
2517
3314
|
assertion,
|
|
2518
|
-
...await
|
|
3315
|
+
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2519
3316
|
};
|
|
2520
3317
|
};
|
|
2521
3318
|
//#endregion
|
|
@@ -3121,13 +3918,13 @@ function resolveSequenceValue(value) {
|
|
|
3121
3918
|
}
|
|
3122
3919
|
throw new Error("trajectory:tool-sequence assertion must have an array or object value");
|
|
3123
3920
|
}
|
|
3124
|
-
function isRecord(value) {
|
|
3921
|
+
function isRecord$1(value) {
|
|
3125
3922
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3126
3923
|
}
|
|
3127
3924
|
function matchesExpectedArgsPartial(actual, expected) {
|
|
3128
3925
|
if (Array.isArray(expected)) return Array.isArray(actual) && actual.length === expected.length && expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
|
|
3129
|
-
if (isRecord(expected)) {
|
|
3130
|
-
if (!isRecord(actual)) return false;
|
|
3926
|
+
if (isRecord$1(expected)) {
|
|
3927
|
+
if (!isRecord$1(actual)) return false;
|
|
3131
3928
|
return Object.entries(expected).every(([key, expectedValue]) => Object.prototype.hasOwnProperty.call(actual, key) && matchesExpectedArgsPartial(actual[key], expectedValue));
|
|
3132
3929
|
}
|
|
3133
3930
|
return (0, node_util.isDeepStrictEqual)(actual, expected);
|
|
@@ -3484,7 +4281,7 @@ function assertionMayNeedTraceContext(assertion) {
|
|
|
3484
4281
|
if (assertionUsesTrace(assertion)) return true;
|
|
3485
4282
|
if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
|
|
3486
4283
|
if (assertion.type.startsWith("promptfoo:redteam:coding-agent:")) return true;
|
|
3487
|
-
return typeof assertion.value === "string" ? assertion.value.startsWith("file://") ||
|
|
4284
|
+
return typeof assertion.value === "string" ? assertion.value.startsWith("file://") || require_packageParser.isPackagePath(assertion.value) : false;
|
|
3488
4285
|
}
|
|
3489
4286
|
function hasTraceAwareAssertions(assertions) {
|
|
3490
4287
|
return Boolean(assertions?.some(assertionMayNeedTraceContext));
|
|
@@ -3498,7 +4295,7 @@ async function loadTraceData(traceId) {
|
|
|
3498
4295
|
let stableObservations = 0;
|
|
3499
4296
|
let latestTrace = null;
|
|
3500
4297
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
3501
|
-
latestTrace = await traceStore.getTrace(traceId);
|
|
4298
|
+
latestTrace = await traceStore.getTrace(traceId, { sanitizeAttributes: false });
|
|
3502
4299
|
const spanCount = latestTrace?.spans?.length ?? 0;
|
|
3503
4300
|
if (spanCount > 0) {
|
|
3504
4301
|
stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
|
|
@@ -3551,7 +4348,7 @@ const ASSERTION_HANDLERS = {
|
|
|
3551
4348
|
"llm-rubric": handleLlmRubric,
|
|
3552
4349
|
meteor: async (params) => {
|
|
3553
4350
|
try {
|
|
3554
|
-
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-
|
|
4351
|
+
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-CR226f7Z.cjs"));
|
|
3555
4352
|
return handleMeteorAssertion(params);
|
|
3556
4353
|
} catch (error) {
|
|
3557
4354
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -3633,7 +4430,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3633
4430
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
3634
4431
|
let output = originalOutput;
|
|
3635
4432
|
require_invariant.invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
3636
|
-
if (assertion.transform) output = await require_transform.transform(assertion.transform, output, {
|
|
4433
|
+
if (assertion.transform) output = await require_transform$1.transform(assertion.transform, output, {
|
|
3637
4434
|
vars: resolvedVars,
|
|
3638
4435
|
prompt: { label: prompt },
|
|
3639
4436
|
...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
|
|
@@ -3687,7 +4484,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3687
4484
|
};
|
|
3688
4485
|
}
|
|
3689
4486
|
else if (filePath.endsWith(".rb")) try {
|
|
3690
|
-
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-
|
|
4487
|
+
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CqUWBZAt.cjs")).then((n) => n.rubyUtils_exports);
|
|
3691
4488
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
3692
4489
|
require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
3693
4490
|
} catch (error) {
|
|
@@ -3699,9 +4496,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3699
4496
|
};
|
|
3700
4497
|
}
|
|
3701
4498
|
else renderedValue = require_graders.processFileReference(renderedValue);
|
|
3702
|
-
} else if (
|
|
4499
|
+
} else if (require_packageParser.isPackagePath(renderedValue)) {
|
|
3703
4500
|
const basePath = require_logger.state.basePath || "";
|
|
3704
|
-
const requiredModule = await
|
|
4501
|
+
const requiredModule = await require_packageParser.loadFromPackage(renderedValue, basePath);
|
|
3705
4502
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
3706
4503
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
3707
4504
|
} else renderedValue = nunjucks.renderString(renderedValue, resolvedVars);
|
|
@@ -3804,7 +4601,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3804
4601
|
require_logger.logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3805
4602
|
preloadedTraceData = null;
|
|
3806
4603
|
}
|
|
3807
|
-
|
|
4604
|
+
const concurrency = require_graders.getProviderCallExecutionContext()?.providerCallQueue ? 1 : ASSERTIONS_MAX_CONCURRENCY;
|
|
4605
|
+
await async.default.forEachOfLimit(asserts, concurrency, async ({ assertion, assertResult, index }) => {
|
|
3808
4606
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
3809
4607
|
const result = await runAssertion({
|
|
3810
4608
|
prompt,
|
|
@@ -3840,7 +4638,7 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3840
4638
|
async function runCompareAssertion(test, assertion, outputs, context) {
|
|
3841
4639
|
require_invariant.invariant(typeof assertion.value === "string", "select-best must have a string value");
|
|
3842
4640
|
test = require_graders.getFinalTest(test, assertion);
|
|
3843
|
-
return (await
|
|
4641
|
+
return (await matchesSelectBest(assertion.value, outputs, test.options, test.vars, context)).map((result) => ({
|
|
3844
4642
|
...result,
|
|
3845
4643
|
assertion
|
|
3846
4644
|
}));
|
|
@@ -3857,17 +4655,17 @@ async function readAssertions(filePath) {
|
|
|
3857
4655
|
var assertions_default = {
|
|
3858
4656
|
runAssertion,
|
|
3859
4657
|
runAssertions,
|
|
3860
|
-
matchesSimilarity
|
|
3861
|
-
matchesClassification
|
|
4658
|
+
matchesSimilarity,
|
|
4659
|
+
matchesClassification,
|
|
3862
4660
|
matchesLlmRubric: require_graders.matchesLlmRubric,
|
|
3863
4661
|
matchesFactuality: require_graders.matchesFactuality,
|
|
3864
4662
|
matchesClosedQa: require_graders.matchesClosedQa,
|
|
3865
|
-
matchesAnswerRelevance
|
|
3866
|
-
matchesContextRecall
|
|
3867
|
-
matchesContextRelevance
|
|
3868
|
-
matchesContextFaithfulness
|
|
3869
|
-
matchesComparisonBoolean:
|
|
3870
|
-
matchesModeration
|
|
4663
|
+
matchesAnswerRelevance,
|
|
4664
|
+
matchesContextRecall,
|
|
4665
|
+
matchesContextRelevance,
|
|
4666
|
+
matchesContextFaithfulness,
|
|
4667
|
+
matchesComparisonBoolean: matchesSelectBest,
|
|
4668
|
+
matchesModeration,
|
|
3871
4669
|
matchesConversationRelevance
|
|
3872
4670
|
};
|
|
3873
4671
|
//#endregion
|
|
@@ -3931,7 +4729,8 @@ var CIProgressReporter = class {
|
|
|
3931
4729
|
}
|
|
3932
4730
|
updateTotalTests(newTotal) {
|
|
3933
4731
|
this.totalTests = Math.max(newTotal, 1);
|
|
3934
|
-
|
|
4732
|
+
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
4733
|
+
this.highestPercentageSeen = percentage;
|
|
3935
4734
|
}
|
|
3936
4735
|
finish() {
|
|
3937
4736
|
if (this.intervalId) {
|
|
@@ -4104,6 +4903,10 @@ function getDefaultOtelConfig() {
|
|
|
4104
4903
|
}
|
|
4105
4904
|
//#endregion
|
|
4106
4905
|
//#region src/tracing/localSpanExporter.ts
|
|
4906
|
+
const MISSING_TRACE_RETRY_DELAY_MS = 50;
|
|
4907
|
+
function delay(ms) {
|
|
4908
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
4909
|
+
}
|
|
4107
4910
|
/**
|
|
4108
4911
|
* A span exporter that writes spans to the local TraceStore (SQLite).
|
|
4109
4912
|
* This allows OTEL spans to be stored locally for analysis in the promptfoo UI.
|
|
@@ -4145,7 +4948,7 @@ var LocalSpanExporter = class {
|
|
|
4145
4948
|
}
|
|
4146
4949
|
let firstError;
|
|
4147
4950
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
4148
|
-
const result = await
|
|
4951
|
+
const result = await this.addSpansWithTraceRetry(traceStore, traceId, spanDataList);
|
|
4149
4952
|
if (result.stored) require_logger.logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
4150
4953
|
else require_logger.logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
4151
4954
|
} catch (error) {
|
|
@@ -4157,6 +4960,16 @@ var LocalSpanExporter = class {
|
|
|
4157
4960
|
}
|
|
4158
4961
|
return firstError;
|
|
4159
4962
|
}
|
|
4963
|
+
async addSpansWithTraceRetry(traceStore, traceId, spans) {
|
|
4964
|
+
const options = {
|
|
4965
|
+
skipTraceCheck: false,
|
|
4966
|
+
warnIfMissingTrace: false
|
|
4967
|
+
};
|
|
4968
|
+
const result = await traceStore.addSpans(traceId, spans, options);
|
|
4969
|
+
if (result.stored) return result;
|
|
4970
|
+
await delay(MISSING_TRACE_RETRY_DELAY_MS);
|
|
4971
|
+
return traceStore.addSpans(traceId, spans, options);
|
|
4972
|
+
}
|
|
4160
4973
|
/**
|
|
4161
4974
|
* Convert an OTEL ReadableSpan to our SpanData format.
|
|
4162
4975
|
*/
|
|
@@ -4242,7 +5055,7 @@ function initializeOtel(config) {
|
|
|
4242
5055
|
require_logger.logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
4243
5056
|
const resource = (0, _opentelemetry_resources.resourceFromAttributes)({
|
|
4244
5057
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_NAME]: config.serviceName,
|
|
4245
|
-
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]:
|
|
5058
|
+
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_version.VERSION
|
|
4246
5059
|
});
|
|
4247
5060
|
const spanProcessors = [];
|
|
4248
5061
|
if (config.localExport) {
|
|
@@ -4439,6 +5252,15 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
4439
5252
|
}
|
|
4440
5253
|
//#endregion
|
|
4441
5254
|
//#region src/evaluator.ts
|
|
5255
|
+
const CONVERSATION_VAR_NAME = "_conversation";
|
|
5256
|
+
const promptUsesConversationVariableCache = new lru_cache.LRUCache({ max: 1024 });
|
|
5257
|
+
function promptUsesConversationVariable(prompt) {
|
|
5258
|
+
const cached = promptUsesConversationVariableCache.get(prompt.raw);
|
|
5259
|
+
if (cached !== void 0) return cached;
|
|
5260
|
+
const { referenced, parsed } = require_render.analyzeTemplateReference(prompt.raw, CONVERSATION_VAR_NAME);
|
|
5261
|
+
if (parsed) promptUsesConversationVariableCache.set(prompt.raw, referenced);
|
|
5262
|
+
return referenced;
|
|
5263
|
+
}
|
|
4442
5264
|
/**
|
|
4443
5265
|
* Manages a single progress bar for the evaluation
|
|
4444
5266
|
*/
|
|
@@ -4638,6 +5460,18 @@ function hasProviderGroupedAssertion(assertion) {
|
|
|
4638
5460
|
function shouldDeferGradingForTest(test) {
|
|
4639
5461
|
return Boolean(test.assert?.some(hasProviderGroupedAssertion));
|
|
4640
5462
|
}
|
|
5463
|
+
function logGroupedGradingStatus({ concurrency, hasEvalStepTimeout, runEvalOptions, shouldGroupGradingByProvider, usesConversationVar }) {
|
|
5464
|
+
if (!runEvalOptions.some(({ test }) => shouldDeferGradingForTest(test))) return;
|
|
5465
|
+
if (shouldGroupGradingByProvider) {
|
|
5466
|
+
require_logger.logger.info("Grouping model-graded assertions by provider to minimize local-model reload overhead.");
|
|
5467
|
+
return;
|
|
5468
|
+
}
|
|
5469
|
+
if (concurrency !== 1) return;
|
|
5470
|
+
const reasons = [];
|
|
5471
|
+
if (hasEvalStepTimeout) reasons.push("per-eval-step timeout is configured");
|
|
5472
|
+
if (usesConversationVar) reasons.push("conversation variables require per-row ordering");
|
|
5473
|
+
if (reasons.length > 0) require_logger.logger.info(`Serial grading grouping disabled because ${reasons.join(" and ")}; model-graded judges may reload between rows.`);
|
|
5474
|
+
}
|
|
4641
5475
|
function applyGradingResult(row, checkResult) {
|
|
4642
5476
|
if (!checkResult.pass) {
|
|
4643
5477
|
row.error = checkResult.reason;
|
|
@@ -4652,14 +5486,29 @@ function applyGradingResult(row, checkResult) {
|
|
|
4652
5486
|
if (checkResult.tokensUsed) require_tokenUsageUtils.accumulateAssertionTokenUsage(row.tokenUsage.assertions, checkResult.tokensUsed);
|
|
4653
5487
|
row.gradingResult = checkResult;
|
|
4654
5488
|
}
|
|
4655
|
-
|
|
4656
|
-
|
|
4657
|
-
|
|
4658
|
-
|
|
4659
|
-
|
|
4660
|
-
|
|
4661
|
-
|
|
4662
|
-
|
|
5489
|
+
const ABORTED_GRADING_PREFIX = "Aborted: ";
|
|
5490
|
+
function isAbortShapedError(error) {
|
|
5491
|
+
return error instanceof Error && (error.name === "AbortError" || error.name === "AbortException");
|
|
5492
|
+
}
|
|
5493
|
+
function applyGradingError(row, error, abortSignal) {
|
|
5494
|
+
const errorAsError = error instanceof Error ? error : void 0;
|
|
5495
|
+
if (Boolean(abortSignal?.aborted) && isAbortShapedError(error)) {
|
|
5496
|
+
const shortMessage = errorAsError?.message ?? String(error);
|
|
5497
|
+
require_logger.logger.debug("Assertion grading aborted", {
|
|
5498
|
+
error: shortMessage,
|
|
5499
|
+
promptIdx: row.promptIdx,
|
|
5500
|
+
testIdx: row.testIdx
|
|
5501
|
+
});
|
|
5502
|
+
row.error = `${ABORTED_GRADING_PREFIX}${shortMessage}`;
|
|
5503
|
+
} else {
|
|
5504
|
+
const fullMessage = errorAsError ? errorAsError.stack ?? errorAsError.message : String(error);
|
|
5505
|
+
require_logger.logger.error("Assertion grading failed during eval", {
|
|
5506
|
+
error: fullMessage,
|
|
5507
|
+
promptIdx: row.promptIdx,
|
|
5508
|
+
testIdx: row.testIdx
|
|
5509
|
+
});
|
|
5510
|
+
row.error = fullMessage;
|
|
5511
|
+
}
|
|
4663
5512
|
row.failureReason = require_types.ResultFailureReason.ERROR;
|
|
4664
5513
|
row.success = false;
|
|
4665
5514
|
row.score = 0;
|
|
@@ -4671,7 +5520,7 @@ function getNonTransientTargetStatus(row) {
|
|
|
4671
5520
|
}
|
|
4672
5521
|
function createRunEvalState({ provider, prompt, test }) {
|
|
4673
5522
|
const vars = structuredClone(test.vars || {});
|
|
4674
|
-
const fileMetadata =
|
|
5523
|
+
const fileMetadata = require_evaluatorHelpers.collectFileMetadata(vars);
|
|
4675
5524
|
const conversationKey = `${provider.label || provider.id()}:${prompt.id}${test.metadata?.conversationId ? `:${test.metadata.conversationId}` : ""}`;
|
|
4676
5525
|
const setup = createRunEvalSetup({
|
|
4677
5526
|
provider,
|
|
@@ -4691,7 +5540,7 @@ function createRunEvalState({ provider, prompt, test }) {
|
|
|
4691
5540
|
};
|
|
4692
5541
|
}
|
|
4693
5542
|
function attachConversationVar({ conversations, conversationKey, prompt, test, vars }) {
|
|
4694
|
-
const usesConversation = prompt
|
|
5543
|
+
const usesConversation = promptUsesConversationVariable(prompt);
|
|
4695
5544
|
if (!require_logger.getEnvBool("PROMPTFOO_DISABLE_CONVERSATION_VAR") && !test.options?.disableConversationVar && usesConversation) vars._conversation = conversations?.[conversationKey] || [];
|
|
4696
5545
|
}
|
|
4697
5546
|
function createRunEvalSetup({ provider, prompt, promptConfig, vars }) {
|
|
@@ -4710,8 +5559,8 @@ function createRunEvalSetup({ provider, prompt, promptConfig, vars }) {
|
|
|
4710
5559
|
};
|
|
4711
5560
|
}
|
|
4712
5561
|
async function renderRunEvalPrompt({ filters, isRedteam, provider, promptForRender, test, testSuite, vars }) {
|
|
4713
|
-
const renderedPrompt = await
|
|
4714
|
-
if (isRedteam)
|
|
5562
|
+
const renderedPrompt = await require_evaluatorHelpers.renderPrompt(promptForRender, vars, filters, provider, shouldSkipRedteamInjectVar(test, testSuite, isRedteam) ? [getRedteamInjectVar(test, promptForRender, testSuite)] : void 0);
|
|
5563
|
+
if (isRedteam) require_promptLength.throwIfTargetPromptExceedsMaxChars(renderedPrompt, testSuite?.redteam?.maxCharsPerMessage);
|
|
4715
5564
|
const setup = createRunEvalSetup({
|
|
4716
5565
|
provider,
|
|
4717
5566
|
prompt: promptForRender,
|
|
@@ -4778,7 +5627,7 @@ async function callActiveProvider({ abortSignal, evalId, filters, promptForRende
|
|
|
4778
5627
|
});
|
|
4779
5628
|
const callApiOptions = abortSignal ? { abortSignal } : void 0;
|
|
4780
5629
|
const callApi = () => activeProvider.callApi(renderedPrompt, callApiContext, callApiOptions);
|
|
4781
|
-
const response = rateLimitRegistry ? await rateLimitRegistry.execute(activeProvider, callApi,
|
|
5630
|
+
const response = rateLimitRegistry ? await rateLimitRegistry.execute(activeProvider, callApi, require_shared.createProviderRateLimitOptions()) : await callApi();
|
|
4782
5631
|
require_logger.logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
4783
5632
|
require_logger.logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
4784
5633
|
return response;
|
|
@@ -4856,7 +5705,7 @@ function createEvaluateResult({ fileMetadata, latencyMs, prompt, promptIdx, rend
|
|
|
4856
5705
|
};
|
|
4857
5706
|
if (!ret.metadata?.sessionIds && !ret.metadata?.sessionId) {
|
|
4858
5707
|
ret.metadata ??= {};
|
|
4859
|
-
ret.metadata.sessionId =
|
|
5708
|
+
ret.metadata.sessionId = require_util$2.getSessionId(response, { vars });
|
|
4860
5709
|
}
|
|
4861
5710
|
return ret;
|
|
4862
5711
|
}
|
|
@@ -4864,7 +5713,7 @@ function trackProviderUsage(provider, response) {
|
|
|
4864
5713
|
if (!response.tokenUsage) return;
|
|
4865
5714
|
const providerId = provider.id();
|
|
4866
5715
|
const trackingId = provider.constructor?.name ? `${providerId} (${provider.constructor.name})` : providerId;
|
|
4867
|
-
|
|
5716
|
+
require_shared.TokenUsageTracker.getInstance().trackUsage(trackingId, response.tokenUsage);
|
|
4868
5717
|
}
|
|
4869
5718
|
async function applyRunEvalResponseOutcome({ abortSignal, deferGrading, evalId, isRedteam, latencyMs, prompt, promptIdx, provider, providerCallQueue, rateLimitRegistry, renderedPrompt, response, ret, test, testIdx, traceContext, vars }) {
|
|
4870
5719
|
if (response.error) {
|
|
@@ -4938,7 +5787,7 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
|
|
|
4938
5787
|
assertScoringFunction: test.assertScoringFunction,
|
|
4939
5788
|
traceId
|
|
4940
5789
|
}).then((checkResult) => applyGradingResult(ret, checkResult))).catch((error) => {
|
|
4941
|
-
applyGradingError(ret, error);
|
|
5790
|
+
applyGradingError(ret, error, abortSignal);
|
|
4942
5791
|
});
|
|
4943
5792
|
deferredGradingPromises.set(ret, gradingPromise);
|
|
4944
5793
|
return;
|
|
@@ -4960,13 +5809,13 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
|
|
|
4960
5809
|
}
|
|
4961
5810
|
async function transformRunEvalResponse({ evalId, prompt, promptIdx, provider, response, test, testIdx, vars }) {
|
|
4962
5811
|
const processedResponse = { ...response };
|
|
4963
|
-
if (provider.transform) processedResponse.output = await require_transform.transform(provider.transform, processedResponse.output, {
|
|
5812
|
+
if (provider.transform) processedResponse.output = await require_transform$1.transform(provider.transform, processedResponse.output, {
|
|
4964
5813
|
vars,
|
|
4965
5814
|
prompt
|
|
4966
5815
|
});
|
|
4967
5816
|
const providerTransformedOutput = processedResponse.output;
|
|
4968
5817
|
const testTransform = test.options?.transform || test.options?.postprocess;
|
|
4969
|
-
if (testTransform) processedResponse.output = await require_transform.transform(testTransform, processedResponse.output, {
|
|
5818
|
+
if (testTransform) processedResponse.output = await require_transform$1.transform(testTransform, processedResponse.output, {
|
|
4970
5819
|
vars,
|
|
4971
5820
|
prompt,
|
|
4972
5821
|
...response && response.metadata && { metadata: response.metadata }
|
|
@@ -5418,10 +6267,10 @@ async function prepareTestVariables(tests, testSuite) {
|
|
|
5418
6267
|
async function applyInputTransform(testCase, inputTransformDefault) {
|
|
5419
6268
|
const inputTransform = testCase.options?.transformVars || inputTransformDefault;
|
|
5420
6269
|
if (!inputTransform) return;
|
|
5421
|
-
const transformedVars = await require_transform.transform(inputTransform, testCase.vars, {
|
|
6270
|
+
const transformedVars = await require_transform$1.transform(inputTransform, testCase.vars, {
|
|
5422
6271
|
prompt: {},
|
|
5423
6272
|
uuid: crypto.randomUUID()
|
|
5424
|
-
}, true, require_transform.TransformInputType.VARS);
|
|
6273
|
+
}, true, require_transform$1.TransformInputType.VARS);
|
|
5425
6274
|
require_invariant.invariant(typeof transformedVars === "object", "Transform function did not return a valid object");
|
|
5426
6275
|
testCase.vars = {
|
|
5427
6276
|
...testCase.vars,
|
|
@@ -5485,7 +6334,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
|
|
|
5485
6334
|
const defaultProvider = defaultTest.provider;
|
|
5486
6335
|
if (require_types.isApiProvider(defaultProvider)) return defaultProvider;
|
|
5487
6336
|
if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
5488
|
-
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-
|
|
6337
|
+
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-BDVVIQM6.cjs")).then((n) => n.providers_exports);
|
|
5489
6338
|
return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
5490
6339
|
}
|
|
5491
6340
|
return defaultProvider;
|
|
@@ -5645,7 +6494,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
|
|
|
5645
6494
|
async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
5646
6495
|
if (!require_logger.state.resume || !evalRecord.persisted) return;
|
|
5647
6496
|
try {
|
|
5648
|
-
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-
|
|
6497
|
+
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-CvtS8h8u.cjs")).then((n) => n.evalResult_exports);
|
|
5649
6498
|
const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: require_logger.state.retryMode });
|
|
5650
6499
|
const originalCount = runEvalOptions.length;
|
|
5651
6500
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -5659,14 +6508,14 @@ async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
|
5659
6508
|
}
|
|
5660
6509
|
}
|
|
5661
6510
|
function adjustConcurrencyForSerialFeatures({ concurrency, prompts, tests }) {
|
|
5662
|
-
const usesConversationVar = prompts.some(
|
|
6511
|
+
const usesConversationVar = prompts.some(promptUsesConversationVariable);
|
|
5663
6512
|
if (concurrency <= 1) return {
|
|
5664
6513
|
concurrency,
|
|
5665
6514
|
usesConversationVar
|
|
5666
6515
|
};
|
|
5667
6516
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
5668
6517
|
if (usesConversationVar) {
|
|
5669
|
-
require_logger.logger.info(`Setting concurrency to 1 because the ${chalk.default.cyan(
|
|
6518
|
+
require_logger.logger.info(`Setting concurrency to 1 because the ${chalk.default.cyan(CONVERSATION_VAR_NAME)} variable is used.`);
|
|
5670
6519
|
return {
|
|
5671
6520
|
concurrency: 1,
|
|
5672
6521
|
usesConversationVar
|
|
@@ -5896,8 +6745,9 @@ var Evaluator = class {
|
|
|
5896
6745
|
};
|
|
5897
6746
|
this.conversations = {};
|
|
5898
6747
|
this.registers = {};
|
|
5899
|
-
|
|
5900
|
-
this.
|
|
6748
|
+
const jsonlFiles = Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : [];
|
|
6749
|
+
this.fileWriters = jsonlFiles.map((p) => new JsonlFileWriter(p));
|
|
6750
|
+
this.rateLimitRegistry = require_shared.createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
5901
6751
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
5902
6752
|
require_logger.logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
5903
6753
|
retryAfterMs: data.retryAfterMs,
|
|
@@ -5923,7 +6773,7 @@ var Evaluator = class {
|
|
|
5923
6773
|
current: data.current
|
|
5924
6774
|
});
|
|
5925
6775
|
});
|
|
5926
|
-
|
|
6776
|
+
require_shared.redteamProviderManager.setRateLimitRegistry(this.rateLimitRegistry);
|
|
5927
6777
|
}
|
|
5928
6778
|
/**
|
|
5929
6779
|
* Updates metrics and stats after a comparison assertion (select-best or max-score).
|
|
@@ -6001,7 +6851,7 @@ var Evaluator = class {
|
|
|
6001
6851
|
});
|
|
6002
6852
|
}
|
|
6003
6853
|
async runEvalStepAfterBeforeEach(evalStep, { deferGrading, onRowsReady, providerCallQueue, testSuite }) {
|
|
6004
|
-
evalStep.test = (await
|
|
6854
|
+
evalStep.test = (await require_evaluatorHelpers.runExtensionHook(testSuite.extensions, "beforeEach", { test: evalStep.test })).test;
|
|
6005
6855
|
const rows = await runEvalInternal({
|
|
6006
6856
|
...evalStep,
|
|
6007
6857
|
deferGrading,
|
|
@@ -6016,6 +6866,25 @@ var Evaluator = class {
|
|
|
6016
6866
|
this.trackCompletedRow(evalStep, row, context);
|
|
6017
6867
|
context.numComplete++;
|
|
6018
6868
|
const promptEvalCount = reservePromptEvalCount(context, row.promptIdx);
|
|
6869
|
+
if (context.testSuite.extensions?.length) try {
|
|
6870
|
+
const afterEachOut = await require_evaluatorHelpers.runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
6871
|
+
test: evalStep.test,
|
|
6872
|
+
result: {
|
|
6873
|
+
...row,
|
|
6874
|
+
namedScores: { ...row.namedScores },
|
|
6875
|
+
metadata: { ...row.metadata },
|
|
6876
|
+
response: row.response ? {
|
|
6877
|
+
...row.response,
|
|
6878
|
+
metadata: { ...row.response.metadata }
|
|
6879
|
+
} : row.response
|
|
6880
|
+
}
|
|
6881
|
+
});
|
|
6882
|
+
row.namedScores = require_evaluatorHelpers.filterFiniteScores(afterEachOut.result.namedScores);
|
|
6883
|
+
row.metadata = afterEachOut.result.metadata;
|
|
6884
|
+
if (row.response && afterEachOut.result.response) row.response.metadata = afterEachOut.result.response.metadata;
|
|
6885
|
+
} catch (error) {
|
|
6886
|
+
require_logger.logger.error(`afterEach extension hook failed, persisting row without hook modifications`, { error });
|
|
6887
|
+
}
|
|
6019
6888
|
await this.persistEvalRow(row);
|
|
6020
6889
|
if (this.abortIfTargetUnavailable(row, context)) break;
|
|
6021
6890
|
const metrics = context.prompts[row.promptIdx].metrics;
|
|
@@ -6027,10 +6896,6 @@ var Evaluator = class {
|
|
|
6027
6896
|
promptEvalCount,
|
|
6028
6897
|
row
|
|
6029
6898
|
});
|
|
6030
|
-
await require_providers.runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
6031
|
-
test: evalStep.test,
|
|
6032
|
-
result: row
|
|
6033
|
-
});
|
|
6034
6899
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics);
|
|
6035
6900
|
}
|
|
6036
6901
|
}
|
|
@@ -6104,9 +6969,8 @@ var Evaluator = class {
|
|
|
6104
6969
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
|
|
6105
6970
|
}
|
|
6106
6971
|
async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
|
|
6107
|
-
let flushGroupedRows;
|
|
6108
6972
|
try {
|
|
6109
|
-
if (shouldGroupGradingByProvider)
|
|
6973
|
+
if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
|
|
6110
6974
|
checkAbort,
|
|
6111
6975
|
evalStepIndexMap,
|
|
6112
6976
|
groupedRunEvalOptions,
|
|
@@ -6138,7 +7002,6 @@ var Evaluator = class {
|
|
|
6138
7002
|
cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
|
|
6139
7003
|
throw err;
|
|
6140
7004
|
}
|
|
6141
|
-
await flushGroupedRows?.();
|
|
6142
7005
|
if (isEvalTimedOut()) require_logger.logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
6143
7006
|
else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
|
|
6144
7007
|
ciProgressReporter,
|
|
@@ -6187,7 +7050,15 @@ var Evaluator = class {
|
|
|
6187
7050
|
})) break;
|
|
6188
7051
|
}
|
|
6189
7052
|
} catch (error) {
|
|
6190
|
-
|
|
7053
|
+
const pendingRowCount = groupedRows.reduce((sum, entry) => sum + entry.rows.length, 0);
|
|
7054
|
+
try {
|
|
7055
|
+
await flushGroupedRows();
|
|
7056
|
+
} catch (flushError) {
|
|
7057
|
+
require_logger.logger.warn("Failed to flush grouped rows after error; target outputs may be lost", {
|
|
7058
|
+
error: flushError instanceof Error ? flushError.message : String(flushError),
|
|
7059
|
+
pendingRowCount
|
|
7060
|
+
});
|
|
7061
|
+
}
|
|
6191
7062
|
throw error;
|
|
6192
7063
|
}
|
|
6193
7064
|
await flushGroupedRows();
|
|
@@ -6363,7 +7234,7 @@ var Evaluator = class {
|
|
|
6363
7234
|
}
|
|
6364
7235
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
6365
7236
|
if (!maxScoreAssertion) return;
|
|
6366
|
-
const maxScoreGradingResults = await
|
|
7237
|
+
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
6367
7238
|
updateComparisonReporterProgress({
|
|
6368
7239
|
ciProgressReporter,
|
|
6369
7240
|
compareCount,
|
|
@@ -6457,7 +7328,7 @@ var Evaluator = class {
|
|
|
6457
7328
|
async runAfterAllExtensions(testSuite) {
|
|
6458
7329
|
if (!testSuite.extensions?.length) return;
|
|
6459
7330
|
const resultsForExtension = (await this.evalRecord.getResults()).map((result) => "toEvaluateResult" in result ? result.toEvaluateResult() : result);
|
|
6460
|
-
await
|
|
7331
|
+
await require_evaluatorHelpers.runExtensionHook(testSuite.extensions, "afterAll", {
|
|
6461
7332
|
prompts: this.evalRecord.prompts,
|
|
6462
7333
|
results: resultsForExtension,
|
|
6463
7334
|
suite: testSuite,
|
|
@@ -6543,7 +7414,7 @@ var Evaluator = class {
|
|
|
6543
7414
|
const rowsWithSelectBestAssertion = /* @__PURE__ */ new Set();
|
|
6544
7415
|
const rowsWithMaxScoreAssertion = /* @__PURE__ */ new Set();
|
|
6545
7416
|
ensureDefaultTestForExtensions(testSuite);
|
|
6546
|
-
testSuite = (await
|
|
7417
|
+
testSuite = (await require_evaluatorHelpers.runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
6547
7418
|
if (!await maybeAddGeneratedPrompts(testSuite, options)) return this.evalRecord;
|
|
6548
7419
|
prompts.push(...buildCompletedPrompts(testSuite, this.evalRecord));
|
|
6549
7420
|
const promptIndexMap = buildPromptIndexMap(prompts);
|
|
@@ -6623,6 +7494,13 @@ var Evaluator = class {
|
|
|
6623
7494
|
if (!this.options.silent) {
|
|
6624
7495
|
if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
6625
7496
|
if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
7497
|
+
logGroupedGradingStatus({
|
|
7498
|
+
concurrency,
|
|
7499
|
+
hasEvalStepTimeout,
|
|
7500
|
+
runEvalOptions,
|
|
7501
|
+
shouldGroupGradingByProvider,
|
|
7502
|
+
usesConversationVar
|
|
7503
|
+
});
|
|
6626
7504
|
}
|
|
6627
7505
|
if (this.options.showProgressBar && progressBarManager) {
|
|
6628
7506
|
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
@@ -6714,7 +7592,7 @@ var Evaluator = class {
|
|
|
6714
7592
|
});
|
|
6715
7593
|
}
|
|
6716
7594
|
this.rateLimitRegistry?.dispose();
|
|
6717
|
-
|
|
7595
|
+
require_shared.redteamProviderManager.setRateLimitRegistry(void 0);
|
|
6718
7596
|
require_logger.state.maxConcurrency = void 0;
|
|
6719
7597
|
}
|
|
6720
7598
|
}
|
|
@@ -7622,7 +8500,7 @@ var Eval = class Eval {
|
|
|
7622
8500
|
const evalInstance = new Eval(eval_.config, {
|
|
7623
8501
|
id: eval_.id,
|
|
7624
8502
|
createdAt: new Date(eval_.createdAt),
|
|
7625
|
-
author: eval_.author
|
|
8503
|
+
author: eval_.author,
|
|
7626
8504
|
description: eval_.description || void 0,
|
|
7627
8505
|
prompts: eval_.prompts || [],
|
|
7628
8506
|
datasetId,
|
|
@@ -7645,7 +8523,7 @@ var Eval = class Eval {
|
|
|
7645
8523
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).limit(limit).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
7646
8524
|
id: e.id,
|
|
7647
8525
|
createdAt: new Date(e.createdAt),
|
|
7648
|
-
author: e.author
|
|
8526
|
+
author: e.author,
|
|
7649
8527
|
description: e.description || void 0,
|
|
7650
8528
|
prompts: e.prompts || [],
|
|
7651
8529
|
persisted: true
|
|
@@ -7660,7 +8538,7 @@ var Eval = class Eval {
|
|
|
7660
8538
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
7661
8539
|
id: e.id,
|
|
7662
8540
|
createdAt: new Date(e.createdAt),
|
|
7663
|
-
author: e.author
|
|
8541
|
+
author: e.author,
|
|
7664
8542
|
description: e.description || void 0,
|
|
7665
8543
|
prompts: e.prompts || [],
|
|
7666
8544
|
persisted: true
|
|
@@ -7675,7 +8553,7 @@ var Eval = class Eval {
|
|
|
7675
8553
|
static async create(config, renderedPrompts, opts) {
|
|
7676
8554
|
const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
|
|
7677
8555
|
const evalId = opts?.id || createEvalId(createdAt);
|
|
7678
|
-
const author = opts
|
|
8556
|
+
const author = opts && "author" in opts ? opts.author ?? null : require_accounts.getAuthor();
|
|
7679
8557
|
const db = require_tables.getDb();
|
|
7680
8558
|
const datasetId = require_createHash.sha256(JSON.stringify(config.tests || []));
|
|
7681
8559
|
db.transaction(() => {
|
|
@@ -7737,7 +8615,7 @@ var Eval = class Eval {
|
|
|
7737
8615
|
});
|
|
7738
8616
|
return new Eval(config, {
|
|
7739
8617
|
id: evalId,
|
|
7740
|
-
author
|
|
8618
|
+
author,
|
|
7741
8619
|
createdAt,
|
|
7742
8620
|
persisted: true,
|
|
7743
8621
|
runtimeOptions: sanitizeRuntimeOptions(opts?.runtimeOptions)
|
|
@@ -7747,7 +8625,7 @@ var Eval = class Eval {
|
|
|
7747
8625
|
const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
|
|
7748
8626
|
this.createdAt = createdAt.getTime();
|
|
7749
8627
|
this.id = opts?.id || createEvalId(createdAt);
|
|
7750
|
-
this.author = opts?.author;
|
|
8628
|
+
this.author = opts?.author ?? null;
|
|
7751
8629
|
this.config = config;
|
|
7752
8630
|
this.results = [];
|
|
7753
8631
|
this.prompts = opts?.prompts || [];
|
|
@@ -8277,7 +9155,7 @@ var Eval = class Eval {
|
|
|
8277
9155
|
newConfig.description = copyDescription;
|
|
8278
9156
|
const newPrompts = structuredClone(this.prompts);
|
|
8279
9157
|
const newVars = this.vars ? structuredClone(this.vars) : [];
|
|
8280
|
-
const author = require_accounts.
|
|
9158
|
+
const author = require_accounts.getAuthor();
|
|
8281
9159
|
const db = require_tables.getDb();
|
|
8282
9160
|
let copiedCount = 0;
|
|
8283
9161
|
db.transaction(() => {
|
|
@@ -8434,47 +9312,11 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
8434
9312
|
//#endregion
|
|
8435
9313
|
//#region src/commands/eval/filterProviders.ts
|
|
8436
9314
|
/**
|
|
8437
|
-
* Checks if a value is a valid provider ID (non-empty string).
|
|
8438
|
-
*/
|
|
8439
|
-
function isValidProviderId(id) {
|
|
8440
|
-
return id !== null && id !== void 0 && typeof id === "string" && id !== "";
|
|
8441
|
-
}
|
|
8442
|
-
/**
|
|
8443
9315
|
* Extracts the id and label from a raw provider config without instantiating it.
|
|
8444
9316
|
* Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
|
|
8445
9317
|
*/
|
|
8446
9318
|
function getProviderIdAndLabel(provider, index) {
|
|
8447
|
-
|
|
8448
|
-
if (typeof provider === "function") {
|
|
8449
|
-
const label = provider.label;
|
|
8450
|
-
return {
|
|
8451
|
-
id: label ?? `custom-function-${index}`,
|
|
8452
|
-
label
|
|
8453
|
-
};
|
|
8454
|
-
}
|
|
8455
|
-
const providerId = provider.id;
|
|
8456
|
-
if ("id" in provider && isValidProviderId(providerId)) return {
|
|
8457
|
-
id: providerId,
|
|
8458
|
-
label: provider.label
|
|
8459
|
-
};
|
|
8460
|
-
const keys = Object.keys(provider);
|
|
8461
|
-
if (keys.length > 0) {
|
|
8462
|
-
const id = keys[0];
|
|
8463
|
-
const value = provider[id];
|
|
8464
|
-
if (typeof value === "object" && value !== null) return {
|
|
8465
|
-
id: value.id || id,
|
|
8466
|
-
label: value.label
|
|
8467
|
-
};
|
|
8468
|
-
}
|
|
8469
|
-
const label = provider.label;
|
|
8470
|
-
if (isValidProviderId(label)) return {
|
|
8471
|
-
id: label,
|
|
8472
|
-
label
|
|
8473
|
-
};
|
|
8474
|
-
return {
|
|
8475
|
-
id: `unknown-${index}`,
|
|
8476
|
-
label
|
|
8477
|
-
};
|
|
9319
|
+
return require_util.normalizeProviderRef(provider, { index });
|
|
8478
9320
|
}
|
|
8479
9321
|
/**
|
|
8480
9322
|
* Filters raw provider configs BEFORE instantiation.
|
|
@@ -10227,10 +11069,10 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
10227
11069
|
const body = {
|
|
10228
11070
|
task,
|
|
10229
11071
|
prompts,
|
|
10230
|
-
version:
|
|
11072
|
+
version: require_version.VERSION,
|
|
10231
11073
|
email: require_accounts.getUserEmail()
|
|
10232
11074
|
};
|
|
10233
|
-
const response = await require_cache.fetchWithCache(
|
|
11075
|
+
const response = await require_cache.fetchWithCache(require_remoteGeneration.getRemoteGenerationUrl(), {
|
|
10234
11076
|
method: "POST",
|
|
10235
11077
|
headers: { "Content-Type": "application/json" },
|
|
10236
11078
|
body: JSON.stringify(body)
|
|
@@ -10265,7 +11107,7 @@ function formatPrompts(prompts) {
|
|
|
10265
11107
|
//#endregion
|
|
10266
11108
|
//#region src/redteam/extraction/entities.ts
|
|
10267
11109
|
async function extractEntities(provider, prompts) {
|
|
10268
|
-
if (
|
|
11110
|
+
if (require_remoteGeneration.shouldGenerateRemote()) try {
|
|
10269
11111
|
return await fetchRemoteGeneration("entities", prompts);
|
|
10270
11112
|
} catch (error) {
|
|
10271
11113
|
require_logger.logger.warn(`[Entity Extraction] Failed, returning 0 entities. Error using remote generation: ${error}`);
|
|
@@ -10312,7 +11154,7 @@ async function extractSystemPurpose(provider, prompts) {
|
|
|
10312
11154
|
require_logger.logger.debug("[purpose] No meaningful prompts provided, returning default purpose");
|
|
10313
11155
|
return DEFAULT_PURPOSE;
|
|
10314
11156
|
}
|
|
10315
|
-
if (!
|
|
11157
|
+
if (!require_remoteGeneration.neverGenerateRemote()) try {
|
|
10316
11158
|
return await fetchRemoteGeneration("purpose", prompts);
|
|
10317
11159
|
} catch (error) {
|
|
10318
11160
|
require_logger.logger.warn(`[purpose] Error using remote generation, returning empty string: ${error}`);
|
|
@@ -10584,7 +11426,7 @@ function createTestCase(injectVar, output, harmCategory) {
|
|
|
10584
11426
|
vars: { [injectVar]: output.trim() },
|
|
10585
11427
|
metadata: {
|
|
10586
11428
|
harmCategory: harmCategoryLabel,
|
|
10587
|
-
pluginId:
|
|
11429
|
+
pluginId: require_util$2.getShortPluginId(harmCategory)
|
|
10588
11430
|
},
|
|
10589
11431
|
assert: getHarmfulAssertions(harmCategory)
|
|
10590
11432
|
};
|
|
@@ -10707,25 +11549,45 @@ var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
10707
11549
|
getAssertions(_prompt) {
|
|
10708
11550
|
return getHarmfulAssertions(this.harmCategory);
|
|
10709
11551
|
}
|
|
10710
|
-
promptsToTestCases(prompts) {
|
|
11552
|
+
async promptsToTestCases(prompts) {
|
|
10711
11553
|
const hasMultipleInputs = this.config.inputs && Object.keys(this.config.inputs).length > 0;
|
|
10712
11554
|
const harmCategoryLabel = require_types.HARM_PLUGINS[this.harmCategory] || this.harmCategory;
|
|
10713
|
-
|
|
11555
|
+
const pluginId = require_util$2.getShortPluginId(this.harmCategory);
|
|
11556
|
+
return Promise.all([...prompts].sort((a, b) => a.__prompt.localeCompare(b.__prompt)).map(async ({ __prompt }, materializationIndex) => {
|
|
10714
11557
|
const vars = { [this.injectVar]: __prompt };
|
|
10715
|
-
|
|
10716
|
-
|
|
10717
|
-
|
|
10718
|
-
|
|
11558
|
+
let inputMaterialization;
|
|
11559
|
+
if (hasMultipleInputs) {
|
|
11560
|
+
let parsed;
|
|
11561
|
+
try {
|
|
11562
|
+
parsed = JSON.parse(__prompt);
|
|
11563
|
+
} catch (error) {
|
|
11564
|
+
require_logger.logger.debug("[AlignedHarmful] Could not parse prompt as JSON for multi-input mode", { error });
|
|
11565
|
+
}
|
|
11566
|
+
if (parsed) try {
|
|
11567
|
+
const materializedVars = await require_util$2.extractMaterializedVariablesFromJsonWithMetadata(parsed, this.config.inputs, {
|
|
11568
|
+
materializationIndex,
|
|
11569
|
+
pluginId,
|
|
11570
|
+
provider: this.provider,
|
|
11571
|
+
purpose: this.purpose
|
|
11572
|
+
});
|
|
11573
|
+
Object.assign(vars, materializedVars.vars);
|
|
11574
|
+
inputMaterialization = materializedVars.metadata;
|
|
11575
|
+
} catch (error) {
|
|
11576
|
+
require_logger.logger.debug("[AlignedHarmful] Failed to materialize prompt inputs", { error });
|
|
11577
|
+
throw error;
|
|
11578
|
+
}
|
|
11579
|
+
}
|
|
10719
11580
|
return {
|
|
10720
11581
|
vars,
|
|
10721
11582
|
metadata: {
|
|
10722
11583
|
harmCategory: harmCategoryLabel,
|
|
10723
|
-
pluginId
|
|
10724
|
-
pluginConfig: this.config
|
|
11584
|
+
pluginId,
|
|
11585
|
+
pluginConfig: this.config,
|
|
11586
|
+
...inputMaterialization ? { inputMaterialization } : {}
|
|
10725
11587
|
},
|
|
10726
11588
|
assert: getHarmfulAssertions(this.harmCategory)
|
|
10727
11589
|
};
|
|
10728
|
-
});
|
|
11590
|
+
}));
|
|
10729
11591
|
}
|
|
10730
11592
|
};
|
|
10731
11593
|
//#endregion
|
|
@@ -10734,25 +11596,42 @@ var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
10734
11596
|
* Extract content from <Prompt> tags and parse JSON if inputs are defined.
|
|
10735
11597
|
* Returns the processed prompt and any additional vars extracted from JSON.
|
|
10736
11598
|
*/
|
|
10737
|
-
function processPromptForInputs(prompt,
|
|
11599
|
+
async function processPromptForInputs(prompt, inputs, plugin, provider, purpose, materializationIndex) {
|
|
10738
11600
|
let processedPrompt = prompt.trim();
|
|
10739
11601
|
const additionalVars = {};
|
|
10740
|
-
|
|
11602
|
+
let additionalMetadata;
|
|
11603
|
+
const extractedPrompt = require_util$2.extractPromptFromTags(processedPrompt);
|
|
10741
11604
|
if (extractedPrompt) processedPrompt = extractedPrompt;
|
|
10742
|
-
if (inputs && Object.keys(inputs).length > 0)
|
|
10743
|
-
|
|
10744
|
-
|
|
10745
|
-
|
|
10746
|
-
|
|
11605
|
+
if (inputs && Object.keys(inputs).length > 0) {
|
|
11606
|
+
let parsed;
|
|
11607
|
+
try {
|
|
11608
|
+
parsed = JSON.parse(processedPrompt);
|
|
11609
|
+
} catch (error) {
|
|
11610
|
+
require_logger.logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode", { error });
|
|
11611
|
+
}
|
|
11612
|
+
if (parsed) try {
|
|
11613
|
+
const materializedVars = await require_util$2.extractMaterializedVariablesFromJsonWithMetadata(parsed, inputs, {
|
|
11614
|
+
materializationIndex,
|
|
11615
|
+
pluginId: plugin,
|
|
11616
|
+
provider,
|
|
11617
|
+
purpose
|
|
11618
|
+
});
|
|
11619
|
+
Object.assign(additionalVars, materializedVars.vars);
|
|
11620
|
+
additionalMetadata = materializedVars.metadata;
|
|
11621
|
+
} catch (error) {
|
|
11622
|
+
require_logger.logger.debug("[Harmful] Failed to materialize prompt inputs", { error });
|
|
11623
|
+
throw error;
|
|
11624
|
+
}
|
|
10747
11625
|
}
|
|
10748
11626
|
return {
|
|
10749
11627
|
processedPrompt,
|
|
10750
|
-
additionalVars
|
|
11628
|
+
additionalVars,
|
|
11629
|
+
additionalMetadata
|
|
10751
11630
|
};
|
|
10752
11631
|
}
|
|
10753
11632
|
async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, plugin) {
|
|
10754
11633
|
const maxHarmfulTests = require_logger.getEnvInt("PROMPTFOO_MAX_HARMFUL_TESTS_PER_REQUEST", 5);
|
|
10755
|
-
const unalignedProvider = new
|
|
11634
|
+
const unalignedProvider = new require_promptfoo.PromptfooHarmfulCompletionProvider({
|
|
10756
11635
|
purpose,
|
|
10757
11636
|
n: Math.min(n, maxHarmfulTests),
|
|
10758
11637
|
harmCategory: plugin,
|
|
@@ -10768,15 +11647,19 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
10768
11647
|
};
|
|
10769
11648
|
const allPrompts = await require_graders.retryWithDeduplication(generatePrompts, n);
|
|
10770
11649
|
const inputs = config?.inputs;
|
|
10771
|
-
return require_graders.sampleArray(allPrompts, n).map((prompt) => {
|
|
10772
|
-
const { processedPrompt, additionalVars } = processPromptForInputs(prompt,
|
|
11650
|
+
return Promise.all(require_graders.sampleArray(allPrompts, n).map(async (prompt, materializationIndex) => {
|
|
11651
|
+
const { processedPrompt, additionalVars, additionalMetadata } = await processPromptForInputs(prompt, inputs, plugin, unalignedProvider, purpose, materializationIndex);
|
|
10773
11652
|
const testCase = createTestCase(injectVar, processedPrompt, plugin);
|
|
10774
11653
|
if (Object.keys(additionalVars).length > 0) testCase.vars = {
|
|
10775
11654
|
...testCase.vars,
|
|
10776
11655
|
...additionalVars
|
|
10777
11656
|
};
|
|
11657
|
+
if (additionalMetadata) testCase.metadata = {
|
|
11658
|
+
...testCase.metadata,
|
|
11659
|
+
inputMaterialization: additionalMetadata
|
|
11660
|
+
};
|
|
10778
11661
|
return testCase;
|
|
10779
|
-
});
|
|
11662
|
+
}));
|
|
10780
11663
|
}
|
|
10781
11664
|
//#endregion
|
|
10782
11665
|
//#region src/redteam/plugins/teenSafety/graderExamples.ts
|
|
@@ -11094,9 +11977,9 @@ const MAX_CHARS_RETRY_MODIFIER_KEY = "__maxCharsPerMessageRetry";
|
|
|
11094
11977
|
function computeModifiersFromConfig(config) {
|
|
11095
11978
|
const modifiers = { ...config?.modifiers };
|
|
11096
11979
|
if (config?.language && typeof config.language === "string") modifiers.language = config.language;
|
|
11097
|
-
if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(config.inputs).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
|
|
11098
|
-
const maxCharsModifier =
|
|
11099
|
-
if (maxCharsModifier) modifiers[
|
|
11980
|
+
if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(require_inputVariables.buildPromptInputDescriptions(config.inputs) ?? {}).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
|
|
11981
|
+
const maxCharsModifier = require_promptLength.getMaxCharsPerMessageModifierValue(config?.maxCharsPerMessage);
|
|
11982
|
+
if (maxCharsModifier) modifiers[require_promptLength.MAX_CHARS_PER_MESSAGE_MODIFIER_KEY] = maxCharsModifier;
|
|
11100
11983
|
return modifiers;
|
|
11101
11984
|
}
|
|
11102
11985
|
function applyDefaultGraderExamples(key, config) {
|
|
@@ -11120,7 +12003,7 @@ function isValidMaxCharsPerMessage(limit) {
|
|
|
11120
12003
|
}
|
|
11121
12004
|
function getMaxCharsPerMessageFromConfig(config) {
|
|
11122
12005
|
if (isValidMaxCharsPerMessage(config?.maxCharsPerMessage)) return config.maxCharsPerMessage;
|
|
11123
|
-
const maxCharsModifier = (config?.modifiers)?.[
|
|
12006
|
+
const maxCharsModifier = (config?.modifiers)?.[require_promptLength.MAX_CHARS_PER_MESSAGE_MODIFIER_KEY];
|
|
11124
12007
|
if (typeof maxCharsModifier !== "string") return;
|
|
11125
12008
|
const match = /must be (\d+) characters or fewer\./.exec(maxCharsModifier);
|
|
11126
12009
|
if (!match) return;
|
|
@@ -11179,9 +12062,10 @@ function dedupeTestCases(testCases) {
|
|
|
11179
12062
|
return deduped;
|
|
11180
12063
|
}
|
|
11181
12064
|
function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
|
|
12065
|
+
const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
|
|
11182
12066
|
return dedent.default`
|
|
11183
12067
|
Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
|
|
11184
|
-
The longest rejected prompt was ${
|
|
12068
|
+
The longest rejected prompt was ${longestRejectedPromptText}.
|
|
11185
12069
|
Generate replacement prompts only, and keep every user message within the character limit.
|
|
11186
12070
|
`.trim();
|
|
11187
12071
|
}
|
|
@@ -11203,7 +12087,7 @@ function withMaxCharsRetries(pluginFactory) {
|
|
|
11203
12087
|
const rejectedPromptLengths = [];
|
|
11204
12088
|
let rejectedPromptLimit;
|
|
11205
12089
|
for (const testCase of generatedTestCases) {
|
|
11206
|
-
const violation =
|
|
12090
|
+
const violation = require_promptLength.getGeneratedPromptOverLimit(String(testCase.vars?.[params.injectVar] ?? ""), maxCharsPerMessage);
|
|
11207
12091
|
if (violation) {
|
|
11208
12092
|
rejectedPromptLengths.push(violation.length);
|
|
11209
12093
|
rejectedPromptLimit = violation.limit;
|
|
@@ -11220,16 +12104,16 @@ function withMaxCharsRetries(pluginFactory) {
|
|
|
11220
12104
|
}
|
|
11221
12105
|
async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
11222
12106
|
require_invariant.invariant(!require_logger.getEnvBool("PROMPTFOO_DISABLE_REDTEAM_REMOTE_GENERATION"), "fetchRemoteTestCases should never be called when remote generation is disabled");
|
|
11223
|
-
const remoteHealth = await checkRemoteHealth(
|
|
12107
|
+
const remoteHealth = await checkRemoteHealth(require_remoteGeneration.getRemoteHealthUrl());
|
|
11224
12108
|
if (remoteHealth.status !== "OK") {
|
|
11225
12109
|
require_logger.logger.error(`Error generating test cases for ${key}: ${remoteHealth.message}`);
|
|
11226
12110
|
return [];
|
|
11227
12111
|
}
|
|
11228
12112
|
const { graderExamples, ...configForRemote } = config ?? {};
|
|
11229
|
-
const maxCharsModifier =
|
|
12113
|
+
const maxCharsModifier = require_promptLength.getMaxCharsPerMessageModifierValue(config?.maxCharsPerMessage);
|
|
11230
12114
|
if (maxCharsModifier) configForRemote.modifiers = {
|
|
11231
12115
|
...configForRemote.modifiers ?? {},
|
|
11232
|
-
[
|
|
12116
|
+
[require_promptLength.MAX_CHARS_PER_MESSAGE_MODIFIER_KEY]: maxCharsModifier
|
|
11233
12117
|
};
|
|
11234
12118
|
const body = JSON.stringify({
|
|
11235
12119
|
config: configForRemote,
|
|
@@ -11238,11 +12122,11 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
11238
12122
|
n,
|
|
11239
12123
|
purpose,
|
|
11240
12124
|
task: key,
|
|
11241
|
-
version:
|
|
12125
|
+
version: require_version.VERSION,
|
|
11242
12126
|
email: require_accounts.getUserEmail()
|
|
11243
12127
|
});
|
|
11244
12128
|
try {
|
|
11245
|
-
const { data, status, statusText } = await require_cache.fetchWithCache(
|
|
12129
|
+
const { data, status, statusText } = await require_cache.fetchWithCache(require_remoteGeneration.getRemoteGenerationUrl(), {
|
|
11246
12130
|
method: "POST",
|
|
11247
12131
|
headers: { "Content-Type": "application/json" },
|
|
11248
12132
|
body
|
|
@@ -11259,23 +12143,56 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
11259
12143
|
return [];
|
|
11260
12144
|
}
|
|
11261
12145
|
}
|
|
12146
|
+
async function materializeRemoteTestCaseInputs({ config, injectVar, pluginId, provider, purpose, testCases }) {
|
|
12147
|
+
const inputs = config.inputs;
|
|
12148
|
+
if (!inputs || Object.keys(inputs).length === 0) return testCases;
|
|
12149
|
+
return Promise.all(testCases.map(async (testCase, materializationIndex) => {
|
|
12150
|
+
const inputVars = require_util$2.extractInputVarsFromPrompt(String(testCase.vars?.[injectVar] ?? ""), inputs);
|
|
12151
|
+
if (!inputVars) return testCase;
|
|
12152
|
+
const materializedVars = await require_inputVariables.materializeInputVariablesWithMetadata(inputVars, inputs, {
|
|
12153
|
+
materializationIndex,
|
|
12154
|
+
pluginId,
|
|
12155
|
+
provider,
|
|
12156
|
+
purpose
|
|
12157
|
+
});
|
|
12158
|
+
return {
|
|
12159
|
+
...testCase,
|
|
12160
|
+
vars: {
|
|
12161
|
+
...testCase.vars || {},
|
|
12162
|
+
...materializedVars.vars
|
|
12163
|
+
},
|
|
12164
|
+
metadata: {
|
|
12165
|
+
...testCase.metadata || {},
|
|
12166
|
+
...materializedVars.metadata ? { inputMaterialization: materializedVars.metadata } : {}
|
|
12167
|
+
}
|
|
12168
|
+
};
|
|
12169
|
+
}));
|
|
12170
|
+
}
|
|
11262
12171
|
function createPluginFactory(PluginClass, key, validate) {
|
|
11263
12172
|
return {
|
|
11264
12173
|
key,
|
|
11265
12174
|
validate,
|
|
11266
12175
|
action: async ({ provider, purpose, injectVar, n, delayMs, config }) => {
|
|
11267
12176
|
const configWithDefaults = applyDefaultGraderExamples(key, config);
|
|
11268
|
-
if (PluginClass.canGenerateRemote === false || !
|
|
12177
|
+
if (PluginClass.canGenerateRemote === false || !require_remoteGeneration.shouldGenerateRemote()) {
|
|
11269
12178
|
require_logger.logger.debug(`Using local redteam generation for ${key}`);
|
|
11270
12179
|
return new PluginClass(provider, purpose, injectVar, configWithDefaults).generateTests(n, delayMs);
|
|
11271
12180
|
}
|
|
11272
|
-
const
|
|
12181
|
+
const pluginId = require_util$2.getShortPluginId(key);
|
|
12182
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12183
|
+
config: configWithDefaults ?? {},
|
|
12184
|
+
injectVar,
|
|
12185
|
+
pluginId,
|
|
12186
|
+
provider,
|
|
12187
|
+
purpose,
|
|
12188
|
+
testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
|
|
12189
|
+
});
|
|
11273
12190
|
const computedModifiers = computeModifiersFromConfig(configWithDefaults);
|
|
11274
12191
|
return testCases.map((testCase) => ({
|
|
11275
12192
|
...testCase,
|
|
11276
12193
|
metadata: {
|
|
11277
12194
|
...testCase.metadata,
|
|
11278
|
-
pluginId
|
|
12195
|
+
pluginId,
|
|
11279
12196
|
pluginConfig: {
|
|
11280
12197
|
...configWithDefaults,
|
|
11281
12198
|
modifiers: computedModifiers
|
|
@@ -11331,8 +12248,8 @@ const pluginFactories = [
|
|
|
11331
12248
|
...unalignedHarmCategories.map((category) => ({
|
|
11332
12249
|
key: category,
|
|
11333
12250
|
action: async (params) => {
|
|
11334
|
-
if (
|
|
11335
|
-
require_logger.logger.error(`${category} plugin
|
|
12251
|
+
if (require_remoteGeneration.neverGenerateRemote()) {
|
|
12252
|
+
require_logger.logger.error(require_remoteGeneration.getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
|
|
11336
12253
|
return [];
|
|
11337
12254
|
}
|
|
11338
12255
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -11341,7 +12258,7 @@ const pluginFactories = [
|
|
|
11341
12258
|
...testCase,
|
|
11342
12259
|
metadata: {
|
|
11343
12260
|
...testCase.metadata,
|
|
11344
|
-
pluginId:
|
|
12261
|
+
pluginId: require_util$2.getShortPluginId(category),
|
|
11345
12262
|
pluginConfig: {
|
|
11346
12263
|
...params.config,
|
|
11347
12264
|
modifiers: computedModifiers
|
|
@@ -11354,14 +12271,22 @@ const pluginFactories = [
|
|
|
11354
12271
|
const piiPlugins = require_types.PII_PLUGINS.map((category) => ({
|
|
11355
12272
|
key: category,
|
|
11356
12273
|
action: async (params) => {
|
|
11357
|
-
if (
|
|
11358
|
-
const
|
|
12274
|
+
if (require_remoteGeneration.shouldGenerateRemote()) {
|
|
12275
|
+
const pluginId = require_util$2.getShortPluginId(category);
|
|
12276
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12277
|
+
config: params.config ?? {},
|
|
12278
|
+
injectVar: params.injectVar,
|
|
12279
|
+
pluginId,
|
|
12280
|
+
provider: params.provider,
|
|
12281
|
+
purpose: params.purpose,
|
|
12282
|
+
testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
|
|
12283
|
+
});
|
|
11359
12284
|
const computedModifiers = computeModifiersFromConfig(params.config);
|
|
11360
12285
|
return testCases.map((testCase) => ({
|
|
11361
12286
|
...testCase,
|
|
11362
12287
|
metadata: {
|
|
11363
12288
|
...testCase.metadata,
|
|
11364
|
-
pluginId
|
|
12289
|
+
pluginId,
|
|
11365
12290
|
pluginConfig: {
|
|
11366
12291
|
...params.config,
|
|
11367
12292
|
modifiers: computedModifiers
|
|
@@ -11374,7 +12299,7 @@ const piiPlugins = require_types.PII_PLUGINS.map((category) => ({
|
|
|
11374
12299
|
...testCase,
|
|
11375
12300
|
metadata: {
|
|
11376
12301
|
...testCase.metadata,
|
|
11377
|
-
pluginId:
|
|
12302
|
+
pluginId: require_util$2.getShortPluginId(category)
|
|
11378
12303
|
}
|
|
11379
12304
|
}));
|
|
11380
12305
|
}
|
|
@@ -11382,17 +12307,25 @@ const piiPlugins = require_types.PII_PLUGINS.map((category) => ({
|
|
|
11382
12307
|
const biasPlugins = require_types.BIAS_PLUGINS.map((category) => ({
|
|
11383
12308
|
key: category,
|
|
11384
12309
|
action: async (params) => {
|
|
11385
|
-
if (
|
|
11386
|
-
require_logger.logger.error(`${category} plugin
|
|
12310
|
+
if (require_remoteGeneration.neverGenerateRemote()) {
|
|
12311
|
+
require_logger.logger.error(require_remoteGeneration.getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
|
|
11387
12312
|
return [];
|
|
11388
12313
|
}
|
|
11389
|
-
const
|
|
12314
|
+
const pluginId = require_util$2.getShortPluginId(category);
|
|
12315
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12316
|
+
config: params.config ?? {},
|
|
12317
|
+
injectVar: params.injectVar,
|
|
12318
|
+
pluginId,
|
|
12319
|
+
provider: params.provider,
|
|
12320
|
+
purpose: params.purpose,
|
|
12321
|
+
testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
|
|
12322
|
+
});
|
|
11390
12323
|
const computedModifiers = computeModifiersFromConfig(params.config);
|
|
11391
12324
|
return testCases.map((testCase) => ({
|
|
11392
12325
|
...testCase,
|
|
11393
12326
|
metadata: {
|
|
11394
12327
|
...testCase.metadata,
|
|
11395
|
-
pluginId
|
|
12328
|
+
pluginId,
|
|
11396
12329
|
pluginConfig: {
|
|
11397
12330
|
...params.config,
|
|
11398
12331
|
modifiers: computedModifiers
|
|
@@ -11405,19 +12338,27 @@ function createRemotePlugin(key, validate) {
|
|
|
11405
12338
|
return {
|
|
11406
12339
|
key,
|
|
11407
12340
|
validate,
|
|
11408
|
-
action: async ({ purpose, injectVar, n, config }) => {
|
|
12341
|
+
action: async ({ provider, purpose, injectVar, n, config }) => {
|
|
11409
12342
|
const configWithDefaults = applyDefaultRemotePluginConfig(key, config);
|
|
11410
|
-
if (
|
|
11411
|
-
require_logger.logger.error(`${key} plugin
|
|
12343
|
+
if (require_remoteGeneration.neverGenerateRemote()) {
|
|
12344
|
+
require_logger.logger.error(require_remoteGeneration.getRemoteGenerationExplicitlyDisabledError(`${key} plugin`));
|
|
11412
12345
|
return [];
|
|
11413
12346
|
}
|
|
11414
|
-
const
|
|
12347
|
+
const pluginId = require_util$2.getShortPluginId(key);
|
|
12348
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12349
|
+
config: configWithDefaults ?? {},
|
|
12350
|
+
injectVar,
|
|
12351
|
+
pluginId,
|
|
12352
|
+
provider,
|
|
12353
|
+
purpose,
|
|
12354
|
+
testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
|
|
12355
|
+
});
|
|
11415
12356
|
const computedModifiers = computeModifiersFromConfig(configWithDefaults);
|
|
11416
12357
|
const testsWithMetadata = testCases.map((testCase) => ({
|
|
11417
12358
|
...testCase,
|
|
11418
12359
|
metadata: {
|
|
11419
12360
|
...testCase.metadata,
|
|
11420
|
-
pluginId
|
|
12361
|
+
pluginId,
|
|
11421
12362
|
pluginConfig: {
|
|
11422
12363
|
...configWithDefaults,
|
|
11423
12364
|
modifiers: computedModifiers
|
|
@@ -11487,6 +12428,37 @@ function getPolicyText(metadata) {
|
|
|
11487
12428
|
return typeof policyObject.text === "string" && policyObject.text.length > 0 ? policyObject.text : void 0;
|
|
11488
12429
|
}
|
|
11489
12430
|
}
|
|
12431
|
+
async function rematerializeStrategyInputVars(testCase, injectVar, provider, purpose, materializationIndex) {
|
|
12432
|
+
const inputs = testCase.metadata?.pluginConfig?.inputs;
|
|
12433
|
+
const inputMaterialization = testCase.metadata?.inputMaterialization;
|
|
12434
|
+
if (!inputs || Object.keys(inputs).length === 0 || !testCase.vars?.[injectVar]) return {
|
|
12435
|
+
inputMaterialization,
|
|
12436
|
+
vars: testCase.vars
|
|
12437
|
+
};
|
|
12438
|
+
try {
|
|
12439
|
+
const materializedVars = await require_util$2.extractMaterializedVariablesFromJsonWithMetadata(JSON.parse(String(testCase.vars[injectVar])), inputs, {
|
|
12440
|
+
materializationIndex,
|
|
12441
|
+
pluginId: String(testCase.metadata?.pluginId || "unknown-plugin"),
|
|
12442
|
+
provider,
|
|
12443
|
+
purpose
|
|
12444
|
+
});
|
|
12445
|
+
return {
|
|
12446
|
+
inputMaterialization: materializedVars.metadata ? {
|
|
12447
|
+
...inputMaterialization,
|
|
12448
|
+
...materializedVars.metadata
|
|
12449
|
+
} : inputMaterialization,
|
|
12450
|
+
vars: {
|
|
12451
|
+
...testCase.vars,
|
|
12452
|
+
...materializedVars.vars
|
|
12453
|
+
}
|
|
12454
|
+
};
|
|
12455
|
+
} catch {
|
|
12456
|
+
return {
|
|
12457
|
+
inputMaterialization,
|
|
12458
|
+
vars: testCase.vars
|
|
12459
|
+
};
|
|
12460
|
+
}
|
|
12461
|
+
}
|
|
11490
12462
|
/**
|
|
11491
12463
|
* Gets the severity level for a plugin based on its ID and configuration.
|
|
11492
12464
|
* @param pluginId - The ID of the plugin.
|
|
@@ -11495,7 +12467,7 @@ function getPolicyText(metadata) {
|
|
|
11495
12467
|
*/
|
|
11496
12468
|
function getPluginSeverity(pluginId, pluginConfig) {
|
|
11497
12469
|
if (pluginConfig?.severity) return pluginConfig.severity;
|
|
11498
|
-
const shortId =
|
|
12470
|
+
const shortId = require_util$2.getShortPluginId(pluginId);
|
|
11499
12471
|
return shortId in require_types.riskCategorySeverityMap ? require_types.riskCategorySeverityMap[shortId] : require_types.Severity.Low;
|
|
11500
12472
|
}
|
|
11501
12473
|
const POLICY_PREVIEW_MAX_LENGTH = 20;
|
|
@@ -11620,14 +12592,15 @@ function buildRedteamModifiers({ maxCharsPerMessage, pluginConfig, testGeneratio
|
|
|
11620
12592
|
...testGenerationInstructions ? { testGenerationInstructions } : {},
|
|
11621
12593
|
...pluginConfig?.modifiers ?? {}
|
|
11622
12594
|
};
|
|
11623
|
-
const maxCharsPerMessageModifier =
|
|
11624
|
-
if (maxCharsPerMessageModifier) modifiers[
|
|
12595
|
+
const maxCharsPerMessageModifier = require_promptLength.getMaxCharsPerMessageModifierValue(maxCharsPerMessage ?? pluginConfig?.maxCharsPerMessage);
|
|
12596
|
+
if (maxCharsPerMessageModifier) modifiers[require_promptLength.MAX_CHARS_PER_MESSAGE_MODIFIER_KEY] = maxCharsPerMessageModifier;
|
|
11625
12597
|
return modifiers;
|
|
11626
12598
|
}
|
|
11627
12599
|
const categories = {
|
|
11628
12600
|
foundation: require_types.FOUNDATION_PLUGINS,
|
|
11629
12601
|
harmful: Object.keys(require_types.HARM_PLUGINS),
|
|
11630
12602
|
"coding-agent:core": require_types.CODING_AGENT_CORE_PLUGINS,
|
|
12603
|
+
"coding-agent:all": require_types.CODING_AGENT_PLUGINS,
|
|
11631
12604
|
bias: require_types.BIAS_PLUGINS,
|
|
11632
12605
|
pii: require_types.PII_PLUGINS,
|
|
11633
12606
|
medical: require_types.MEDICAL_PLUGINS,
|
|
@@ -11657,7 +12630,7 @@ function getLanguageForTestCase(test) {
|
|
|
11657
12630
|
function filterOversizedTestCases(testCases, injectVar, sourceLabel, maxCharsPerMessage) {
|
|
11658
12631
|
return testCases.filter((testCase) => {
|
|
11659
12632
|
const testCaseMaxCharsPerMessage = maxCharsPerMessage ?? (testCase.metadata?.strategyConfig)?.maxCharsPerMessage ?? (testCase.metadata?.pluginConfig)?.maxCharsPerMessage;
|
|
11660
|
-
const violation =
|
|
12633
|
+
const violation = require_promptLength.getGeneratedPromptOverLimit(String(testCase.vars?.[injectVar] ?? ""), testCaseMaxCharsPerMessage);
|
|
11661
12634
|
if (!violation) return true;
|
|
11662
12635
|
require_logger.logger.warn(`[${sourceLabel}] Dropping generated test case that exceeds maxCharsPerMessage=${violation.limit} (${violation.length} chars)`);
|
|
11663
12636
|
return false;
|
|
@@ -11722,18 +12695,18 @@ function addLanguageToPluginMetadata(test, lang, plugin, maxCharsPerMessage, tes
|
|
|
11722
12695
|
* @param injectVar - The variable to inject.
|
|
11723
12696
|
* @returns An array of new test cases generated by strategies.
|
|
11724
12697
|
*/
|
|
11725
|
-
async function applyStrategies(testCases, strategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
|
|
12698
|
+
async function applyStrategies(testCases, strategies, injectVar, provider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
|
|
11726
12699
|
const newTestCases = [];
|
|
11727
12700
|
const strategyResults = {};
|
|
11728
12701
|
for (const strategy of strategies) {
|
|
11729
12702
|
require_logger.logger.debug(`Generating ${strategy.id} tests`);
|
|
11730
12703
|
let strategyAction;
|
|
11731
|
-
if (strategy.id.startsWith("file://")) strategyAction = (await
|
|
12704
|
+
if (strategy.id.startsWith("file://")) strategyAction = (await require_strategies.loadStrategy(strategy.id)).action;
|
|
11732
12705
|
else {
|
|
11733
|
-
let builtinStrategy =
|
|
12706
|
+
let builtinStrategy = require_strategies.Strategies.find((s) => s.id === strategy.id);
|
|
11734
12707
|
if (!builtinStrategy && strategy.id.includes(":")) {
|
|
11735
12708
|
const baseStrategyId = strategy.id.split(":")[0];
|
|
11736
|
-
builtinStrategy =
|
|
12709
|
+
builtinStrategy = require_strategies.Strategies.find((s) => s.id === baseStrategyId);
|
|
11737
12710
|
}
|
|
11738
12711
|
if (!builtinStrategy) {
|
|
11739
12712
|
require_logger.logger.warn(`Strategy ${strategy.id} not registered, skipping`);
|
|
@@ -11743,7 +12716,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
11743
12716
|
}
|
|
11744
12717
|
const targetPlugins = strategy.config?.plugins;
|
|
11745
12718
|
const applicableTestCases = testCases.filter((t) => {
|
|
11746
|
-
if (!
|
|
12719
|
+
if (!require_strategies.pluginMatchesStrategyTargets(t, strategy.id, targetPlugins)) return false;
|
|
11747
12720
|
if (t.metadata?.retry === true) {
|
|
11748
12721
|
require_logger.logger.debug(`Skipping ${strategy.id} for retry test (plugin: ${t.metadata?.pluginId}) - retry tests are not transformed`);
|
|
11749
12722
|
return false;
|
|
@@ -11778,14 +12751,8 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
11778
12751
|
}
|
|
11779
12752
|
}
|
|
11780
12753
|
resultTestCases = filterOversizedTestCases(resultTestCases, injectVar, `Strategy ${strategy.id}`, maxCharsPerMessage);
|
|
11781
|
-
newTestCases.push(...resultTestCases.map((t) => {
|
|
11782
|
-
const
|
|
11783
|
-
let updatedVars = t.vars;
|
|
11784
|
-
if (inputs && Object.keys(inputs).length > 0 && t.vars?.[injectVar]) try {
|
|
11785
|
-
const parsed = JSON.parse(String(t.vars[injectVar]));
|
|
11786
|
-
updatedVars = { ...t.vars };
|
|
11787
|
-
Object.assign(updatedVars, require_providers.extractVariablesFromJson(parsed, inputs));
|
|
11788
|
-
} catch {}
|
|
12754
|
+
newTestCases.push(...await Promise.all(resultTestCases.map(async (t, materializationIndex) => {
|
|
12755
|
+
const { inputMaterialization, vars } = await rematerializeStrategyInputVars(t, injectVar, provider, purpose, materializationIndex);
|
|
11789
12756
|
const strategyConfig = {
|
|
11790
12757
|
...strategy.config || {},
|
|
11791
12758
|
...maxCharsPerMessage ? { maxCharsPerMessage } : {},
|
|
@@ -11793,16 +12760,17 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
11793
12760
|
};
|
|
11794
12761
|
return {
|
|
11795
12762
|
...t,
|
|
11796
|
-
vars
|
|
12763
|
+
vars,
|
|
11797
12764
|
metadata: {
|
|
11798
12765
|
...t?.metadata || {},
|
|
11799
12766
|
...strategy.id !== "retry" && { strategyId: t?.metadata?.strategyId || strategy.id },
|
|
11800
12767
|
...t?.metadata?.pluginId && { pluginId: t.metadata.pluginId },
|
|
11801
12768
|
...t?.metadata?.pluginConfig && { pluginConfig: t.metadata.pluginConfig },
|
|
12769
|
+
...inputMaterialization && { inputMaterialization },
|
|
11802
12770
|
...Object.keys(strategyConfig).length > 0 && { strategyConfig }
|
|
11803
12771
|
}
|
|
11804
12772
|
};
|
|
11805
|
-
}));
|
|
12773
|
+
})));
|
|
11806
12774
|
const displayId = strategy.id === "layer" && Array.isArray(strategy.config?.steps) ? `layer(${strategy.config.steps.map((st) => typeof st === "string" ? st : st.id).join("→")})` : strategy.id;
|
|
11807
12775
|
const languagesInResults = new Set(strategyTestCases.map((t) => getLanguageForTestCase(t)).filter((lang) => lang !== void 0));
|
|
11808
12776
|
const applyNumTestsCap = (calculatedRequested) => {
|
|
@@ -11960,10 +12928,10 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
11960
12928
|
seen.add(key);
|
|
11961
12929
|
return true;
|
|
11962
12930
|
});
|
|
11963
|
-
const needsGoalExtraction = strategies.some((s) =>
|
|
11964
|
-
await
|
|
12931
|
+
const needsGoalExtraction = strategies.some((s) => require_strategies.Strategies.find((def) => def.id === s.id)?.requiresGoalExtraction);
|
|
12932
|
+
await require_strategies.validateStrategies(strategies);
|
|
11965
12933
|
await validateSharpDependency(strategies, plugins);
|
|
11966
|
-
const redteamProvider = await
|
|
12934
|
+
const redteamProvider = await require_shared.redteamProviderManager.getProvider({ provider });
|
|
11967
12935
|
const { effectiveStrategyCount, includeBasicTests, totalPluginTests, totalTests } = calculateTotalTests(plugins, strategies, language);
|
|
11968
12936
|
require_logger.logger.info(`Synthesizing test cases for ${prompts.length} ${prompts.length === 1 ? "prompt" : "prompts"}...\nUsing plugins:\n\n${chalk.default.yellow(plugins.map((p) => {
|
|
11969
12937
|
const pluginLanguageConfig = p.config?.language ?? language;
|
|
@@ -12067,8 +13035,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12067
13035
|
};
|
|
12068
13036
|
require_logger.logger.debug("Validating plugins...");
|
|
12069
13037
|
plugins = [...new Set(expandedPlugins)].filter(validatePlugin).sort();
|
|
12070
|
-
if (
|
|
12071
|
-
const healthUrl =
|
|
13038
|
+
if (require_remoteGeneration.shouldGenerateRemote()) {
|
|
13039
|
+
const healthUrl = require_remoteGeneration.getRemoteHealthUrl();
|
|
12072
13040
|
if (healthUrl) {
|
|
12073
13041
|
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
12074
13042
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
@@ -12163,7 +13131,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12163
13131
|
const promptVar = testCase.vars?.[injectVar];
|
|
12164
13132
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
12165
13133
|
const policy = getPolicyText(testCase.metadata);
|
|
12166
|
-
const extractedGoal = await
|
|
13134
|
+
const extractedGoal = await require_util$2.extractGoalFromPrompt(prompt, purpose, plugin.id, policy);
|
|
12167
13135
|
testCase.metadata.goal = extractedGoal;
|
|
12168
13136
|
}
|
|
12169
13137
|
}
|
|
@@ -12218,7 +13186,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12218
13186
|
const promptVar = testCase.vars?.[injectVar];
|
|
12219
13187
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
12220
13188
|
const policy = getPolicyText(testCase.metadata);
|
|
12221
|
-
const extractedGoal = await
|
|
13189
|
+
const extractedGoal = await require_util$2.extractGoalFromPrompt(prompt, purpose, plugin.id, policy);
|
|
12222
13190
|
testCase.metadata.goal = extractedGoal;
|
|
12223
13191
|
}
|
|
12224
13192
|
}
|
|
@@ -12257,7 +13225,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12257
13225
|
targetIds,
|
|
12258
13226
|
...retryStrategy.config
|
|
12259
13227
|
};
|
|
12260
|
-
const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, void 0, maxCharsPerMessage);
|
|
13228
|
+
const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, redteamProvider, purpose, void 0, maxCharsPerMessage);
|
|
12261
13229
|
pluginTestCases.push(...retryTestCases);
|
|
12262
13230
|
Object.assign(strategyResults, retryResults);
|
|
12263
13231
|
if (showProgressBar) progressBar?.increment(retryTestCases.length);
|
|
@@ -12265,7 +13233,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12265
13233
|
checkAbort();
|
|
12266
13234
|
const nonBasicStrategies = strategies.filter((s) => !["basic", "retry"].includes(s.id));
|
|
12267
13235
|
if (showProgressBar && nonBasicStrategies.length > 0) progressBar?.update({ task: "Applying strategies" });
|
|
12268
|
-
const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
|
|
13236
|
+
const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, redteamProvider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
|
|
12269
13237
|
Object.assign(strategyResults, otherStrategyResults);
|
|
12270
13238
|
if (showProgressBar && strategyTestCases.length > 0) progressBar?.increment(strategyTestCases.length);
|
|
12271
13239
|
const finalTestCases = [...includeBasicTests ? pluginTestCases : [], ...strategyTestCases];
|
|
@@ -12317,7 +13285,7 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
12317
13285
|
}
|
|
12318
13286
|
function getConfigHash(configPath) {
|
|
12319
13287
|
const content = fs.readFileSync(configPath, "utf8");
|
|
12320
|
-
return (0, crypto$1.createHash)("md5").update(`${
|
|
13288
|
+
return (0, crypto$1.createHash)("md5").update(`${require_version.VERSION}:${content}`).digest("hex");
|
|
12321
13289
|
}
|
|
12322
13290
|
function createHeaderComments({ title, timestampLabel, author, cloudHost, testCasesCount, plugins, strategies, isUpdate = false }) {
|
|
12323
13291
|
const sectionLabel = isUpdate ? "Changes:" : "Test Configuration:";
|
|
@@ -12392,7 +13360,7 @@ async function doGenerateRedteam(options) {
|
|
|
12392
13360
|
redteamConfig = resolved.config.redteam;
|
|
12393
13361
|
commandLineOptions = resolved.commandLineOptions;
|
|
12394
13362
|
resolvedConfig = resolved.config;
|
|
12395
|
-
await
|
|
13363
|
+
await require_storage.checkCloudPermissions(resolved.config);
|
|
12396
13364
|
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
12397
13365
|
⚠️ Warning: Found both 'tests' section and 'redteam' configuration in your config file.
|
|
12398
13366
|
|
|
@@ -12407,8 +13375,8 @@ async function doGenerateRedteam(options) {
|
|
|
12407
13375
|
`));
|
|
12408
13376
|
try {
|
|
12409
13377
|
const providerId = require_providers.getProviderIds(resolved.config.providers)[0];
|
|
12410
|
-
if (
|
|
12411
|
-
const overrides = await
|
|
13378
|
+
if (require_storage.isCloudProvider(providerId)) {
|
|
13379
|
+
const overrides = await require_storage.getPluginSeverityOverridesFromCloud(require_storage.getCloudDatabaseId(providerId));
|
|
12412
13380
|
if (overrides) {
|
|
12413
13381
|
pluginSeverityOverrides = new Map(Object.entries(overrides.severities));
|
|
12414
13382
|
pluginSeverityOverridesId = overrides.id;
|
|
@@ -12426,7 +13394,7 @@ async function doGenerateRedteam(options) {
|
|
|
12426
13394
|
require_logger.logger.info(chalk.default.red(`\nCan't generate without configuration - run ${chalk.default.yellow.bold(promptfooCommand("redteam init"))} first`));
|
|
12427
13395
|
return null;
|
|
12428
13396
|
}
|
|
12429
|
-
if (!
|
|
13397
|
+
if (!require_remoteGeneration.neverGenerateRemote()) {
|
|
12430
13398
|
let hasValidEmail = false;
|
|
12431
13399
|
while (!hasValidEmail) {
|
|
12432
13400
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
@@ -12490,7 +13458,7 @@ async function doGenerateRedteam(options) {
|
|
|
12490
13458
|
}
|
|
12491
13459
|
const policyPluginsWithRefs = plugins.filter((plugin) => plugin.config?.policy && require_graders.isValidPolicyObject(plugin.config?.policy) && require_graders.determinePolicyTypeFromId(plugin.config.policy.id) === "reusable");
|
|
12492
13460
|
if (policyPluginsWithRefs.length > 0) {
|
|
12493
|
-
const teamId = (await
|
|
13461
|
+
const teamId = (await require_storage.resolveTeamId()).id;
|
|
12494
13462
|
const policiesById = await require_graders.getCustomPolicies(policyPluginsWithRefs, teamId);
|
|
12495
13463
|
for (const policyPlugin of policyPluginsWithRefs) {
|
|
12496
13464
|
const policyId = policyPlugin.config.policy.id;
|
|
@@ -13062,7 +14030,7 @@ async function sendChunkedResults(evalRecord, url$5, options = {}) {
|
|
|
13062
14030
|
const isVerbose = require_logger.isDebugEnabled();
|
|
13063
14031
|
const { silent = false } = options;
|
|
13064
14032
|
require_logger.logger.debug(`Starting chunked results upload to ${url$5}`);
|
|
13065
|
-
await
|
|
14033
|
+
await require_storage.checkCloudPermissions(evalRecord.config);
|
|
13066
14034
|
const inlineBlobs = require_extractor.isBlobStorageEnabled() && require_logger.getEnvBool("PROMPTFOO_SHARE_INLINE_BLOBS", !require_fetch.cloudConfig.isEnabled());
|
|
13067
14035
|
const inlineCache = inlineBlobs ? createBlobInlineCache() : null;
|
|
13068
14036
|
let sampleResults = (await evalRecord.fetchResultsBatched(100).next()).value ?? [];
|
|
@@ -13158,6 +14126,10 @@ function stripAuthFromUrl(urlString) {
|
|
|
13158
14126
|
}
|
|
13159
14127
|
}
|
|
13160
14128
|
async function handleEmailCollection(evalRecord) {
|
|
14129
|
+
if (evalRecord.author) {
|
|
14130
|
+
require_logger.logger.debug(`[Share] Skipping email collection because author is already set`, { evalId: evalRecord.id });
|
|
14131
|
+
return;
|
|
14132
|
+
}
|
|
13161
14133
|
if (!process.stdout.isTTY || require_logger.isCI() || require_logger.getEnvBool("PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST")) return;
|
|
13162
14134
|
let email = require_accounts.getUserEmail();
|
|
13163
14135
|
if (!email) {
|
|
@@ -13200,7 +14172,7 @@ async function createShareableUrl(evalRecord, options = {}) {
|
|
|
13200
14172
|
return null;
|
|
13201
14173
|
}
|
|
13202
14174
|
if (!silent) {
|
|
13203
|
-
const orgContext = await
|
|
14175
|
+
const orgContext = await require_storage.getOrgContext();
|
|
13204
14176
|
if (orgContext) {
|
|
13205
14177
|
const teamSuffix = orgContext.teamName ? ` > ${orgContext.teamName}` : "";
|
|
13206
14178
|
require_logger.logger.info(`${chalk.default.dim("Sharing to:")} ${chalk.default.cyan(orgContext.organizationName)}${teamSuffix}`);
|
|
@@ -13230,7 +14202,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
13230
14202
|
for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => require_text.ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
|
|
13231
14203
|
text = require_text.ellipsize(text, tableCellMaxLength);
|
|
13232
14204
|
if (pass) return chalk.default.green("[PASS] ") + text;
|
|
13233
|
-
|
|
14205
|
+
return chalk.default.red(failureType === require_types.ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.default.red.bold(c) : c).join("---");
|
|
13234
14206
|
})]);
|
|
13235
14207
|
return table.toString();
|
|
13236
14208
|
}
|
|
@@ -13301,6 +14273,14 @@ function shouldShareResults(opts) {
|
|
|
13301
14273
|
return require_fetch.cloudConfig.isEnabled() && sharing !== false;
|
|
13302
14274
|
}
|
|
13303
14275
|
//#endregion
|
|
14276
|
+
//#region src/commands/eval/redteamWarning.ts
|
|
14277
|
+
function warnIfRedteamConfigHasNoTests(config, testSuite) {
|
|
14278
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
14279
|
+
Warning: Config file has a redteam section but no test cases.
|
|
14280
|
+
Did you mean to run ${chalk.default.bold("promptfoo redteam generate")} instead?
|
|
14281
|
+
`));
|
|
14282
|
+
}
|
|
14283
|
+
//#endregion
|
|
13304
14284
|
//#region src/util/formatDuration.ts
|
|
13305
14285
|
/**
|
|
13306
14286
|
* Formats a duration in seconds into a human-readable string
|
|
@@ -13321,6 +14301,115 @@ function formatDuration(seconds) {
|
|
|
13321
14301
|
}
|
|
13322
14302
|
//#endregion
|
|
13323
14303
|
//#region src/commands/eval/summary.ts
|
|
14304
|
+
function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
|
|
14305
|
+
if (wasAborted) {
|
|
14306
|
+
const idSuffix = writeToDatabase ? ` (ID: ${chalk.default.cyan(evalId)})` : "";
|
|
14307
|
+
return `${chalk.default.red("✗")} ${completionType} aborted${idSuffix}`;
|
|
14308
|
+
}
|
|
14309
|
+
if (writeToDatabase && shareableUrl) return `${chalk.default.green("✓")} ${completionType} complete: ${shareableUrl}`;
|
|
14310
|
+
if (writeToDatabase && activelySharing) return `${chalk.default.green("✓")} ${completionType} complete`;
|
|
14311
|
+
if (writeToDatabase) return `${chalk.default.green("✓")} ${completionType} complete (ID: ${chalk.default.cyan(evalId)})`;
|
|
14312
|
+
return `${chalk.default.green("✓")} ${completionType} complete`;
|
|
14313
|
+
}
|
|
14314
|
+
function getAbortSummaryLines(targetErrorStatus) {
|
|
14315
|
+
if (targetErrorStatus == null) return [];
|
|
14316
|
+
return [
|
|
14317
|
+
"",
|
|
14318
|
+
chalk.default.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
|
|
14319
|
+
chalk.default.red(` Target returned HTTP ${targetErrorStatus}`),
|
|
14320
|
+
"",
|
|
14321
|
+
chalk.default.yellow("Possible causes:"),
|
|
14322
|
+
chalk.default.yellow(" • Invalid API key or authentication (401/403)"),
|
|
14323
|
+
chalk.default.yellow(" • Target endpoint does not exist (404)"),
|
|
14324
|
+
chalk.default.yellow(" • Server does not support the request (501)"),
|
|
14325
|
+
"",
|
|
14326
|
+
chalk.default.cyan("To fix: Check your target configuration and credentials.")
|
|
14327
|
+
];
|
|
14328
|
+
}
|
|
14329
|
+
function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
|
|
14330
|
+
if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
|
|
14331
|
+
const lines = ["", `» View results: ${chalk.default.green.bold("promptfoo view")}`];
|
|
14332
|
+
if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
|
|
14333
|
+
lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
|
|
14334
|
+
return lines;
|
|
14335
|
+
}
|
|
14336
|
+
function buildUsageDetails(usage, total) {
|
|
14337
|
+
const parts = [];
|
|
14338
|
+
if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
|
|
14339
|
+
if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
|
|
14340
|
+
if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
|
|
14341
|
+
if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
14342
|
+
return parts;
|
|
14343
|
+
}
|
|
14344
|
+
function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
|
|
14345
|
+
const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
|
|
14346
|
+
const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
|
|
14347
|
+
if (!hasEvalTokens && !hasGradingTokens) return [];
|
|
14348
|
+
const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
|
|
14349
|
+
const evalTokens = {
|
|
14350
|
+
prompt: tokenUsage.prompt || 0,
|
|
14351
|
+
completion: tokenUsage.completion || 0,
|
|
14352
|
+
total: tokenUsage.total || combinedTotal,
|
|
14353
|
+
cached: tokenUsage.cached || 0,
|
|
14354
|
+
numRequests: tokenUsage.numRequests || 0,
|
|
14355
|
+
completionDetails: tokenUsage.completionDetails || {
|
|
14356
|
+
reasoning: 0,
|
|
14357
|
+
acceptedPrediction: 0,
|
|
14358
|
+
rejectedPrediction: 0
|
|
14359
|
+
}
|
|
14360
|
+
};
|
|
14361
|
+
const lines = [`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
|
|
14362
|
+
if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
|
|
14363
|
+
if (evalTokens.total > 0) {
|
|
14364
|
+
const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
|
|
14365
|
+
lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
|
|
14366
|
+
}
|
|
14367
|
+
if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
|
|
14368
|
+
const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
|
|
14369
|
+
lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
|
|
14370
|
+
}
|
|
14371
|
+
lines.push(...getProviderUsageLines(tracker));
|
|
14372
|
+
return lines;
|
|
14373
|
+
}
|
|
14374
|
+
function getProviderUsageLines(tracker) {
|
|
14375
|
+
const providerIds = tracker.getProviderIds();
|
|
14376
|
+
if (providerIds.length <= 1) return [];
|
|
14377
|
+
const sortedProviders = providerIds.map((id) => ({
|
|
14378
|
+
id,
|
|
14379
|
+
usage: tracker.getProviderUsage(id)
|
|
14380
|
+
})).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
|
|
14381
|
+
const lines = ["", chalk.default.bold("Providers:")];
|
|
14382
|
+
for (const { id, usage } of sortedProviders) {
|
|
14383
|
+
if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
|
|
14384
|
+
const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
|
|
14385
|
+
const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
|
|
14386
|
+
const details = buildUsageDetails(usage, displayTotal);
|
|
14387
|
+
const requestInfo = `${usage.numRequests || 0} requests`;
|
|
14388
|
+
const separator = details.length > 0 ? "; " : "";
|
|
14389
|
+
lines.push(` ${chalk.default.gray(`${displayId}:`)} ${chalk.default.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
|
|
14390
|
+
}
|
|
14391
|
+
return lines;
|
|
14392
|
+
}
|
|
14393
|
+
function formatResultPercentage(count, totalTests) {
|
|
14394
|
+
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
14395
|
+
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
14396
|
+
}
|
|
14397
|
+
function formatResultLine(count, label, icon, iconColor, totalTests) {
|
|
14398
|
+
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
|
|
14399
|
+
}
|
|
14400
|
+
function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
|
|
14401
|
+
const totalTests = successes + failures + errors;
|
|
14402
|
+
const errorLabel = errors === 1 ? "error" : "errors";
|
|
14403
|
+
return [
|
|
14404
|
+
"",
|
|
14405
|
+
chalk.default.bold("Results:"),
|
|
14406
|
+
formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green, totalTests),
|
|
14407
|
+
formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red, totalTests),
|
|
14408
|
+
formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red, totalTests),
|
|
14409
|
+
chalk.default.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
|
|
14410
|
+
""
|
|
14411
|
+
];
|
|
14412
|
+
}
|
|
13324
14413
|
/**
|
|
13325
14414
|
* Generate formatted evaluation summary output for CLI display.
|
|
13326
14415
|
*
|
|
@@ -13359,115 +14448,28 @@ function formatDuration(seconds) {
|
|
|
13359
14448
|
* ```
|
|
13360
14449
|
*/
|
|
13361
14450
|
function generateEvalSummary(params) {
|
|
13362
|
-
|
|
13363
|
-
|
|
13364
|
-
|
|
13365
|
-
|
|
13366
|
-
|
|
13367
|
-
|
|
13368
|
-
|
|
13369
|
-
|
|
13370
|
-
|
|
13371
|
-
|
|
13372
|
-
|
|
13373
|
-
|
|
13374
|
-
|
|
13375
|
-
|
|
13376
|
-
|
|
13377
|
-
|
|
13378
|
-
|
|
13379
|
-
|
|
13380
|
-
|
|
13381
|
-
|
|
13382
|
-
|
|
13383
|
-
|
|
13384
|
-
lines.push("");
|
|
13385
|
-
lines.push(chalk.default.cyan("To fix: Check your target configuration and credentials."));
|
|
13386
|
-
}
|
|
13387
|
-
if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
|
|
13388
|
-
lines.push("");
|
|
13389
|
-
lines.push(`» View results: ${chalk.default.green.bold("promptfoo view")}`);
|
|
13390
|
-
if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}`);
|
|
13391
|
-
else lines.push(`» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
|
|
13392
|
-
lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
|
|
13393
|
-
}
|
|
13394
|
-
lines.push("");
|
|
13395
|
-
const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
|
|
13396
|
-
const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
|
|
13397
|
-
if (hasEvalTokens || hasGradingTokens) {
|
|
13398
|
-
const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
|
|
13399
|
-
const evalTokens = {
|
|
13400
|
-
prompt: tokenUsage.prompt || 0,
|
|
13401
|
-
completion: tokenUsage.completion || 0,
|
|
13402
|
-
total: tokenUsage.total || combinedTotal,
|
|
13403
|
-
cached: tokenUsage.cached || 0,
|
|
13404
|
-
completionDetails: tokenUsage.completionDetails || {
|
|
13405
|
-
reasoning: 0,
|
|
13406
|
-
acceptedPrediction: 0,
|
|
13407
|
-
rejectedPrediction: 0
|
|
13408
|
-
}
|
|
13409
|
-
};
|
|
13410
|
-
const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
|
|
13411
|
-
lines.push(`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold(grandTotal.toLocaleString())}`);
|
|
13412
|
-
if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
|
|
13413
|
-
if (evalTokens.total > 0) {
|
|
13414
|
-
const evalParts = [];
|
|
13415
|
-
if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
|
|
13416
|
-
if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
|
|
13417
|
-
if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
|
|
13418
|
-
else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
|
|
13419
|
-
if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13420
|
-
lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
|
|
13421
|
-
}
|
|
13422
|
-
if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
|
|
13423
|
-
const gradingParts = [];
|
|
13424
|
-
if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
|
|
13425
|
-
if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
|
|
13426
|
-
if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
|
|
13427
|
-
else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
|
|
13428
|
-
if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13429
|
-
lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
|
|
13430
|
-
}
|
|
13431
|
-
const providerIds = tracker.getProviderIds();
|
|
13432
|
-
if (providerIds.length > 1) {
|
|
13433
|
-
lines.push("");
|
|
13434
|
-
lines.push(chalk.default.bold("Providers:"));
|
|
13435
|
-
const sortedProviders = providerIds.map((id) => ({
|
|
13436
|
-
id,
|
|
13437
|
-
usage: tracker.getProviderUsage(id)
|
|
13438
|
-
})).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
|
|
13439
|
-
for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
|
|
13440
|
-
const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
|
|
13441
|
-
const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
|
|
13442
|
-
const details = [];
|
|
13443
|
-
if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
|
|
13444
|
-
if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
|
|
13445
|
-
if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
|
|
13446
|
-
else details.push(`${usage.cached.toLocaleString()} cached`);
|
|
13447
|
-
if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13448
|
-
const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
|
|
13449
|
-
lines.push(` ${chalk.default.gray(displayId + ":")} ${chalk.default.white(displayTotal.toLocaleString())}${breakdown}`);
|
|
13450
|
-
}
|
|
13451
|
-
}
|
|
13452
|
-
}
|
|
13453
|
-
lines.push("");
|
|
13454
|
-
const totalTests = successes + failures + errors;
|
|
13455
|
-
const formatResultPercentage = (count) => {
|
|
13456
|
-
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
13457
|
-
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
13458
|
-
};
|
|
13459
|
-
const formatResultLine = (count, label, icon, iconColor) => {
|
|
13460
|
-
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
|
|
13461
|
-
};
|
|
13462
|
-
const errorLabel = errors === 1 ? "error" : "errors";
|
|
13463
|
-
lines.push(chalk.default.bold("Results:"));
|
|
13464
|
-
lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
|
|
13465
|
-
lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
|
|
13466
|
-
lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
|
|
13467
|
-
const durationDisplay = formatDuration(duration);
|
|
13468
|
-
lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
|
|
13469
|
-
lines.push("");
|
|
13470
|
-
return lines;
|
|
14451
|
+
return [
|
|
14452
|
+
getCompletionMessage({
|
|
14453
|
+
completionType: params.isRedteam ? "Red team" : "Eval",
|
|
14454
|
+
evalId: params.evalId,
|
|
14455
|
+
shareableUrl: params.shareableUrl,
|
|
14456
|
+
wasAborted: params.targetErrorStatus != null,
|
|
14457
|
+
writeToDatabase: params.writeToDatabase,
|
|
14458
|
+
activelySharing: params.activelySharing ?? false
|
|
14459
|
+
}),
|
|
14460
|
+
...getAbortSummaryLines(params.targetErrorStatus),
|
|
14461
|
+
...getGuidanceLines({
|
|
14462
|
+
writeToDatabase: params.writeToDatabase,
|
|
14463
|
+
shareableUrl: params.shareableUrl,
|
|
14464
|
+
wantsToShare: params.wantsToShare,
|
|
14465
|
+
activelySharing: params.activelySharing ?? false,
|
|
14466
|
+
hasExplicitDisable: params.hasExplicitDisable,
|
|
14467
|
+
cloudEnabled: params.cloudEnabled
|
|
14468
|
+
}),
|
|
14469
|
+
"",
|
|
14470
|
+
...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
|
|
14471
|
+
...getResultsLines(params)
|
|
14472
|
+
];
|
|
13471
14473
|
}
|
|
13472
14474
|
//#endregion
|
|
13473
14475
|
//#region src/commands/retry.ts
|
|
@@ -13625,7 +14627,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13625
14627
|
const cloudConfigId = uuidConfigArgs[0];
|
|
13626
14628
|
if (cmdObj.watch) throw new Error("--watch is not supported when using a cloud config UUID with -c. Use a local config file path for watch mode.");
|
|
13627
14629
|
try {
|
|
13628
|
-
defaultConfig = await
|
|
14630
|
+
defaultConfig = await require_storage.getEvalConfigFromCloud(cloudConfigId);
|
|
13629
14631
|
} catch (error) {
|
|
13630
14632
|
const reason = error instanceof Error ? error.message : String(error);
|
|
13631
14633
|
throw new Error(`Failed to load cloud eval config "${cloudConfigId}". ${reason}. Cloud UUID inputs do not fall back to local file paths. Check authentication and that the UUID exists.`);
|
|
@@ -13719,14 +14721,11 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13719
14721
|
require_logger.state.resume = true;
|
|
13720
14722
|
require_logger.state.retryMode = true;
|
|
13721
14723
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
13722
|
-
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
14724
|
+
if ((!cmdObj.envPath || cmdObj.envPath.length === 0) && commandLineOptions?.envPath) {
|
|
13723
14725
|
require_logger.logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
13724
14726
|
require_util.setupEnv(commandLineOptions.envPath);
|
|
13725
14727
|
}
|
|
13726
|
-
|
|
13727
|
-
Warning: Config file has a redteam section but no test cases.
|
|
13728
|
-
Did you mean to run ${chalk.default.bold("promptfoo redteam generate")} instead?
|
|
13729
|
-
`));
|
|
14728
|
+
warnIfRedteamConfigHasNoTests(config, testSuite);
|
|
13730
14729
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
13731
14730
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
13732
14731
|
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) require_telemetry.telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
@@ -13774,7 +14773,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13774
14773
|
};
|
|
13775
14774
|
testSuite.tests = await filterTests(testSuite, filterOptions);
|
|
13776
14775
|
}
|
|
13777
|
-
if (!
|
|
14776
|
+
if (!require_remoteGeneration.neverGenerateRemote() && config.redteam && config.redteam.plugins && config.redteam.plugins.length > 0 && testSuite.tests && testSuite.tests.length > 0) {
|
|
13778
14777
|
let hasValidEmail = false;
|
|
13779
14778
|
while (!hasValidEmail) {
|
|
13780
14779
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
@@ -13792,7 +14791,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13792
14791
|
process.exitCode = 1;
|
|
13793
14792
|
return new Eval({}, { persisted: false });
|
|
13794
14793
|
}
|
|
13795
|
-
await
|
|
14794
|
+
await require_storage.checkCloudPermissions(config);
|
|
13796
14795
|
const options = {
|
|
13797
14796
|
...evaluateOptions,
|
|
13798
14797
|
showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
|
|
@@ -13834,7 +14833,14 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13834
14833
|
${zod.z.prettifyError(testSuiteSchema.error)}
|
|
13835
14834
|
|
|
13836
14835
|
Please review your promptfooconfig.yaml configuration.`));
|
|
13837
|
-
const
|
|
14836
|
+
const author = require_accounts.getAuthor();
|
|
14837
|
+
const evalRecord = resumeEval ? resumeEval : cmdObj.write ? await Eval.create(config, testSuite.prompts, {
|
|
14838
|
+
author,
|
|
14839
|
+
runtimeOptions: options
|
|
14840
|
+
}) : new Eval(config, {
|
|
14841
|
+
author,
|
|
14842
|
+
runtimeOptions: options
|
|
14843
|
+
});
|
|
13838
14844
|
const abortController = new AbortController();
|
|
13839
14845
|
const previousAbortSignal = evaluateOptions.abortSignal;
|
|
13840
14846
|
evaluateOptions.abortSignal = previousAbortSignal ? AbortSignal.any([previousAbortSignal, abortController.signal]) : abortController.signal;
|
|
@@ -13945,7 +14951,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13945
14951
|
const paths = (Array.isArray(outputPath) ? outputPath : [outputPath]).filter((p) => typeof p === "string" && p.length > 0 && !p.endsWith(".jsonl"));
|
|
13946
14952
|
const isRedteam = Boolean(config.redteam);
|
|
13947
14953
|
const duration = Math.round((Date.now() - startTime) / 1e3);
|
|
13948
|
-
const tracker =
|
|
14954
|
+
const tracker = require_shared.TokenUsageTracker.getInstance();
|
|
13949
14955
|
const targetErrorStatus = await evalRecord.findTargetErrorStatus();
|
|
13950
14956
|
const summaryLines = generateEvalSummary({
|
|
13951
14957
|
evalId: evalRecord.id,
|
|
@@ -13975,7 +14981,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13975
14981
|
} else for (const line of summaryLines) require_logger.logger.info(line);
|
|
13976
14982
|
let shareableUrl = null;
|
|
13977
14983
|
if (sharePromise != null) {
|
|
13978
|
-
const orgContext = await
|
|
14984
|
+
const orgContext = await require_storage.getOrgContext();
|
|
13979
14985
|
const orgSuffix = orgContext ? ` to ${orgContext.organizationName}${orgContext.teamName ? ` > ${orgContext.teamName}` : ""}` : "";
|
|
13980
14986
|
if (process.stdout.isTTY && !require_logger.isCI()) {
|
|
13981
14987
|
const spinner = (0, ora.default)({
|
|
@@ -14150,7 +15156,7 @@ async function doRedteamRun(options) {
|
|
|
14150
15156
|
redteamPath = path.join(configDir, "redteam.yaml");
|
|
14151
15157
|
}
|
|
14152
15158
|
try {
|
|
14153
|
-
const healthUrl =
|
|
15159
|
+
const healthUrl = require_remoteGeneration.getRemoteHealthUrl();
|
|
14154
15160
|
if (healthUrl) {
|
|
14155
15161
|
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
14156
15162
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
@@ -14236,65 +15242,175 @@ async function doRedteamRun(options) {
|
|
|
14236
15242
|
return evalResult;
|
|
14237
15243
|
}
|
|
14238
15244
|
//#endregion
|
|
15245
|
+
//#region src/types/transform.ts
|
|
15246
|
+
/** Runtime type guard for `TransformFunction` values. */
|
|
15247
|
+
function isTransformFunction(value) {
|
|
15248
|
+
return typeof value === "function";
|
|
15249
|
+
}
|
|
15250
|
+
//#endregion
|
|
14239
15251
|
//#region src/index.ts
|
|
15252
|
+
/**
|
|
15253
|
+
* Shallow-clone a test case so the caller can swap in resolved ApiProvider
|
|
15254
|
+
* instances on `options.provider` / `assert[].provider` without leaking those
|
|
15255
|
+
* mutations back to the input. The input may alias the unified config written
|
|
15256
|
+
* to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
|
|
15257
|
+
* Anthropic's client) holds circular references that break drizzle's JSON
|
|
15258
|
+
* serialization on `evalRecord.save()`. Fixes #8687.
|
|
15259
|
+
*
|
|
15260
|
+
* Detaches only `options` and `assert[]`. Other reference fields (`provider`,
|
|
15261
|
+
* `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
|
|
15262
|
+
* those by reference rather than mutating in place. `assert-set` children are
|
|
15263
|
+
* not deep-cloned because the resolve loop skips `assert-set`; if that ever
|
|
15264
|
+
* changes, extend this helper.
|
|
15265
|
+
*/
|
|
15266
|
+
function cloneTestForResolve(test) {
|
|
15267
|
+
const cloned = { ...test };
|
|
15268
|
+
if (test.options) cloned.options = { ...test.options };
|
|
15269
|
+
if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
|
|
15270
|
+
return cloned;
|
|
15271
|
+
}
|
|
15272
|
+
function toSerializableProviderRef(provider) {
|
|
15273
|
+
if (require_types.isApiProvider(provider)) return require_evalResult.sanitizeProvider(provider);
|
|
15274
|
+
if (Array.isArray(provider)) return provider.map(toSerializableProviderRef);
|
|
15275
|
+
return provider;
|
|
15276
|
+
}
|
|
15277
|
+
function isRecord(value) {
|
|
15278
|
+
return Boolean(value && typeof value === "object" && !Array.isArray(value));
|
|
15279
|
+
}
|
|
15280
|
+
function withSerializableProvider(record) {
|
|
15281
|
+
if (!require_types.isApiProvider(record.provider)) return record;
|
|
15282
|
+
return {
|
|
15283
|
+
...record,
|
|
15284
|
+
provider: require_evalResult.sanitizeProvider(record.provider)
|
|
15285
|
+
};
|
|
15286
|
+
}
|
|
15287
|
+
/**
|
|
15288
|
+
* Function-valued transforms are first-class at runtime but are silently dropped
|
|
15289
|
+
* by `JSON.stringify`. Persisted eval configs (drizzle-stored) must never retain
|
|
15290
|
+
* a function reference, so replace every `transform`-like field with a
|
|
15291
|
+
* `[inline function]: name` marker. Non-function values pass through unchanged.
|
|
15292
|
+
*
|
|
15293
|
+
* `droppedRef.value` is flipped to `true` the first time a function is replaced
|
|
15294
|
+
* so the caller can emit a single warning instead of logging per field.
|
|
15295
|
+
*/
|
|
15296
|
+
function replaceFunctionTransforms(record, droppedRef) {
|
|
15297
|
+
let result;
|
|
15298
|
+
for (const key of require_transform$1.TRANSFORM_KEYS) {
|
|
15299
|
+
const value = record[key];
|
|
15300
|
+
if (!isTransformFunction(value)) continue;
|
|
15301
|
+
if (!result) result = { ...record };
|
|
15302
|
+
result[key] = value.name ? `${require_transform$1.INLINE_FUNCTION_LABEL}: ${value.name}` : require_transform$1.INLINE_FUNCTION_LABEL;
|
|
15303
|
+
droppedRef.value = true;
|
|
15304
|
+
}
|
|
15305
|
+
return result ?? record;
|
|
15306
|
+
}
|
|
15307
|
+
/**
 * Build a persistable copy of a single assertion.
 *
 * Non-record inputs are returned as-is. For records, the `provider` field is
 * passed through `withSerializableProvider`, function-valued transform fields
 * are replaced via `replaceFunctionTransforms`, and, when the assertion has an
 * array-valued `assert` property, each nested entry is processed recursively.
 *
 * @param {*} assertion - Assertion object, or any other value.
 * @param {{ value: boolean }} droppedRef - Shared flag forwarded to
 *   `replaceFunctionTransforms` and to the recursive calls.
 * @returns {*} The sanitized assertion, or the input when it is not a record.
 */
function toSerializableAssertion(assertion, droppedRef) {
	if (!isRecord(assertion)) {
		return assertion;
	}
	const base = replaceFunctionTransforms(withSerializableProvider(assertion), droppedRef);
	if (!Array.isArray(assertion.assert)) {
		return base;
	}
	// Nested entries are read from the ORIGINAL assertion, not from `base`.
	return {
		...base,
		assert: assertion.assert.map((child) => toSerializableAssertion(child, droppedRef))
	};
}
|
|
15317
|
+
/**
 * Build a persistable copy of a test case.
 *
 * Non-record inputs are returned unchanged. For records:
 * - the top-level `provider` goes through `withSerializableProvider`;
 * - when `options` is a record, its `provider` is sanitized and its
 *   function-valued transform fields are replaced; the new `options` is
 *   attached only if it differs by identity from the original;
 * - when `assert` is an array, every entry is mapped through
 *   `toSerializableAssertion`.
 *
 * @param {*} test - Test case object, or any other value.
 * @param {{ value: boolean }} droppedRef - Shared flag forwarded to the
 *   transform-replacement helpers.
 * @returns {*} The sanitized test case, or the input when it is not a record.
 */
function toSerializableTestCase(test, droppedRef) {
	if (!isRecord(test)) {
		return test;
	}
	const base = withSerializableProvider(test);
	// Collect replaced fields, then apply them in one spread at the end.
	const patch = {};
	if (isRecord(test.options)) {
		const options = replaceFunctionTransforms(withSerializableProvider(test.options), droppedRef);
		if (options !== test.options) {
			patch.options = options;
		}
	}
	if (Array.isArray(test.assert)) {
		patch.assert = test.assert.map((entry) => toSerializableAssertion(entry, droppedRef));
	}
	return Object.keys(patch).length > 0 ? { ...base, ...patch } : base;
}
|
|
15334
|
+
/**
 * Build a persistable copy of a scenario.
 *
 * A scenario is rewritten only when it is a record with an array-valued
 * `tests` property; each of those tests is mapped through
 * `toSerializableTestCase`. Every other input is returned unchanged.
 *
 * @param {*} scenario - Scenario object, or any other value.
 * @param {{ value: boolean }} droppedRef - Shared flag forwarded to
 *   `toSerializableTestCase`.
 * @returns {*} A shallow copy with sanitized `tests`, or the input itself.
 */
function toSerializableScenario(scenario, droppedRef) {
	const hasTestList = isRecord(scenario) && Array.isArray(scenario.tests);
	if (!hasTestList) {
		return scenario;
	}
	return {
		...scenario,
		tests: scenario.tests.map((testCase) => toSerializableTestCase(testCase, droppedRef))
	};
}
|
|
15342
|
+
/**
 * Assemble the config object that is stored alongside an eval.
 *
 * The result is a shallow copy of `testSuite` in which:
 * - `providers` is passed through `toSerializableProviderRef`;
 * - `defaultTest` is passed through `toSerializableTestCase`;
 * - `tests` and `scenarios` are mapped item by item when they are arrays and
 *   kept as-is otherwise;
 * - `prompts` is replaced by the `prompts` argument.
 *
 * If any function-valued transform was replaced by a marker AND
 * `testSuite.writeLatestResults` is truthy, a single warning is logged.
 *
 * @param {object} testSuite - Source suite/config object.
 * @param {*} prompts - Value stored under `prompts` in the result.
 * @returns {object} The persistable config.
 */
function createSerializableUnifiedConfig(testSuite, prompts) {
	// One shared flag so the warning is emitted at most once per call.
	const droppedRef = { value: false };
	const mapWhenArray = (items, convert) =>
		Array.isArray(items) ? items.map((item) => convert(item, droppedRef)) : items;

	// Computed in the same order as the fields appear in the result.
	const providers = toSerializableProviderRef(testSuite.providers);
	const defaultTest = toSerializableTestCase(testSuite.defaultTest, droppedRef);
	const tests = mapWhenArray(testSuite.tests, toSerializableTestCase);
	const scenarios = mapWhenArray(testSuite.scenarios, toSerializableScenario);

	const config = {
		...testSuite,
		providers,
		defaultTest,
		tests,
		scenarios,
		prompts
	};

	const shouldWarn = droppedRef.value && testSuite.writeLatestResults;
	if (shouldWarn) {
		require_logger.logger.warn("Function-valued transform(s) in testSuite were replaced with \"[inline function]\" markers in the persisted config. Re-running the saved eval will not invoke them; use string expressions or file:// references if you need the config to round-trip.");
	}
	return config;
}
|
|
14240
15355
|
async function evaluate(testSuite, options = {}) {
|
|
14241
|
-
|
|
14242
|
-
|
|
15356
|
+
const { author: suiteAuthor, ...testSuiteConfig } = testSuite;
|
|
15357
|
+
if (testSuiteConfig.writeLatestResults) await runDbMigrations();
|
|
15358
|
+
const loadedProviders = await require_providers.loadApiProviders(testSuiteConfig.providers, { env: testSuiteConfig.env });
|
|
14243
15359
|
const providerMap = {};
|
|
14244
15360
|
for (const p of loadedProviders) {
|
|
14245
15361
|
providerMap[p.id()] = p;
|
|
14246
15362
|
if (p.label) providerMap[p.label] = p;
|
|
14247
15363
|
}
|
|
14248
|
-
let resolvedDefaultTest =
|
|
14249
|
-
if (typeof
|
|
15364
|
+
let resolvedDefaultTest = testSuiteConfig.defaultTest;
|
|
15365
|
+
if (typeof testSuiteConfig.defaultTest === "string" && testSuiteConfig.defaultTest.startsWith("file://")) resolvedDefaultTest = await require_util.maybeLoadFromExternalFile(testSuiteConfig.defaultTest);
|
|
14250
15366
|
const constructedTestSuite = {
|
|
14251
|
-
...
|
|
15367
|
+
...testSuiteConfig,
|
|
14252
15368
|
defaultTest: resolvedDefaultTest,
|
|
14253
|
-
scenarios:
|
|
15369
|
+
scenarios: testSuiteConfig.scenarios,
|
|
14254
15370
|
providers: loadedProviders,
|
|
14255
|
-
tests: await readTests(
|
|
14256
|
-
nunjucksFilters: await require_util.readFilters(
|
|
14257
|
-
prompts: await require_graders.processPrompts(
|
|
14258
|
-
};
|
|
14259
|
-
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
14260
|
-
|
|
14261
|
-
|
|
15371
|
+
tests: await readTests(testSuiteConfig.tests),
|
|
15372
|
+
nunjucksFilters: await require_util.readFilters(testSuiteConfig.nunjucksFilters || {}),
|
|
15373
|
+
prompts: await require_graders.processPrompts(testSuiteConfig.prompts)
|
|
15374
|
+
};
|
|
15375
|
+
if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
|
|
15376
|
+
constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
|
|
15377
|
+
if (constructedTestSuite.defaultTest.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
15378
|
+
env: testSuiteConfig.env,
|
|
14262
15379
|
basePath: require_logger.state.basePath
|
|
14263
15380
|
});
|
|
14264
|
-
if (constructedTestSuite.defaultTest
|
|
14265
|
-
env:
|
|
15381
|
+
if (constructedTestSuite.defaultTest.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
15382
|
+
env: testSuiteConfig.env,
|
|
14266
15383
|
basePath: require_logger.state.basePath
|
|
14267
15384
|
});
|
|
14268
15385
|
}
|
|
14269
|
-
|
|
15386
|
+
constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
|
|
15387
|
+
for (const test of constructedTestSuite.tests) {
|
|
14270
15388
|
if (test.options?.provider && !require_types.isApiProvider(test.options.provider)) test.options.provider = await require_providers.resolveProvider(test.options.provider, providerMap, {
|
|
14271
|
-
env:
|
|
15389
|
+
env: testSuiteConfig.env,
|
|
14272
15390
|
basePath: require_logger.state.basePath
|
|
14273
15391
|
});
|
|
14274
|
-
|
|
15392
|
+
for (const assertion of test.assert || []) {
|
|
14275
15393
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
14276
15394
|
if (assertion.provider && !require_types.isApiProvider(assertion.provider)) assertion.provider = await require_providers.resolveProvider(assertion.provider, providerMap, {
|
|
14277
|
-
env:
|
|
15395
|
+
env: testSuiteConfig.env,
|
|
14278
15396
|
basePath: require_logger.state.basePath
|
|
14279
15397
|
});
|
|
14280
15398
|
}
|
|
14281
15399
|
}
|
|
14282
15400
|
if (options.cache === false) require_cache.disableCache();
|
|
14283
|
-
const parsedProviderPromptMap = require_graders.readProviderPromptMap(
|
|
14284
|
-
const unifiedConfig =
|
|
14285
|
-
|
|
14286
|
-
|
|
14287
|
-
};
|
|
14288
|
-
const evalRecord = testSuite.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts) : new Eval(unifiedConfig);
|
|
15401
|
+
const parsedProviderPromptMap = require_graders.readProviderPromptMap(testSuiteConfig, constructedTestSuite.prompts);
|
|
15402
|
+
const unifiedConfig = createSerializableUnifiedConfig(testSuiteConfig, constructedTestSuite.prompts);
|
|
15403
|
+
const author = require_accounts.getAuthor(suiteAuthor);
|
|
15404
|
+
const evalRecord = testSuiteConfig.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts, { author }) : new Eval(unifiedConfig, { author });
|
|
14289
15405
|
const ret = await evaluate$1({
|
|
14290
15406
|
...constructedTestSuite,
|
|
14291
15407
|
providerPromptMap: parsedProviderPromptMap
|
|
14292
15408
|
}, evalRecord, {
|
|
14293
15409
|
eventSource: "library",
|
|
14294
|
-
isRedteam: Boolean(
|
|
15410
|
+
isRedteam: Boolean(testSuiteConfig.redteam),
|
|
14295
15411
|
...options
|
|
14296
15412
|
});
|
|
14297
|
-
if (
|
|
15413
|
+
if (testSuiteConfig.writeLatestResults && testSuiteConfig.sharing) if (isSharingEnabled(ret)) try {
|
|
14298
15414
|
const shareableUrl = await createShareableUrl(ret, { silent: true });
|
|
14299
15415
|
if (shareableUrl) {
|
|
14300
15416
|
ret.shareableUrl = shareableUrl;
|
|
@@ -14305,9 +15421,9 @@ async function evaluate(testSuite, options = {}) {
|
|
|
14305
15421
|
require_logger.logger.warn(`Failed to create shareable URL: ${error}`);
|
|
14306
15422
|
}
|
|
14307
15423
|
else require_logger.logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
14308
|
-
if (
|
|
14309
|
-
if (typeof
|
|
14310
|
-
else if (Array.isArray(
|
|
15424
|
+
if (testSuiteConfig.outputPath) {
|
|
15425
|
+
if (typeof testSuiteConfig.outputPath === "string") await require_util.writeOutput(testSuiteConfig.outputPath, evalRecord, null);
|
|
15426
|
+
else if (Array.isArray(testSuiteConfig.outputPath)) await require_util.writeMultipleOutputs(testSuiteConfig.outputPath, evalRecord, null);
|
|
14311
15427
|
}
|
|
14312
15428
|
return ret;
|
|
14313
15429
|
}
|
|
@@ -14319,7 +15435,7 @@ const redteam = {
|
|
|
14319
15435
|
},
|
|
14320
15436
|
Graders: require_graders.GRADERS,
|
|
14321
15437
|
Plugins,
|
|
14322
|
-
Strategies:
|
|
15438
|
+
Strategies: require_strategies.Strategies,
|
|
14323
15439
|
Base: {
|
|
14324
15440
|
Plugin: require_graders.RedteamPluginBase,
|
|
14325
15441
|
Grader: require_graders.RedteamGraderBase
|
|
@@ -14348,9 +15464,18 @@ exports.CompletedPromptSchema = require_types.CompletedPromptSchema;
|
|
|
14348
15464
|
exports.CompletionTokenDetailsSchema = require_types.CompletionTokenDetailsSchema;
|
|
14349
15465
|
exports.ConversationMessageSchema = require_types.ConversationMessageSchema;
|
|
14350
15466
|
exports.DerivedMetricSchema = require_types.DerivedMetricSchema;
|
|
15467
|
+
exports.DocumentMediaInjectionPlacementSchema = require_types.DocumentMediaInjectionPlacementSchema;
|
|
15468
|
+
exports.DocumentMediaInjectionPlacementValues = require_types.DocumentMediaInjectionPlacementValues;
|
|
15469
|
+
exports.DocxInjectionPlacementSchema = require_types.DocxInjectionPlacementSchema;
|
|
15470
|
+
exports.DocxInjectionPlacementValues = require_types.DocxInjectionPlacementValues;
|
|
14351
15471
|
exports.EvalResultsFilterMode = require_types.EvalResultsFilterMode;
|
|
14352
15472
|
exports.EvaluateOptionsSchema = require_types.EvaluateOptionsSchema;
|
|
14353
15473
|
exports.GradingConfigSchema = require_types.GradingConfigSchema;
|
|
15474
|
+
exports.InputConfigSchema = require_types.InputConfigSchema;
|
|
15475
|
+
exports.InputDefinitionObjectSchema = require_types.InputDefinitionObjectSchema;
|
|
15476
|
+
exports.InputDefinitionSchema = require_types.InputDefinitionSchema;
|
|
15477
|
+
exports.InputTypeSchema = require_types.InputTypeSchema;
|
|
15478
|
+
exports.InputTypeValues = require_types.InputTypeValues;
|
|
14354
15479
|
exports.InputsSchema = require_types.InputsSchema;
|
|
14355
15480
|
exports.NotPrefixedAssertionTypesSchema = require_types.NotPrefixedAssertionTypesSchema;
|
|
14356
15481
|
exports.OutputConfigSchema = require_types.OutputConfigSchema;
|
|
@@ -14373,6 +15498,7 @@ exports.TestSuiteSchema = require_types.TestSuiteSchema;
|
|
|
14373
15498
|
exports.UnifiedConfigSchema = require_types.UnifiedConfigSchema;
|
|
14374
15499
|
exports.VarsSchema = require_types.VarsSchema;
|
|
14375
15500
|
exports.assertions = assertions_default;
|
|
15501
|
+
exports.buildInputPromptDescription = require_types.buildInputPromptDescription;
|
|
14376
15502
|
Object.defineProperty(exports, "cache", {
|
|
14377
15503
|
enumerable: true,
|
|
14378
15504
|
get: function() {
|
|
@@ -14382,12 +15508,17 @@ Object.defineProperty(exports, "cache", {
|
|
|
14382
15508
|
exports.default = src_default;
|
|
14383
15509
|
exports.evaluate = evaluate;
|
|
14384
15510
|
exports.generateTable = generateTable;
|
|
15511
|
+
exports.getInputDescription = require_types.getInputDescription;
|
|
15512
|
+
exports.getInputType = require_types.getInputType;
|
|
14385
15513
|
exports.guardrails = guardrails;
|
|
14386
15514
|
exports.isApiProvider = require_types.isApiProvider;
|
|
14387
15515
|
exports.isGradingResult = require_types.isGradingResult;
|
|
14388
15516
|
exports.isProviderOptions = require_types.isProviderOptions;
|
|
14389
15517
|
exports.isResultFailureReason = require_types.isResultFailureReason;
|
|
15518
|
+
exports.isTransformFunction = isTransformFunction;
|
|
14390
15519
|
exports.loadApiProvider = require_providers.loadApiProvider;
|
|
15520
|
+
exports.normalizeInputDefinition = require_types.normalizeInputDefinition;
|
|
15521
|
+
exports.normalizeInputs = require_types.normalizeInputs;
|
|
14391
15522
|
exports.redteam = redteam;
|
|
14392
15523
|
|
|
14393
15524
|
//# sourceMappingURL=index.cjs.map
|