promptfoo 0.121.4 → 0.121.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
- package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
- package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
- package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
- package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
- package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
- package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
- package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
- package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
- package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
- package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
- package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
- package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
- package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
- package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
- package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
- package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
- package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
- package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
- package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
- package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
- package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
- package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
- package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
- package/dist/src/app/index.html +27 -2
- package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
- package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
- package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
- package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
- package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
- package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
- package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
- package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
- package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
- package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
- package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
- package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
- package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
- package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
- package/dist/src/cache-BlC6aeJ0.js +3 -0
- package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
- package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
- package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
- package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
- package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
- package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
- package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
- package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
- package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
- package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
- package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
- package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
- package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
- package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
- package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
- package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
- package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
- package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
- package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
- package/dist/src/cloud-CoD5OacT.js +3 -0
- package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
- package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
- package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
- package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
- package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
- package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
- package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
- package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
- package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
- package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
- package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
- package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
- package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
- package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
- package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
- package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
- package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
- package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
- package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
- package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
- package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
- package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
- package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
- package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
- package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
- package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
- package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
- package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
- package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
- package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
- package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
- package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
- package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
- package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
- package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
- package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
- package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
- package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
- package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
- package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
- package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
- package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
- package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
- package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
- package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
- package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
- package/dist/src/evalResult-BBK58h2B.js +3 -0
- package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
- package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
- package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
- package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
- package/dist/src/evaluator-DgLKaZk8.js +3 -0
- package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
- package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
- package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
- package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
- package/dist/src/fetch-8viavNv8.js +3 -0
- package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
- package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
- package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
- package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
- package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
- package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
- package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
- package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
- package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
- package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
- package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
- package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
- package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
- package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
- package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
- package/dist/src/graders-C84JI-m5.js +2 -0
- package/dist/src/graders-CBbd0K0Q.cjs +2 -0
- package/dist/src/graders-CbQqpHSN.js +3 -0
- package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
- package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
- package/dist/src/graders-DS42d3ZG.js +2 -0
- package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
- package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
- package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
- package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
- package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
- package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
- package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
- package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
- package/dist/src/index.cjs +1052 -296
- package/dist/src/index.d.cts +124 -13
- package/dist/src/index.d.ts +125 -14
- package/dist/src/index.js +1018 -262
- package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
- package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
- package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
- package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
- package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
- package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
- package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
- package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
- package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
- package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
- package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
- package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
- package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
- package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
- package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
- package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
- package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
- package/dist/src/main.js +242 -198
- package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
- package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
- package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
- package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
- package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
- package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
- package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
- package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
- package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
- package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
- package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
- package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
- package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
- package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
- package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
- package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
- package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
- package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
- package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
- package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
- package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
- package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
- package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
- package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
- package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
- package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
- package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
- package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
- package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
- package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
- package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
- package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
- package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
- package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
- package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
- package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
- package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
- package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
- package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
- package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
- package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
- package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
- package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
- package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
- package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
- package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
- package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
- package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
- package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
- package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
- package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
- package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
- package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
- package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
- package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
- package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
- package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
- package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
- package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
- package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
- package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
- package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
- package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
- package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
- package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
- package/dist/src/rubyUtils-4hjGxvju.js +3 -0
- package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
- package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
- package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
- package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
- package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
- package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
- package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
- package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
- package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
- package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
- package/dist/src/server/index.js +1067 -265
- package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
- package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
- package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
- package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
- package/dist/src/server-C_15p79-.js +3 -0
- package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
- package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
- package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
- package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
- package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
- package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
- package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
- package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
- package/dist/src/store-BELqNwvz.js +3 -0
- package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
- package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
- package/dist/src/store-uQZ4AjPe.cjs +2 -0
- package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
- package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
- package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
- package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
- package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
- package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
- package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
- package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
- package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
- package/dist/src/telemetry-ZdPZc0fm.js +3 -0
- package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
- package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
- package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
- package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
- package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
- package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
- package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
- package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
- package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
- package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
- package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
- package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
- package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
- package/dist/src/transform-BQt0BeAW.js +3 -0
- package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
- package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
- package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
- package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
- package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
- package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
- package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
- package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
- package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
- package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
- package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
- package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
- package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
- package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
- package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
- package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
- package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
- package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
- package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
- package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
- package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
- package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
- package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
- package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
- package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
- package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
- package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
- package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
- package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
- package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
- package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
- package/dist/src/version-0frU0UTr.js +16 -0
- package/dist/src/version-CbpiUINz.js +17 -0
- package/dist/src/version-CbuBKu2U.js +16 -0
- package/dist/src/version-D9zu9FWB.cjs +27 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +22 -20
- package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
- package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
- package/dist/src/cache-Cr9oLMUa.js +0 -3
- package/dist/src/cloud-Hphvo8kr.js +0 -3
- package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
- package/dist/src/evalResult-D8MT9p0s.js +0 -3
- package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
- package/dist/src/evaluator-CVessDWe.js +0 -3
- package/dist/src/fetch-C7bGKDlQ.js +0 -3
- package/dist/src/graders-BOAzQEUe.cjs +0 -2
- package/dist/src/graders-D4BTsZdG2.js +0 -3
- package/dist/src/graders-DOJK1XpV.js +0 -2
- package/dist/src/graders-NAv9LcBn.js +0 -2
- package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
- package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
- package/dist/src/server-DCtHUqlp.js +0 -3
- package/dist/src/store-CWOSz6D_.cjs +0 -2
- package/dist/src/store-DCDBhv7B.js +0 -3
- package/dist/src/telemetry-C1IqxcdW.js +0 -3
- package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
- package/dist/src/transform-M6ITAESf.js +0 -3
- /package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
package/dist/src/index.cjs
CHANGED
|
@@ -4,34 +4,35 @@ Object.defineProperties(exports, {
|
|
|
4
4
|
});
|
|
5
5
|
const require_logger = require("./logger-COuQb2xB.cjs");
|
|
6
6
|
const require_invariant = require("./invariant-kfQ8Bu82.cjs");
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
-
const
|
|
19
|
-
const
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
const
|
|
23
|
-
const
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
const
|
|
29
|
-
const
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
const
|
|
33
|
-
const
|
|
34
|
-
const
|
|
7
|
+
const require_fetch = require("./fetch-NuqXW1Xb.cjs");
|
|
8
|
+
const require_version = require("./version-D9zu9FWB.cjs");
|
|
9
|
+
const require_types = require("./types-CgG2rKiW.cjs");
|
|
10
|
+
const require_accounts = require("./accounts-BIFntVWB.cjs");
|
|
11
|
+
const require_esm = require("./esm-B_rGuPTo.cjs");
|
|
12
|
+
const require_render = require("./render-tG6ir9_g.cjs");
|
|
13
|
+
const require_providerRegistry = require("./providerRegistry-BESeALrr.cjs");
|
|
14
|
+
const require_server = require("./server-BEECpeGG.cjs");
|
|
15
|
+
const require_providers = require("./providers-eDShy16E.cjs");
|
|
16
|
+
const require_pythonUtils = require("./pythonUtils-CoLaCwNY.cjs");
|
|
17
|
+
const require_fileExtensions = require("./fileExtensions-D9h-8Wxg.cjs");
|
|
18
|
+
const require_util = require("./util-DvpHnLt0.cjs");
|
|
19
|
+
const require_tokenUsageUtils = require("./tokenUsageUtils-C9odhsbW.cjs");
|
|
20
|
+
const require_blobs = require("./blobs-B95F_7vE.cjs");
|
|
21
|
+
const require_tables = require("./tables-CXbaZ9y1.cjs");
|
|
22
|
+
const require_extractor = require("./extractor-Dxr2J_wK.cjs");
|
|
23
|
+
const require_cache = require("./cache-DGg-yTZG.cjs");
|
|
24
|
+
const require_chat = require("./chat-vYqqv1gP.cjs");
|
|
25
|
+
const require_transform = require("./transform-Dg4LcO1Y.cjs");
|
|
26
|
+
const require_util$1 = require("./util-A5_ZsQUn.cjs");
|
|
27
|
+
const require_transform$1 = require("./transform-C9izGX54.cjs");
|
|
28
|
+
const require_telemetry = require("./telemetry-CEQxGnMZ.cjs");
|
|
29
|
+
const require_text = require("./text-BiNME7QG.cjs");
|
|
30
|
+
const require_store = require("./store-2K0kDi80.cjs");
|
|
31
|
+
const require_createHash = require("./createHash-VvBIc-AW.cjs");
|
|
32
|
+
const require_rubyUtils = require("./rubyUtils-DoifqkiA.cjs");
|
|
33
|
+
const require_graders = require("./graders-Bw1wk_21.cjs");
|
|
34
|
+
const require_utils = require("./utils-DkVeShIB.cjs");
|
|
35
|
+
const require_evalResult = require("./evalResult-Cx-8OWkb.cjs");
|
|
35
36
|
let fs = require("fs");
|
|
36
37
|
fs = require_logger.__toESM(fs);
|
|
37
38
|
let path = require("path");
|
|
@@ -41,34 +42,34 @@ async = require_logger.__toESM(async);
|
|
|
41
42
|
let js_yaml = require("js-yaml");
|
|
42
43
|
js_yaml = require_logger.__toESM(js_yaml);
|
|
43
44
|
let node_async_hooks = require("node:async_hooks");
|
|
44
|
-
require("node:path");
|
|
45
|
-
require("node:url");
|
|
46
45
|
let chalk = require("chalk");
|
|
47
46
|
chalk = require_logger.__toESM(chalk);
|
|
48
47
|
let os = require("os");
|
|
49
48
|
os = require_logger.__toESM(os);
|
|
50
|
-
let util = require("util");
|
|
51
|
-
util = require_logger.__toESM(util);
|
|
52
49
|
let dedent = require("dedent");
|
|
53
50
|
dedent = require_logger.__toESM(dedent);
|
|
51
|
+
let zod = require("zod");
|
|
54
52
|
let fs_promises = require("fs/promises");
|
|
55
53
|
fs_promises = require_logger.__toESM(fs_promises);
|
|
56
|
-
let
|
|
57
|
-
|
|
58
|
-
let
|
|
59
|
-
|
|
54
|
+
let util = require("util");
|
|
55
|
+
util = require_logger.__toESM(util);
|
|
56
|
+
let _inquirer_input = require("@inquirer/input");
|
|
57
|
+
_inquirer_input = require_logger.__toESM(_inquirer_input);
|
|
58
|
+
require("node:path");
|
|
59
|
+
require("node:url");
|
|
60
60
|
let crypto$1 = require("crypto");
|
|
61
61
|
crypto$1 = require_logger.__toESM(crypto$1);
|
|
62
62
|
let _opentelemetry_api = require("@opentelemetry/api");
|
|
63
|
-
let _inquirer_input = require("@inquirer/input");
|
|
64
|
-
_inquirer_input = require_logger.__toESM(_inquirer_input);
|
|
65
63
|
let readline = require("readline");
|
|
66
64
|
readline = require_logger.__toESM(readline);
|
|
65
|
+
let csv_parse_sync = require("csv-parse/sync");
|
|
66
|
+
let glob = require("glob");
|
|
67
67
|
let drizzle_orm = require("drizzle-orm");
|
|
68
|
+
let fast_xml_parser = require("fast-xml-parser");
|
|
68
69
|
let cli_progress = require("cli-progress");
|
|
69
70
|
cli_progress = require_logger.__toESM(cli_progress);
|
|
70
71
|
let url = require("url");
|
|
71
|
-
let
|
|
72
|
+
let parse5 = require("parse5");
|
|
72
73
|
let fastest_levenshtein = require("fastest-levenshtein");
|
|
73
74
|
let js_rouge = require("js-rouge");
|
|
74
75
|
js_rouge = require_logger.__toESM(js_rouge);
|
|
@@ -262,6 +263,502 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
262
263
|
};
|
|
263
264
|
};
|
|
264
265
|
//#endregion
|
|
266
|
+
//#region src/matchers/classification.ts
|
|
267
|
+
/**
|
|
268
|
+
*
|
|
269
|
+
* @param expected Expected classification. If undefined, matches any classification.
|
|
270
|
+
* @param output Text to classify.
|
|
271
|
+
* @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
|
|
272
|
+
* @param grading
|
|
273
|
+
* @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
|
|
274
|
+
*/
|
|
275
|
+
async function matchesClassification(expected, output, threshold, grading) {
|
|
276
|
+
const resp = await (await require_graders.getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
|
|
277
|
+
if (!resp.classification) return require_graders.fail(resp.error || "Unknown error fetching classification");
|
|
278
|
+
let score;
|
|
279
|
+
if (expected === void 0) {
|
|
280
|
+
const scores = Object.values(resp.classification);
|
|
281
|
+
if (scores.length === 0) return {
|
|
282
|
+
pass: false,
|
|
283
|
+
score: 0,
|
|
284
|
+
reason: "No classification scores returned"
|
|
285
|
+
};
|
|
286
|
+
score = Math.max(...scores);
|
|
287
|
+
} else score = resp.classification[expected] || 0;
|
|
288
|
+
if (score >= threshold - Number.EPSILON) {
|
|
289
|
+
const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
|
|
290
|
+
return {
|
|
291
|
+
pass: true,
|
|
292
|
+
score,
|
|
293
|
+
reason
|
|
294
|
+
};
|
|
295
|
+
}
|
|
296
|
+
return {
|
|
297
|
+
pass: false,
|
|
298
|
+
score,
|
|
299
|
+
reason: expected === void 0 ? `Maximum classification score ${score.toFixed(2)} < ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
|
|
300
|
+
};
|
|
301
|
+
}
|
|
302
|
+
//#endregion
|
|
303
|
+
//#region src/matchers/comparison.ts
|
|
304
|
+
/**
 * Asks a text grading provider to pick the best of several outputs for the
 * given criteria and turns its verdict into one grading result per output.
 *
 * The verdict is the first run of digits in the provider's reply, used as an
 * index into `outputs`.
 *
 * @param criteria - Criteria passed to the rubric prompt and echoed in the reasons.
 * @param outputs - Candidate outputs; at least two are required.
 * @param grading - Optional grading config (`provider`, `rubricPrompt` are read).
 * @param vars - Optional extra variables merged into the prompt variables.
 * @param providerCallContext - Forwarded unchanged to the provider call.
 * @returns An array with one result per output, in the same order as `outputs`.
 */
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
	require_invariant.invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");

	// A fresh variables object is built for the rendered prompt and for the provider call.
	const buildPromptVars = () => ({
		criteria,
		outputs: outputs.map((candidate) => require_graders.tryParse(candidate)),
		...vars || {}
	});

	const defaultProviders = await require_graders.getDefaultProviders();
	const judge = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "select-best check");
	const rubric = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.SELECT_BEST_PROMPT);
	const renderedPrompt = await require_graders.renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await require_graders.callProviderWithContext(judge, renderedPrompt, "select-best", buildPromptVars(), providerCallContext);

	// Every output fails with the same reason when no usable verdict is available.
	const failEveryOutput = (reason) => Array.from({ length: outputs.length }, () => require_graders.fail(reason, resp.tokenUsage));

	if (resp.error || !resp.output) return failEveryOutput(resp.error || "No output");
	require_invariant.invariant(typeof resp.output === "string", "select-best produced malformed response");

	const digits = /\d+/.exec(resp.output.trim());
	const verdict = digits ? Number.parseInt(digits[0], 10) : NaN;
	const verdictInRange = !Number.isNaN(verdict) && verdict >= 0 && verdict < outputs.length;
	if (!verdictInRange) return failEveryOutput(`Invalid select-best verdict: ${verdict}`);

	const tokensUsed = require_graders.normalizeMatcherTokenUsage(resp.tokenUsage);
	return outputs.map((_candidate, position) => {
		const chosen = position === verdict;
		return {
			pass: chosen,
			score: chosen ? 1 : 0,
			reason: chosen ? `Output selected as the best: ${criteria}` : `Output not selected: ${criteria}`,
			tokensUsed
		};
	});
}
|
|
336
|
+
/**
 * Picks the output whose other assertions produced the highest aggregate score.
 *
 * For each entry, the component results of every assertion other than
 * `max-score` and `select-best` are combined as a weighted sum
 * (`method: "sum"`) or a weighted average (any other method). The first entry
 * with the strictly highest aggregate wins, provided the optional threshold
 * is met.
 *
 * @param outputs - Candidate outputs; at least two are required.
 * @param resultsWithGradingResults - One entry per output; `gradingResult.componentResults` is read.
 * @param assertion - The max-score assertion; `value` may carry `method`, `weights` and `threshold`.
 * @returns One result per entry with `pass`, `score`, `reason` and `namedScores`.
 * @throws Error when an entry has no component result to aggregate.
 */
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
	require_invariant.invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");

	// Options are only read from an object-valued assertion value; otherwise defaults apply.
	const config = assertion.value || {};
	const readOption = (key, fallback) => typeof config === "object" && key in config ? config[key] : fallback;
	const method = readOption("method", "average");
	const weights = readOption("weights", {});
	const threshold = readOption("threshold", void 0);

	const isAggregatable = (component) => component.assertion && component.assertion.type !== "max-score" && component.assertion.type !== "select-best";

	const aggregates = resultsWithGradingResults.map((result, index) => {
		const components = (result.gradingResult?.componentResults || []).filter(isAggregatable);
		if (components.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");

		let weightedSum = 0;
		let totalWeight = 0;
		for (const component of components) {
			const assertionType = component.assertion?.type || "unknown";
			// Assertion types without a configured weight count with weight 1.
			const configuredWeight = weights[assertionType];
			const weight = configuredWeight === void 0 ? 1 : configuredWeight;
			weightedSum += (component.score || 0) * weight;
			totalWeight += weight;
		}

		let aggregate;
		if (method === "sum") aggregate = weightedSum;
		else if (totalWeight > 0) aggregate = weightedSum / totalWeight;
		else aggregate = 0;

		return {
			index,
			score: aggregate,
			componentCount: components.length,
			totalWeight
		};
	});

	// Strict comparison keeps the earliest entry as winner on ties.
	let maxScore = -Infinity;
	let winnerIndex = 0;
	aggregates.forEach((entry, position) => {
		if (entry.score > maxScore) {
			maxScore = entry.score;
			winnerIndex = position;
		}
	});

	const meetsThreshold = threshold === void 0 || maxScore >= threshold;

	return aggregates.map(({ index, score, componentCount, totalWeight }) => {
		const isWinner = index === winnerIndex && meetsThreshold;
		let reason;
		if (isWinner) reason = `Selected as highest scoring output (score: ${score.toFixed(3)})`;
		else if (score === maxScore && !meetsThreshold) reason = `Not selected - score ${score.toFixed(3)} below threshold ${threshold}`;
		else reason = `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`;
		return {
			pass: isWinner,
			score: isWinner ? 1 : 0,
			reason,
			namedScores: {
				maxScore: score,
				assertionCount: componentCount,
				totalWeight
			}
		};
	});
}
|
|
387
|
+
//#endregion
|
|
388
|
+
//#region src/matchers/moderation.ts
|
|
389
|
+
/**
 * Runs a moderation provider over an assistant response and fails when it
 * reports flags (optionally restricted to the given category codes).
 *
 * An empty response passes without calling any provider. When no
 * OPENAI_API_KEY is set but a Replicate key or token is, the Llama Guard
 * Replicate provider is loaded as the default moderation provider.
 *
 * @param params - `userPrompt`, `assistantResponse` and optional `categories` (defaults to `[]`).
 * @param grading - Optional grading config; `provider` overrides the default provider.
 * @returns A grading result with `pass`, `score` (1 or 0) and `reason`.
 */
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
	const passWith = (reason) => ({
		pass: true,
		score: 1,
		reason
	});
	const failWith = (reason) => ({
		pass: false,
		score: 0,
		reason
	});

	if (!assistantResponse) return passWith("No output to moderate");

	const defaultProviders = await require_graders.getDefaultProviders();
	let defaultModerationProvider;
	if (!require_logger.getEnvString("OPENAI_API_KEY") && (require_logger.getEnvString("REPLICATE_API_KEY") || require_logger.getEnvString("REPLICATE_API_TOKEN"))) {
		defaultModerationProvider = await require_providers.loadApiProvider(require_types.LLAMA_GUARD_REPLICATE_PROVIDER);
	} else {
		defaultModerationProvider = defaultProviders.moderationProvider;
	}

	const moderationProvider = await require_graders.getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
	require_invariant.invariant(moderationProvider, "Moderation provider must be defined");

	const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
	if (resp.error) return failWith(`Moderation API error: ${resp.error}`);

	const { flags } = resp;
	if (!flags || flags.length === 0) return passWith("No moderation flags detected");

	// An empty category list means every reported flag is relevant.
	const relevantFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
	if (relevantFlags.length === 0) return passWith("No relevant moderation flags detected");

	const descriptions = relevantFlags.map((flag) => flag.description).join(", ");
	return failWith(`Moderation flags detected: ${descriptions}`);
}
|
|
423
|
+
//#endregion
|
|
424
|
+
//#region src/assertions/contextUtils.ts
|
|
425
|
+
/**
 * Works out the context used by context-based assertions.
 *
 * The starting value is `test.vars.context` when it is truthy (a string, or
 * an array whose entries must all be strings); when it is not set, the
 * fallback context is used. If the assertion defines `contextTransform`, the
 * transform result replaces whatever was found before. The final value must
 * be a non-empty string or a non-empty array of non-empty strings.
 *
 * @param assertion - The assertion configuration (`contextTransform` is read)
 * @param test - The test case (`vars` is read)
 * @param output - The provider output, used as transform input unless the
 *   provider response carries `providerTransformedOutput`
 * @param prompt - The prompt text, passed to the transform as `prompt.label`
 * @param fallbackContext - Optional context used when no `context` variable is set
 * @param providerResponse - Optional provider response; its `metadata` is passed to the transform
 * @returns The resolved context string or array of strings
 * @throws Error if the transform fails or returns an unsupported value, or if
 *   no usable context can be resolved
 */
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
	let contextValue;
	const declaredContext = test.vars?.context;
	if (declaredContext) {
		if (typeof declaredContext === "string") {
			contextValue = declaredContext;
		} else if (Array.isArray(declaredContext)) {
			const badIndex = declaredContext.findIndex((entry) => typeof entry !== "string");
			if (badIndex !== -1) {
				require_invariant.invariant(false, `Invalid context: expected an array of strings, but found ${typeof declaredContext[badIndex]} at index ${badIndex}`);
			}
			contextValue = declaredContext;
		}
	} else if (fallbackContext) {
		contextValue = fallbackContext;
	}

	const transformExpression = assertion.contextTransform;
	if (transformExpression) {
		try {
			const transformInput = providerResponse?.providerTransformedOutput ?? output;
			const transformContext = {
				vars: test.vars,
				prompt: { label: prompt }
			};
			// Metadata is only forwarded when the provider response actually has some.
			if (providerResponse && providerResponse.metadata) transformContext.metadata = providerResponse.metadata;
			const transformed = await require_transform$1.transform(transformExpression, transformInput, transformContext);
			const isStringList = Array.isArray(transformed) && transformed.every((item) => typeof item === "string");
			require_invariant.invariant(typeof transformed === "string" || isStringList, `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${transformExpression}`);
			contextValue = transformed;
		} catch (error) {
			// Failures inside the transform, including the shape check above, are re-thrown with the expression attached.
			throw new Error(`Failed to transform context using expression '${transformExpression}': ${error instanceof Error ? error.message : String(error)}`);
		}
	}

	const isNonEmptyString = (value) => typeof value === "string" && value.length > 0;
	const isUsableContext = isNonEmptyString(contextValue) || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => isNonEmptyString(item));
	require_invariant.invariant(isUsableContext, "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
	return contextValue;
}
|
|
467
|
+
/**
 * Flattens a context value into the single string used in prompts.
 * Array chunks are joined with a blank line between them; any other value
 * is returned unchanged.
 */
function serializeContext(context) {
	if (!Array.isArray(context)) return context;
	return context.join("\n\n");
}
|
|
474
|
+
//#endregion
|
|
475
|
+
//#region src/matchers/rag.ts
|
|
476
|
+
/**
 * Grades how relevant an answer is to the original input.
 *
 * The text provider is called three times, one after another, with the
 * answer-relevancy prompt rendered from the answer; each reply is kept as a
 * candidate question. The input and every candidate question are then
 * embedded, and the mean similarity between the input embedding and the
 * question embeddings is compared against the threshold.
 *
 * @param input - Text embedded as the reference (the original query).
 * @param output - The answer; passed through `tryParse` before rendering.
 * @param threshold - Minimum mean similarity required to pass.
 * @param grading - Optional grading config (`provider`, `rubricPrompt` are read).
 * @param providerCallContext - Forwarded unchanged to each text provider call.
 * @returns A grading result including accumulated token usage and, on
 *   completion, metadata with the generated questions and their similarities.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
	const QUESTION_ROUNDS = 3;
	const defaultProviders = await require_graders.getDefaultProviders();
	const embedder = await require_graders.getAndCheckProvider("embedding", grading?.provider, defaultProviders.embeddingProvider, "answer relevancy check");
	const generator = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "answer relevancy check");
	const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);
	const template = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.ANSWER_RELEVANCY_GENERATE);
	const answer = require_graders.tryParse(output);
	const generationPrompt = await require_graders.renderLlmRubricPrompt(template, { answer });

	// Generate the candidate questions one call at a time, bailing out on the first failure.
	const generatedQuestions = [];
	for (let round = 0; round < QUESTION_ROUNDS; round += 1) {
		const reply = await require_graders.callProviderWithContext(generator, generationPrompt, "answer-relevance", { answer }, providerCallContext);
		require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, reply.tokenUsage);
		if (reply.error || !reply.output) return require_graders.fail(reply.error || "No output", tokensUsed);
		require_invariant.invariant(typeof reply.output === "string", "answer relevancy check produced malformed response");
		generatedQuestions.push(reply.output);
	}

	require_invariant.invariant(typeof embedder.callEmbeddingApi === "function", `Provider ${embedder.id()} must implement callEmbeddingApi for similarity check`);
	const inputEmbeddingResp = await embedder.callEmbeddingApi(input);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
	if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) return require_graders.fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
	const referenceEmbedding = inputEmbeddingResp.embedding;

	const scoredQuestions = [];
	for (const question of generatedQuestions) {
		const questionResp = await embedder.callEmbeddingApi(question);
		require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, questionResp.tokenUsage);
		if (questionResp.error || !questionResp.embedding) return require_graders.fail(questionResp.error || "No embedding", tokensUsed);
		scoredQuestions.push({
			question,
			similarity: require_graders.cosineSimilarity(referenceEmbedding, questionResp.embedding)
		});
	}

	const averageSimilarity = scoredQuestions.reduce((sum, entry) => sum + entry.similarity, 0) / scoredQuestions.length;
	const pass = averageSimilarity >= threshold - Number.EPSILON;
	const relation = pass ? "greater" : "less";
	return {
		pass,
		score: averageSimilarity,
		reason: `Relevance ${averageSimilarity.toFixed(2)} is ${relation} than threshold ${threshold}`,
		tokensUsed,
		metadata: {
			generatedQuestions: scoredQuestions,
			averageSimilarity,
			threshold
		}
	};
}
|
|
534
|
+
/**
 * Grades context recall: the share of classified sentences in the grader's
 * reply that are marked as attributed to the context.
 *
 * Only reply sentences containing the attributed or not-attributed token
 * (case-insensitive) are counted. A sentence counts as attributed when it
 * contains the attributed token and not the not-attributed token.
 *
 * @param context - Context string or array of chunks (serialized before use).
 * @param groundTruth - Ground-truth text passed to the rubric prompt.
 * @param threshold - Minimum recall score required to pass.
 * @param grading - Optional grading config (`provider`, `rubricPrompt` are read).
 * @param vars - Optional extra variables merged into the prompt variables.
 * @param providerCallContext - Forwarded unchanged to the provider call.
 * @returns A grading result with per-sentence attribution metadata.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
	const defaultProviders = await require_graders.getDefaultProviders();
	const grader = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context recall check");
	const contextString = serializeContext(context);
	const buildPromptVars = () => ({
		context: contextString,
		groundTruth,
		...vars || {}
	});
	const rubric = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RECALL);
	const renderedPrompt = await require_graders.renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await require_graders.callProviderWithContext(grader, renderedPrompt, "context-recall", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) return require_graders.fail(resp.error || "No output", resp.tokenUsage);
	require_invariant.invariant(typeof resp.output === "string", "context-recall produced malformed response");

	const attributedToken = require_graders.CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
	const notAttributedToken = require_graders.CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();

	// Keep only the reply sentences that carry one of the two classification tokens.
	const classifiedSentences = require_graders.splitIntoSentences(resp.output).filter((line) => {
		const lowered = line.toLowerCase();
		return lowered.includes(attributedToken) || lowered.includes(notAttributedToken);
	});

	const sentenceAttributions = classifiedSentences.map((sentence) => {
		const lowered = sentence.toLowerCase();
		const attributed = !lowered.includes(notAttributedToken) && lowered.includes(attributedToken);
		// Prefer the text after a leading "N." up to its first period; otherwise take everything before the first period.
		const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
		const text = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
		return {
			sentence: text,
			attributed
		};
	});

	const attributedCount = sentenceAttributions.filter((entry) => entry.attributed).length;
	const totalSentences = classifiedSentences.length;
	const score = totalSentences > 0 ? attributedCount / totalSentences : 0;
	const pass = score >= threshold - Number.EPSILON;
	return {
		pass,
		score,
		reason: `Recall ${score.toFixed(2)} is ${pass ? ">=" : "<"} ${threshold}`,
		tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			sentenceAttributions,
			totalSentences,
			attributedSentences: attributedCount,
			score
		}
	};
}
|
|
583
|
+
/**
 * Grades context relevance: the number of unique sentences the grader
 * extracted as relevant, capped at and divided by the number of context units.
 *
 * Context units are the non-blank chunks of an array context, or the
 * sentences of a string context. If the reply contains the
 * insufficient-information marker the score is 0.
 *
 * @param question - The query passed to the rubric prompt.
 * @param context - Context string or array of chunks.
 * @param threshold - Minimum relevance score required to pass.
 * @param grading - Optional grading config (`provider`, `rubricPrompt` are read).
 * @param providerCallContext - Forwarded unchanged to the provider call.
 * @returns A grading result with metadata describing the extracted sentences.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
	const defaultProviders = await require_graders.getDefaultProviders();
	const grader = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context relevance check");
	const contextString = serializeContext(context);
	const buildPromptVars = () => ({
		context: contextString,
		query: question
	});
	const rubric = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RELEVANCE);
	const renderedPrompt = await require_graders.renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await require_graders.callProviderWithContext(grader, renderedPrompt, "context-relevance", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) return require_graders.fail(resp.error || "No output", resp.tokenUsage);
	require_invariant.invariant(typeof resp.output === "string", "context-relevance produced malformed response");

	let contextUnits;
	if (Array.isArray(context)) contextUnits = context.filter((chunk) => chunk.trim().length > 0);
	else contextUnits = require_graders.splitIntoSentences(context);
	const totalContextUnits = contextUnits.length;

	const replySentences = require_graders.splitIntoSentences(resp.output);
	const insufficientInformation = resp.output.includes(require_graders.CONTEXT_RELEVANCE_BAD);

	// Duplicates in the reply are counted once, and the count never exceeds the number of context units.
	const relevantSentences = insufficientInformation ? [] : [...new Set(replySentences)];
	const relevantCount = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

	const score = totalContextUnits > 0 ? relevantCount / totalContextUnits : 0;
	const pass = score >= threshold - Number.EPSILON;
	return {
		pass,
		score,
		reason: `Context relevance ${score.toFixed(2)} is ${pass ? ">=" : "<"} ${threshold}`,
		tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			extractedSentences: relevantSentences,
			totalContextUnits,
			totalContextSentences: totalContextUnits,
			contextUnits,
			relevantSentenceCount: relevantCount,
			insufficientInformation,
			score
		}
	};
}
|
|
626
|
+
/**
 * Grades context faithfulness in two provider calls: the first produces
 * statements from the question and answer, the second asks for a verdict on
 * each statement against the context.
 *
 * The score is one minus the share of statements judged unsupported, clamped
 * to the range 0..1. Verdicts are read either from the text after
 * "final verdict for each statement in order:" (period-separated answers,
 * where an answer without "yes" counts against the score) or, when that
 * header is absent, by counting "verdict: no" and "verdict: yes" occurrences.
 *
 * @param query - The question passed to the first prompt.
 * @param output - The answer; passed through `tryParse` before rendering.
 * @param context - Context string or array of chunks (serialized before use).
 * @param threshold - Minimum faithfulness score required to pass.
 * @param grading - Optional grading config; `rubricPrompt`, when set, must be
 *   an array whose first two entries (strings or objects with `content`)
 *   override the two prompts.
 * @param vars - Optional extra variables merged into both prompts' variables.
 * @param providerCallContext - Forwarded unchanged to both provider calls.
 * @returns A grading result with accumulated token usage.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
	const defaultProviders = await require_graders.getDefaultProviders();
	const grader = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "faithfulness check");
	const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);
	if (grading?.rubricPrompt) require_invariant.invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");

	// A rubric entry may be a plain string or an object carrying the prompt in `content`.
	const rubricEntryText = (position) => {
		const entry = grading?.rubricPrompt?.[position];
		return typeof entry === "string" ? entry : entry?.content;
	};
	const rawLongformPrompt = rubricEntryText(0);
	const rawNliPrompt = rubricEntryText(1);
	const longformPrompt = await require_graders.loadRubricPrompt(rawLongformPrompt, require_graders.CONTEXT_FAITHFULNESS_LONGFORM);
	const nliPrompt = await require_graders.loadRubricPrompt(rawNliPrompt, require_graders.CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

	// Step 1: derive statements from the question and answer.
	const buildLongformVars = () => ({
		question: query,
		answer: require_graders.tryParse(output),
		...vars || {}
	});
	const longformText = await require_graders.renderLlmRubricPrompt(longformPrompt, buildLongformVars());
	const longformResp = await require_graders.callProviderWithContext(grader, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
	if (longformResp.error || !longformResp.output) return require_graders.fail(longformResp.error || "No output", tokensUsed);
	require_invariant.invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

	// Step 2: ask for a verdict on every statement given the context.
	const contextString = serializeContext(context);
	const statements = require_graders.splitIntoSentences(longformResp.output);
	const buildNliVars = () => ({
		context: contextString,
		statements,
		...vars || {}
	});
	const nliText = await require_graders.renderLlmRubricPrompt(nliPrompt, buildNliVars());
	const nliResp = await require_graders.callProviderWithContext(grader, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
	if (nliResp.error || !nliResp.output) return require_graders.fail(nliResp.error || "No output", tokensUsed);
	require_invariant.invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

	const verdictHeader = "Final verdict for each statement in order:".toLowerCase();
	const verdictText = nliResp.output.toLowerCase().trim();
	let score = 0;
	if (statements.length > 0) {
		const headerStart = verdictText.indexOf(verdictHeader);
		if (headerStart !== -1) {
			const answers = verdictText.slice(headerStart + verdictHeader.length).split(".").filter((answer) => answer.trim() !== "");
			if (answers.length > 0) {
				const unsupported = answers.filter((answer) => !answer.includes("yes")).length;
				score = 1 - unsupported / statements.length;
			}
		} else {
			const countOccurrences = (needle) => verdictText.split(needle).length - 1;
			const noVerdicts = countOccurrences("verdict: no");
			const yesVerdicts = countOccurrences("verdict: yes");
			if (noVerdicts + yesVerdicts > 0) score = 1 - noVerdicts / statements.length;
		}
	}
	score = Math.min(1, Math.max(0, score));

	const pass = score >= threshold - Number.EPSILON;
	return {
		pass,
		score,
		reason: `Faithfulness ${score.toFixed(2)} is ${pass ? ">=" : "<"} ${threshold}`,
		tokensUsed
	};
}
|
|
683
|
+
//#endregion
|
|
684
|
+
//#region src/matchers/similarity.ts
|
|
685
|
+
/**
 * Computes the raw score for two embeddings using the requested metric.
 *
 * @param expectedEmbedding - Embedding of the expected text.
 * @param outputEmbedding - Embedding of the actual output.
 * @param metric - "cosine", "dot_product" or "euclidean".
 * @param tokensUsed - Token usage attached to the failure result for unknown metrics.
 * @returns The metric helper's result, or a failed grading result when the
 *   metric is not one of the three supported names.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
	if (metric === "cosine") return require_graders.cosineSimilarity(expectedEmbedding, outputEmbedding);
	if (metric === "dot_product") return require_graders.dotProduct(expectedEmbedding, outputEmbedding);
	if (metric === "euclidean") return require_graders.euclideanDistance(expectedEmbedding, outputEmbedding);
	return require_graders.fail(`Unsupported metric: ${metric}`, tokensUsed);
}
|
|
693
|
+
/**
 * Turns a raw similarity (or, for the euclidean metric, a distance) into a
 * grading result.
 *
 * For "euclidean" the value is a distance: it passes when it is at most the
 * threshold (at least the threshold when `inverse` is set), and the score is
 * `1 / (1 + distance)`, or one minus that when inverted. For any other metric
 * the value passes when it is at least the threshold (at most the threshold
 * when inverted), and the score is the value itself, or one minus it when
 * inverted. Comparisons allow Number.EPSILON of slack.
 *
 * @param similarity - Similarity value, or distance for the euclidean metric.
 * @param threshold - Threshold the value is compared against.
 * @param inverse - When truthy, the pass condition and score are inverted.
 * @param metric - Metric name; only "euclidean" changes the interpretation.
 * @param tokensUsed - Token usage attached to the result unchanged.
 * @returns An object with `pass`, `score`, `reason` and `tokensUsed`.
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
	const formatted = similarity.toFixed(2);
	const inverted = Boolean(inverse);

	if (metric === "euclidean") {
		const pass = inverse ? similarity >= threshold - Number.EPSILON : similarity <= threshold + Number.EPSILON;
		const closeness = 1 / (1 + similarity);
		// The "greater than" wording is used exactly when the pass flag matches the inverse flag.
		const reason = pass === inverted ? `Distance ${formatted} is greater than threshold ${threshold}` : `Distance ${formatted} is less than or equal to threshold ${threshold}`;
		return {
			pass,
			score: inverse ? 1 - closeness : closeness,
			reason,
			tokensUsed
		};
	}

	const pass = inverse ? similarity <= threshold + Number.EPSILON : similarity >= threshold - Number.EPSILON;
	// The "less than" wording is used exactly when the pass flag matches the inverse flag.
	const reason = pass === inverted ? `Similarity ${formatted} is less than threshold ${threshold}` : `Similarity ${formatted} is greater than or equal to threshold ${threshold}`;
	return {
		pass,
		score: inverse ? 1 - similarity : similarity,
		reason,
		tokensUsed
	};
}
|
|
719
|
+
/**
 * Obtains a similarity value from a provider.
 *
 * For the cosine metric a provider exposing `callSimilarityApi` is asked
 * directly. Otherwise both texts are embedded concurrently through
 * `callEmbeddingApi` and the metric is computed from the two embeddings.
 * Token usage from every call is accumulated into `tokensUsed`.
 *
 * @param finalProvider - Provider implementing `callSimilarityApi` and/or `callEmbeddingApi`.
 * @param expected - Expected text.
 * @param output - Actual output text.
 * @param metric - Metric name forwarded to the score calculation.
 * @param tokensUsed - Token usage accumulator, mutated in place.
 * @returns The similarity number, or a failed grading result.
 * @throws Error when the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
	if (metric === "cosine" && "callSimilarityApi" in finalProvider) {
		const direct = await finalProvider.callSimilarityApi(expected, output);
		require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, direct.tokenUsage);
		if (direct.error) return require_graders.fail(direct.error, tokensUsed);
		const value = direct.similarity;
		if (value == null) return require_graders.fail("Unknown error fetching similarity", tokensUsed);
		if (!Number.isFinite(value)) return require_graders.fail(`Invalid similarity score: ${value}`, tokensUsed);
		return value;
	}

	const embed = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
	if (typeof embed !== "function") {
		// A similarity-only provider reaches this point only for non-cosine metrics.
		if ("callSimilarityApi" in finalProvider) return require_graders.fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
		throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
	}

	const [expectedResp, outputResp] = await Promise.all([embed.call(finalProvider, expected), embed.call(finalProvider, output)]);

	// Merge the usage of both embedding calls before adding it to the caller's accumulator.
	const embeddingUsage = require_graders.normalizeMatcherTokenUsage(void 0);
	require_tokenUsageUtils.accumulateTokenUsage(embeddingUsage, expectedResp.tokenUsage);
	require_tokenUsageUtils.accumulateTokenUsage(embeddingUsage, outputResp.tokenUsage);
	require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, embeddingUsage);

	const embeddingError = expectedResp.error || outputResp.error;
	if (embeddingError) return require_graders.fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
	if (!expectedResp.embedding || !outputResp.embedding) return require_graders.fail("Embedding not found", tokensUsed);
	return calculateSimilarityScore(expectedResp.embedding, outputResp.embedding, metric, tokensUsed);
}
|
|
742
|
+
/**
 * Grades the similarity between an expected text and an output.
 *
 * For the cosine metric, when the loaded config has a `redteam` section and
 * remote generation is enabled, grading is delegated to the remote "similar"
 * task, and a failure there is returned as a failed result. Otherwise the
 * embedding provider computes the similarity locally.
 *
 * @param expected - Expected text.
 * @param output - Actual output text.
 * @param threshold - Threshold applied to the similarity (or distance).
 * @param inverse - When true the pass condition is inverted (default false).
 * @param grading - Optional grading config; `provider` overrides the default embedding provider.
 * @param metric - Similarity metric (default "cosine").
 * @returns A grading result.
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
	const gradeRemotely = metric === "cosine" && require_logger.state.config?.redteam && require_server.shouldGenerateRemote({ requireEmbeddingProvider: true });
	if (gradeRemotely) {
		try {
			return await require_graders.doRemoteGrading({
				task: "similar",
				expected,
				output,
				threshold,
				inverse
			});
		} catch (error) {
			return require_graders.fail(`Could not perform remote grading: ${error}`);
		}
	}

	const defaultProviders = await require_graders.getDefaultProviders();
	const embeddingProvider = await require_graders.getAndCheckProvider("embedding", grading?.provider, defaultProviders.embeddingProvider, "similarity check");
	const tokensUsed = require_graders.normalizeMatcherTokenUsage(void 0);
	const outcome = await calculateProviderSimilarity(embeddingProvider, expected, output, metric, tokensUsed);

	// A non-number outcome is already a complete (failed) grading result.
	if (typeof outcome !== "number") return outcome;
	return buildSimilarityResult(outcome, threshold, inverse, metric, tokensUsed);
}
|
|
761
|
+
//#endregion
|
|
265
762
|
//#region src/tracing/evaluatorTracing.ts
|
|
266
763
|
let otlpReceiverStarted = false;
|
|
267
764
|
const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
|
|
@@ -305,7 +802,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
305
802
|
require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
|
|
306
803
|
try {
|
|
307
804
|
require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
308
|
-
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
805
|
+
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DNSQj6bf.cjs"));
|
|
309
806
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
310
807
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
311
808
|
const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
|
|
@@ -329,7 +826,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
329
826
|
async function stopOtlpReceiverIfNeeded() {
|
|
330
827
|
if (otlpReceiverStarted) try {
|
|
331
828
|
require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
332
|
-
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
829
|
+
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DNSQj6bf.cjs"));
|
|
333
830
|
await stopOTLPReceiver();
|
|
334
831
|
otlpReceiverStarted = false;
|
|
335
832
|
require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -364,7 +861,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
364
861
|
}
|
|
365
862
|
if (!tracingEnabled) return null;
|
|
366
863
|
require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
|
|
367
|
-
const { getTraceStore } = await Promise.resolve().then(() => require("./store-
|
|
864
|
+
const { getTraceStore } = await Promise.resolve().then(() => require("./store-uQZ4AjPe.cjs"));
|
|
368
865
|
const traceStore = getTraceStore();
|
|
369
866
|
const traceId = generateTraceId();
|
|
370
867
|
const spanId = generateSpanId();
|
|
@@ -406,7 +903,7 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
406
903
|
require_invariant.invariant(prompt, "answer-relevance assertion type must have a prompt");
|
|
407
904
|
return {
|
|
408
905
|
assertion,
|
|
409
|
-
...await
|
|
906
|
+
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
410
907
|
};
|
|
411
908
|
};
|
|
412
909
|
//#endregion
|
|
@@ -662,7 +1159,7 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
662
1159
|
//#region src/assertions/classifier.ts
|
|
663
1160
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
664
1161
|
require_invariant.invariant(typeof renderedValue === "string" || typeof renderedValue === "undefined", "\"classifier\" assertion type must have a string value or be undefined");
|
|
665
|
-
const classificationResult = await
|
|
1162
|
+
const classificationResult = await matchesClassification(renderedValue, outputString, assertion.threshold ?? 1, test.options);
|
|
666
1163
|
if (inverse) {
|
|
667
1164
|
classificationResult.pass = !classificationResult.pass;
|
|
668
1165
|
classificationResult.score = 1 - classificationResult.score;
|
|
@@ -674,38 +1171,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
674
1171
|
}
|
|
675
1172
|
//#endregion
|
|
676
1173
|
//#region src/assertions/contains.ts
|
|
1174
|
+
/**
|
|
1175
|
+
* Advance over separators between parsed fields.
|
|
1176
|
+
*
|
|
1177
|
+
* Contains-any values allow whitespace around comma delimiters, and historical
|
|
1178
|
+
* parsing ignored repeated commas rather than producing empty fields.
|
|
1179
|
+
*/
|
|
1180
|
+
function skipWhitespaceAndCommas(value, startIndex) {
|
|
1181
|
+
let i = startIndex;
|
|
1182
|
+
while (i < value.length) {
|
|
1183
|
+
i = skipWhitespace(value, i);
|
|
1184
|
+
if (value[i] !== ",") break;
|
|
1185
|
+
i++;
|
|
1186
|
+
}
|
|
1187
|
+
return i;
|
|
1188
|
+
}
|
|
1189
|
+
/**
|
|
1190
|
+
* Advance over whitespace while preserving comma delimiter handling for callers.
|
|
1191
|
+
*/
|
|
1192
|
+
function skipWhitespace(value, startIndex) {
|
|
1193
|
+
let i = startIndex;
|
|
1194
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
1195
|
+
return i;
|
|
1196
|
+
}
|
|
1197
|
+
/**
|
|
1198
|
+
* Parse a quoted field using the assertion parser's CSV-like escape rules.
|
|
1199
|
+
*
|
|
1200
|
+
* Supports backslash-escaped quotes/backslashes and doubled quotes, and rejects
|
|
1201
|
+
* unterminated fields so malformed assertion values do not silently pass.
|
|
1202
|
+
*/
|
|
1203
|
+
function parseQuotedField(value, startIndex) {
|
|
1204
|
+
let i = startIndex + 1;
|
|
1205
|
+
let field = "";
|
|
1206
|
+
let terminated = false;
|
|
1207
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && ["\"", "\\"].includes(value[i + 1])) {
|
|
1208
|
+
field += value[i + 1];
|
|
1209
|
+
i += 2;
|
|
1210
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
1211
|
+
field += "\"";
|
|
1212
|
+
i += 2;
|
|
1213
|
+
} else if (value[i] === "\"") {
|
|
1214
|
+
i++;
|
|
1215
|
+
terminated = true;
|
|
1216
|
+
break;
|
|
1217
|
+
} else {
|
|
1218
|
+
field += value[i];
|
|
1219
|
+
i++;
|
|
1220
|
+
}
|
|
1221
|
+
require_invariant.invariant(terminated, "Unterminated quoted field in contains assertion value");
|
|
1222
|
+
return {
|
|
1223
|
+
field,
|
|
1224
|
+
nextIndex: i
|
|
1225
|
+
};
|
|
1226
|
+
}
|
|
1227
|
+
/**
|
|
1228
|
+
* Parse an unquoted field up to the next comma, trimming surrounding whitespace.
|
|
1229
|
+
*/
|
|
1230
|
+
function parseUnquotedField(value, startIndex) {
|
|
1231
|
+
let i = startIndex;
|
|
1232
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
1233
|
+
return {
|
|
1234
|
+
field: value.substring(startIndex, i).trim(),
|
|
1235
|
+
nextIndex: i
|
|
1236
|
+
};
|
|
1237
|
+
}
|
|
1238
|
+
/**
|
|
1239
|
+
* Split a contains-any string into fields while preserving quoted commas.
|
|
1240
|
+
*/
|
|
677
1241
|
function parseCommaSeparatedValues(value) {
|
|
678
1242
|
const results = [];
|
|
679
1243
|
let i = 0;
|
|
680
1244
|
while (i < value.length) {
|
|
681
|
-
|
|
1245
|
+
i = skipWhitespaceAndCommas(value, i);
|
|
682
1246
|
if (i >= value.length) break;
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
i++;
|
|
689
|
-
let field = "";
|
|
690
|
-
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
691
|
-
field += value[i + 1];
|
|
692
|
-
i += 2;
|
|
693
|
-
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
694
|
-
field += "\"";
|
|
695
|
-
i += 2;
|
|
696
|
-
} else if (value[i] === "\"") {
|
|
697
|
-
i++;
|
|
698
|
-
break;
|
|
699
|
-
} else {
|
|
700
|
-
field += value[i];
|
|
701
|
-
i++;
|
|
702
|
-
}
|
|
703
|
-
results.push(field);
|
|
704
|
-
} else {
|
|
705
|
-
const start = i;
|
|
706
|
-
while (i < value.length && value[i] !== ",") i++;
|
|
707
|
-
results.push(value.substring(start, i).trim());
|
|
708
|
-
}
|
|
1247
|
+
const isQuotedField = value[i] === "\"";
|
|
1248
|
+
const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
|
|
1249
|
+
results.push(parsed.field);
|
|
1250
|
+
i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
|
|
1251
|
+
require_invariant.invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
|
|
709
1252
|
}
|
|
710
1253
|
return results;
|
|
711
1254
|
}
|
|
@@ -803,10 +1346,10 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
803
1346
|
require_invariant.invariant(test.vars, "context-faithfulness assertion requires a test with variables");
|
|
804
1347
|
require_invariant.invariant(typeof test.vars.query === "string", "context-faithfulness assertion requires a \"query\" variable with the user question");
|
|
805
1348
|
require_invariant.invariant(typeof output === "string", "context-faithfulness assertion requires string output from the provider");
|
|
806
|
-
const context = await
|
|
1349
|
+
const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
|
|
807
1350
|
return {
|
|
808
1351
|
assertion,
|
|
809
|
-
...await
|
|
1352
|
+
...await matchesContextFaithfulness(test.vars.query, output, context, assertion.threshold ?? 0, test.options, test.vars, providerCallContext),
|
|
810
1353
|
metadata: { context }
|
|
811
1354
|
};
|
|
812
1355
|
}
|
|
@@ -825,8 +1368,8 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
825
1368
|
const handleContextRecall = async ({ assertion, renderedValue, prompt, test, output, providerResponse, providerCallContext }) => {
|
|
826
1369
|
require_invariant.invariant(typeof renderedValue === "string", "context-recall assertion requires a string value (expected answer or fact to verify)");
|
|
827
1370
|
require_invariant.invariant(prompt, "context-recall assertion requires a prompt");
|
|
828
|
-
const context = await
|
|
829
|
-
const result = await
|
|
1371
|
+
const context = await resolveContext(assertion, test, output, prompt, prompt, providerResponse);
|
|
1372
|
+
const result = await matchesContextRecall(context, renderedValue, assertion.threshold ?? 0, test.options, test.vars, providerCallContext);
|
|
830
1373
|
return {
|
|
831
1374
|
assertion,
|
|
832
1375
|
...result,
|
|
@@ -851,8 +1394,8 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
851
1394
|
const handleContextRelevance = async ({ assertion, test, output, prompt, providerResponse, providerCallContext }) => {
|
|
852
1395
|
require_invariant.invariant(test.vars, "context-relevance assertion requires a test with variables");
|
|
853
1396
|
require_invariant.invariant(typeof test.vars.query === "string", "context-relevance assertion requires a \"query\" variable with the user question");
|
|
854
|
-
const context = await
|
|
855
|
-
const result = await
|
|
1397
|
+
const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
|
|
1398
|
+
const result = await matchesContextRelevance(test.vars.query, context, assertion.threshold ?? 0, test.options, providerCallContext);
|
|
856
1399
|
return {
|
|
857
1400
|
assertion,
|
|
858
1401
|
...result,
|
|
@@ -930,7 +1473,7 @@ function handleFinishReason({ assertion, inverse = false, renderedValue, provide
|
|
|
930
1473
|
//#region src/assertions/functionToolCall.ts
|
|
931
1474
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
932
1475
|
try {
|
|
933
|
-
if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform
|
|
1476
|
+
if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform.validateFunctionCall(output, provider.config?.tools, test.vars);
|
|
934
1477
|
else if (provider instanceof require_chat.OpenAiChatCompletionProvider) require_util$1.validateFunctionCall(output, provider.config.functions, test.vars);
|
|
935
1478
|
else throw new Error(`Provider does not have functionality for checking function call.`);
|
|
936
1479
|
return {
|
|
@@ -1110,6 +1653,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1110
1653
|
};
|
|
1111
1654
|
//#endregion
|
|
1112
1655
|
//#region src/assertions/html.ts
|
|
1656
|
+
const LITERAL_WRAPPER_PATTERNS = {
|
|
1657
|
+
html: /<html(?=[\s>/])/,
|
|
1658
|
+
head: /<head(?=[\s>/])/,
|
|
1659
|
+
body: /<body(?=[\s>/])/
|
|
1660
|
+
};
|
|
1661
|
+
function isWrapperTagName(tagName) {
|
|
1662
|
+
return tagName === "html" || tagName === "head" || tagName === "body";
|
|
1663
|
+
}
|
|
1664
|
+
function isTextNode(node) {
|
|
1665
|
+
return node.nodeName === "#text";
|
|
1666
|
+
}
|
|
1667
|
+
function isElementNode(node) {
|
|
1668
|
+
return "tagName" in node;
|
|
1669
|
+
}
|
|
1670
|
+
function hasSourceCodeLocation(element) {
|
|
1671
|
+
return "sourceCodeLocation" in element && element.sourceCodeLocation !== null && element.sourceCodeLocation !== void 0;
|
|
1672
|
+
}
|
|
1673
|
+
function getChildNodes(node) {
|
|
1674
|
+
return "childNodes" in node ? node.childNodes : [];
|
|
1675
|
+
}
|
|
1676
|
+
function findFirstElement(root, predicate) {
|
|
1677
|
+
const stack = [root];
|
|
1678
|
+
while (stack.length > 0) {
|
|
1679
|
+
const current = stack.pop();
|
|
1680
|
+
if (isElementNode(current) && predicate(current)) return current;
|
|
1681
|
+
const children = getChildNodes(current);
|
|
1682
|
+
for (let i = children.length - 1; i >= 0; i--) stack.push(children[i]);
|
|
1683
|
+
}
|
|
1684
|
+
}
|
|
1685
|
+
function hasTopLevelText(parentNode) {
|
|
1686
|
+
return parentNode.childNodes.some((node) => isTextNode(node) && Boolean(node.value.trim()));
|
|
1687
|
+
}
|
|
1688
|
+
function isUserProvidedElement(element, inputLowercase) {
|
|
1689
|
+
const tagName = element.tagName.toLowerCase();
|
|
1690
|
+
if (isWrapperTagName(tagName)) return LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase) && hasSourceCodeLocation(element);
|
|
1691
|
+
return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
|
|
1692
|
+
}
|
|
1113
1693
|
const HTML_PATTERNS = {
|
|
1114
1694
|
openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
|
|
1115
1695
|
closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
|
|
@@ -1265,37 +1845,21 @@ function validateHtml(htmlString) {
|
|
|
1265
1845
|
isValid: false,
|
|
1266
1846
|
reason: "Output appears to be XML, not HTML"
|
|
1267
1847
|
};
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
|
|
1284
|
-
return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
|
|
1285
|
-
})) return {
|
|
1286
|
-
isValid: false,
|
|
1287
|
-
reason: "Output does not contain recognized HTML elements"
|
|
1288
|
-
};
|
|
1289
|
-
return {
|
|
1290
|
-
isValid: true,
|
|
1291
|
-
reason: "Output is valid HTML"
|
|
1292
|
-
};
|
|
1293
|
-
} catch (error) {
|
|
1294
|
-
return {
|
|
1295
|
-
isValid: false,
|
|
1296
|
-
reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
1297
|
-
};
|
|
1298
|
-
}
|
|
1848
|
+
const document = (0, parse5.parse)(trimmed, { sourceCodeLocationInfo: true });
|
|
1849
|
+
const inputLowercase = trimmed.toLowerCase();
|
|
1850
|
+
const body = findFirstElement(document, (element) => element.tagName === "body");
|
|
1851
|
+
if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
|
|
1852
|
+
isValid: false,
|
|
1853
|
+
reason: "Output must be wrapped in HTML tags"
|
|
1854
|
+
};
|
|
1855
|
+
if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
|
|
1856
|
+
isValid: false,
|
|
1857
|
+
reason: "Output does not contain recognized HTML elements"
|
|
1858
|
+
};
|
|
1859
|
+
return {
|
|
1860
|
+
isValid: true,
|
|
1861
|
+
reason: "Output is valid HTML"
|
|
1862
|
+
};
|
|
1299
1863
|
}
|
|
1300
1864
|
const handleContainsHtml = ({ assertion, outputString, inverse }) => {
|
|
1301
1865
|
const pass = containsHtml(outputString) !== inverse;
|
|
@@ -1460,7 +2024,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
1460
2024
|
let result;
|
|
1461
2025
|
if (typeof valueFromScript === "undefined") {
|
|
1462
2026
|
const functionBody = renderedValue.includes("\n") ? renderedValue : buildFunctionBody(renderedValue);
|
|
1463
|
-
result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext, require_transform.getProcessShim()));
|
|
2027
|
+
result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext, require_transform$1.getProcessShim()));
|
|
1464
2028
|
} else {
|
|
1465
2029
|
require_invariant.invariant(typeof valueFromScript === "boolean" || typeof valueFromScript === "number" || typeof valueFromScript === "object", `Javascript assertion script must return a boolean, number, or object (${assertion.value})`);
|
|
1466
2030
|
result = await validateResult(valueFromScript);
|
|
@@ -1667,7 +2231,7 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1667
2231
|
const parsedPrompt = require_fetch.parseChatPrompt(promptToModerate, null);
|
|
1668
2232
|
if (parsedPrompt && parsedPrompt.length > 0) promptToModerate = getLastModerationPrompt(parsedPrompt) ?? promptToModerate;
|
|
1669
2233
|
} catch {}
|
|
1670
|
-
const moderationResult = await
|
|
2234
|
+
const moderationResult = await matchesModeration({
|
|
1671
2235
|
userPrompt: promptToModerate,
|
|
1672
2236
|
assistantResponse: outputString,
|
|
1673
2237
|
categories: Array.isArray(assertion.value) ? assertion.value : []
|
|
@@ -2400,11 +2964,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
2400
2964
|
const rougeMethod = js_rouge[baseType[baseType.length - 1]];
|
|
2401
2965
|
const score = rougeMethod(outputString, renderedValue, {});
|
|
2402
2966
|
const threshold = assertion.threshold ?? .75;
|
|
2403
|
-
const pass = score >= threshold != inverse;
|
|
2404
2967
|
return {
|
|
2405
|
-
pass,
|
|
2968
|
+
pass: score >= threshold !== inverse,
|
|
2406
2969
|
score: inverse ? 1 - score : score,
|
|
2407
|
-
reason:
|
|
2970
|
+
reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
|
|
2408
2971
|
assertion
|
|
2409
2972
|
};
|
|
2410
2973
|
}
|
|
@@ -2466,10 +3029,196 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
|
|
|
2466
3029
|
}
|
|
2467
3030
|
};
|
|
2468
3031
|
//#endregion
|
|
3032
|
+
//#region src/providers/webSearchUtils.ts
|
|
3033
|
+
function hasTool(provider, predicate) {
|
|
3034
|
+
return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
|
|
3035
|
+
}
|
|
3036
|
+
function getProviderId(provider) {
|
|
3037
|
+
if (typeof provider.id !== "function") return null;
|
|
3038
|
+
try {
|
|
3039
|
+
return provider.id();
|
|
3040
|
+
} catch (err) {
|
|
3041
|
+
require_logger.logger.debug(`Failed to read provider id: ${err}`);
|
|
3042
|
+
return null;
|
|
3043
|
+
}
|
|
3044
|
+
}
|
|
3045
|
+
function isOpenAiResponsesProvider(provider, id) {
|
|
3046
|
+
return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
|
|
3047
|
+
}
|
|
3048
|
+
/**
|
|
3049
|
+
* Check if a provider has web search capabilities
|
|
3050
|
+
* @param provider The provider to check
|
|
3051
|
+
* @returns true if the provider supports web search
|
|
3052
|
+
*/
|
|
3053
|
+
function hasWebSearchCapability(provider) {
|
|
3054
|
+
if (!provider) return false;
|
|
3055
|
+
const id = getProviderId(provider);
|
|
3056
|
+
if (!id) return false;
|
|
3057
|
+
if (id.includes("perplexity")) return true;
|
|
3058
|
+
if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
|
|
3059
|
+
if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
|
|
3060
|
+
if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
|
|
3061
|
+
if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
|
|
3062
|
+
if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
|
|
3063
|
+
return false;
|
|
3064
|
+
}
|
|
3065
|
+
/**
|
|
3066
|
+
* Load a provider with web search capabilities.
|
|
3067
|
+
* Tries multiple providers in order of preference until one succeeds.
|
|
3068
|
+
* Uses the latest and most capable models from each provider with specific checkpoint IDs.
|
|
3069
|
+
*
|
|
3070
|
+
* @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
|
|
3071
|
+
* @returns A provider with web search capabilities or null
|
|
3072
|
+
*/
|
|
3073
|
+
async function loadWebSearchProvider(preferAnthropic = false) {
|
|
3074
|
+
const loadAnthropicWebSearch = async () => {
|
|
3075
|
+
try {
|
|
3076
|
+
return await require_providers.loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
|
|
3077
|
+
type: "web_search_20250305",
|
|
3078
|
+
name: "web_search",
|
|
3079
|
+
max_uses: 5
|
|
3080
|
+
}] } } });
|
|
3081
|
+
} catch (err) {
|
|
3082
|
+
require_logger.logger.debug(`Failed to load Anthropic web search provider: ${err}`);
|
|
3083
|
+
return null;
|
|
3084
|
+
}
|
|
3085
|
+
};
|
|
3086
|
+
const loadOpenAIWebSearch = async () => {
|
|
3087
|
+
try {
|
|
3088
|
+
return await require_providers.loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
|
|
3089
|
+
} catch (err) {
|
|
3090
|
+
require_logger.logger.debug(`Failed to load OpenAI web search provider: ${err}`);
|
|
3091
|
+
return null;
|
|
3092
|
+
}
|
|
3093
|
+
};
|
|
3094
|
+
const loadPerplexity = async () => {
|
|
3095
|
+
try {
|
|
3096
|
+
return await require_providers.loadApiProvider("perplexity:sonar-pro");
|
|
3097
|
+
} catch (err) {
|
|
3098
|
+
require_logger.logger.debug(`Failed to load Perplexity provider: ${err}`);
|
|
3099
|
+
return null;
|
|
3100
|
+
}
|
|
3101
|
+
};
|
|
3102
|
+
const loadGoogleWebSearch = async () => {
|
|
3103
|
+
try {
|
|
3104
|
+
return await require_providers.loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3105
|
+
} catch (err) {
|
|
3106
|
+
require_logger.logger.debug(`Failed to load Google web search provider: ${err}`);
|
|
3107
|
+
return null;
|
|
3108
|
+
}
|
|
3109
|
+
};
|
|
3110
|
+
const loadVertexWebSearch = async () => {
|
|
3111
|
+
try {
|
|
3112
|
+
return await require_providers.loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3113
|
+
} catch (err) {
|
|
3114
|
+
require_logger.logger.debug(`Failed to load Vertex web search provider: ${err}`);
|
|
3115
|
+
return null;
|
|
3116
|
+
}
|
|
3117
|
+
};
|
|
3118
|
+
const loadXaiWebSearch = async () => {
|
|
3119
|
+
try {
|
|
3120
|
+
return await require_providers.loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
|
|
3121
|
+
} catch (err) {
|
|
3122
|
+
require_logger.logger.debug(`Failed to load xAI web search provider: ${err}`);
|
|
3123
|
+
return null;
|
|
3124
|
+
}
|
|
3125
|
+
};
|
|
3126
|
+
const providers = preferAnthropic ? [
|
|
3127
|
+
loadAnthropicWebSearch,
|
|
3128
|
+
loadOpenAIWebSearch,
|
|
3129
|
+
loadPerplexity,
|
|
3130
|
+
loadGoogleWebSearch,
|
|
3131
|
+
loadVertexWebSearch,
|
|
3132
|
+
loadXaiWebSearch
|
|
3133
|
+
] : [
|
|
3134
|
+
loadOpenAIWebSearch,
|
|
3135
|
+
loadAnthropicWebSearch,
|
|
3136
|
+
loadPerplexity,
|
|
3137
|
+
loadGoogleWebSearch,
|
|
3138
|
+
loadVertexWebSearch,
|
|
3139
|
+
loadXaiWebSearch
|
|
3140
|
+
];
|
|
3141
|
+
for (const getProvider of providers) {
|
|
3142
|
+
const provider = await getProvider();
|
|
3143
|
+
if (provider && hasWebSearchCapability(provider)) {
|
|
3144
|
+
require_logger.logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
|
|
3145
|
+
return provider;
|
|
3146
|
+
}
|
|
3147
|
+
if (provider) require_logger.logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
|
|
3148
|
+
}
|
|
3149
|
+
return null;
|
|
3150
|
+
}
|
|
3151
|
+
//#endregion
|
|
3152
|
+
//#region src/matchers/search.ts
|
|
3153
|
+
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
|
|
3154
|
+
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
3155
|
+
const defaultProviders = await require_graders.getDefaultProviders();
|
|
3156
|
+
const defaultSearchProviders = [
|
|
3157
|
+
defaultProviders.webSearchProvider,
|
|
3158
|
+
defaultProviders.llmRubricProvider,
|
|
3159
|
+
defaultProviders.gradingProvider
|
|
3160
|
+
];
|
|
3161
|
+
let searchProvider = (grading.provider ? await require_graders.getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
|
|
3162
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3163
|
+
const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
|
|
3164
|
+
if (webSearchDefault) searchProvider = webSearchDefault;
|
|
3165
|
+
}
|
|
3166
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3167
|
+
const webSearchProvider = await loadWebSearchProvider(true);
|
|
3168
|
+
if (webSearchProvider) searchProvider = webSearchProvider;
|
|
3169
|
+
}
|
|
3170
|
+
if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${require_graders.DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
|
|
3171
|
+
const prompt = await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.DEFAULT_WEB_SEARCH_PROMPT), {
|
|
3172
|
+
output: require_graders.tryParse(llmOutput),
|
|
3173
|
+
rubric,
|
|
3174
|
+
...vars || {}
|
|
3175
|
+
});
|
|
3176
|
+
const resp = await require_graders.callProviderWithContext(searchProvider, prompt, "search-rubric", {
|
|
3177
|
+
output: require_graders.tryParse(llmOutput),
|
|
3178
|
+
rubric,
|
|
3179
|
+
...vars || {}
|
|
3180
|
+
}, providerCallContext);
|
|
3181
|
+
if (resp.error || !resp.output) return {
|
|
3182
|
+
pass: false,
|
|
3183
|
+
score: 0,
|
|
3184
|
+
reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
|
|
3185
|
+
tokensUsed: resp.tokenUsage,
|
|
3186
|
+
assertion
|
|
3187
|
+
};
|
|
3188
|
+
try {
|
|
3189
|
+
const result = require_logger.extractFirstJsonObject(String(resp.output));
|
|
3190
|
+
let pass = result.pass ?? false;
|
|
3191
|
+
const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
|
|
3192
|
+
if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
|
|
3193
|
+
return {
|
|
3194
|
+
pass,
|
|
3195
|
+
score,
|
|
3196
|
+
reason: result.reason || "No reason provided",
|
|
3197
|
+
tokensUsed: resp.tokenUsage,
|
|
3198
|
+
assertion,
|
|
3199
|
+
metadata: {
|
|
3200
|
+
searchResults: result.searchResults || [],
|
|
3201
|
+
searchProvider: searchProvider.id()
|
|
3202
|
+
}
|
|
3203
|
+
};
|
|
3204
|
+
} catch (err) {
|
|
3205
|
+
require_logger.logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
|
|
3206
|
+
const outputLower = String(resp.output).toLowerCase();
|
|
3207
|
+
const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
|
|
3208
|
+
return {
|
|
3209
|
+
pass,
|
|
3210
|
+
score: pass ? 1 : 0,
|
|
3211
|
+
reason: resp.output,
|
|
3212
|
+
tokensUsed: resp.tokenUsage,
|
|
3213
|
+
assertion
|
|
3214
|
+
};
|
|
3215
|
+
}
|
|
3216
|
+
}
|
|
3217
|
+
//#endregion
|
|
2469
3218
|
//#region src/assertions/searchRubric.ts
|
|
2470
3219
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
2471
3220
|
if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
|
|
2472
|
-
const result = await
|
|
3221
|
+
const result = await matchesSearchRubric(String(renderedValue), providerResponse.output, test.options, test.vars, assertion, provider, providerCallContext);
|
|
2473
3222
|
if (inverse) {
|
|
2474
3223
|
result.pass = !result.pass;
|
|
2475
3224
|
result.reason = result.pass ? `Output does not require web search verification: ${result.reason}` : `Output requires web search verification: ${result.reason}`;
|
|
@@ -2500,7 +3249,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2500
3249
|
if (Array.isArray(renderedValue)) {
|
|
2501
3250
|
let minScore = Number.POSITIVE_INFINITY;
|
|
2502
3251
|
for (const value of renderedValue) {
|
|
2503
|
-
const result = await
|
|
3252
|
+
const result = await matchesSimilarity(value, outputString, threshold, inverse, test.options, metric);
|
|
2504
3253
|
if (result.pass) return {
|
|
2505
3254
|
assertion,
|
|
2506
3255
|
...result
|
|
@@ -2515,7 +3264,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2515
3264
|
};
|
|
2516
3265
|
} else return {
|
|
2517
3266
|
assertion,
|
|
2518
|
-
...await
|
|
3267
|
+
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2519
3268
|
};
|
|
2520
3269
|
};
|
|
2521
3270
|
//#endregion
|
|
@@ -3551,7 +4300,7 @@ const ASSERTION_HANDLERS = {
|
|
|
3551
4300
|
"llm-rubric": handleLlmRubric,
|
|
3552
4301
|
meteor: async (params) => {
|
|
3553
4302
|
try {
|
|
3554
|
-
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-
|
|
4303
|
+
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-BBGcGeCa.cjs"));
|
|
3555
4304
|
return handleMeteorAssertion(params);
|
|
3556
4305
|
} catch (error) {
|
|
3557
4306
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -3633,7 +4382,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3633
4382
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
3634
4383
|
let output = originalOutput;
|
|
3635
4384
|
require_invariant.invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
3636
|
-
if (assertion.transform) output = await require_transform.transform(assertion.transform, output, {
|
|
4385
|
+
if (assertion.transform) output = await require_transform$1.transform(assertion.transform, output, {
|
|
3637
4386
|
vars: resolvedVars,
|
|
3638
4387
|
prompt: { label: prompt },
|
|
3639
4388
|
...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
|
|
@@ -3687,7 +4436,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3687
4436
|
};
|
|
3688
4437
|
}
|
|
3689
4438
|
else if (filePath.endsWith(".rb")) try {
|
|
3690
|
-
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-
|
|
4439
|
+
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CO-tuszQ.cjs"));
|
|
3691
4440
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
3692
4441
|
require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
3693
4442
|
} catch (error) {
|
|
@@ -3840,7 +4589,7 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3840
4589
|
async function runCompareAssertion(test, assertion, outputs, context) {
|
|
3841
4590
|
require_invariant.invariant(typeof assertion.value === "string", "select-best must have a string value");
|
|
3842
4591
|
test = require_graders.getFinalTest(test, assertion);
|
|
3843
|
-
return (await
|
|
4592
|
+
return (await matchesSelectBest(assertion.value, outputs, test.options, test.vars, context)).map((result) => ({
|
|
3844
4593
|
...result,
|
|
3845
4594
|
assertion
|
|
3846
4595
|
}));
|
|
@@ -3857,17 +4606,17 @@ async function readAssertions(filePath) {
|
|
|
3857
4606
|
var assertions_default = {
|
|
3858
4607
|
runAssertion,
|
|
3859
4608
|
runAssertions,
|
|
3860
|
-
matchesSimilarity
|
|
3861
|
-
matchesClassification
|
|
4609
|
+
matchesSimilarity,
|
|
4610
|
+
matchesClassification,
|
|
3862
4611
|
matchesLlmRubric: require_graders.matchesLlmRubric,
|
|
3863
4612
|
matchesFactuality: require_graders.matchesFactuality,
|
|
3864
4613
|
matchesClosedQa: require_graders.matchesClosedQa,
|
|
3865
|
-
matchesAnswerRelevance
|
|
3866
|
-
matchesContextRecall
|
|
3867
|
-
matchesContextRelevance
|
|
3868
|
-
matchesContextFaithfulness
|
|
3869
|
-
matchesComparisonBoolean:
|
|
3870
|
-
matchesModeration
|
|
4614
|
+
matchesAnswerRelevance,
|
|
4615
|
+
matchesContextRecall,
|
|
4616
|
+
matchesContextRelevance,
|
|
4617
|
+
matchesContextFaithfulness,
|
|
4618
|
+
matchesComparisonBoolean: matchesSelectBest,
|
|
4619
|
+
matchesModeration,
|
|
3871
4620
|
matchesConversationRelevance
|
|
3872
4621
|
};
|
|
3873
4622
|
//#endregion
|
|
@@ -4242,7 +4991,7 @@ function initializeOtel(config) {
|
|
|
4242
4991
|
require_logger.logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
4243
4992
|
const resource = (0, _opentelemetry_resources.resourceFromAttributes)({
|
|
4244
4993
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_NAME]: config.serviceName,
|
|
4245
|
-
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]:
|
|
4994
|
+
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_version.VERSION
|
|
4246
4995
|
});
|
|
4247
4996
|
const spanProcessors = [];
|
|
4248
4997
|
if (config.localExport) {
|
|
@@ -4960,13 +5709,13 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
|
|
|
4960
5709
|
}
|
|
4961
5710
|
async function transformRunEvalResponse({ evalId, prompt, promptIdx, provider, response, test, testIdx, vars }) {
|
|
4962
5711
|
const processedResponse = { ...response };
|
|
4963
|
-
if (provider.transform) processedResponse.output = await require_transform.transform(provider.transform, processedResponse.output, {
|
|
5712
|
+
if (provider.transform) processedResponse.output = await require_transform$1.transform(provider.transform, processedResponse.output, {
|
|
4964
5713
|
vars,
|
|
4965
5714
|
prompt
|
|
4966
5715
|
});
|
|
4967
5716
|
const providerTransformedOutput = processedResponse.output;
|
|
4968
5717
|
const testTransform = test.options?.transform || test.options?.postprocess;
|
|
4969
|
-
if (testTransform) processedResponse.output = await require_transform.transform(testTransform, processedResponse.output, {
|
|
5718
|
+
if (testTransform) processedResponse.output = await require_transform$1.transform(testTransform, processedResponse.output, {
|
|
4970
5719
|
vars,
|
|
4971
5720
|
prompt,
|
|
4972
5721
|
...response && response.metadata && { metadata: response.metadata }
|
|
@@ -5418,10 +6167,10 @@ async function prepareTestVariables(tests, testSuite) {
|
|
|
5418
6167
|
async function applyInputTransform(testCase, inputTransformDefault) {
|
|
5419
6168
|
const inputTransform = testCase.options?.transformVars || inputTransformDefault;
|
|
5420
6169
|
if (!inputTransform) return;
|
|
5421
|
-
const transformedVars = await require_transform.transform(inputTransform, testCase.vars, {
|
|
6170
|
+
const transformedVars = await require_transform$1.transform(inputTransform, testCase.vars, {
|
|
5422
6171
|
prompt: {},
|
|
5423
6172
|
uuid: crypto.randomUUID()
|
|
5424
|
-
}, true, require_transform.TransformInputType.VARS);
|
|
6173
|
+
}, true, require_transform$1.TransformInputType.VARS);
|
|
5425
6174
|
require_invariant.invariant(typeof transformedVars === "object", "Transform function did not return a valid object");
|
|
5426
6175
|
testCase.vars = {
|
|
5427
6176
|
...testCase.vars,
|
|
@@ -5485,7 +6234,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
|
|
|
5485
6234
|
const defaultProvider = defaultTest.provider;
|
|
5486
6235
|
if (require_types.isApiProvider(defaultProvider)) return defaultProvider;
|
|
5487
6236
|
if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
5488
|
-
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-
|
|
6237
|
+
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-C7lNVBjX.cjs"));
|
|
5489
6238
|
return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
5490
6239
|
}
|
|
5491
6240
|
return defaultProvider;
|
|
@@ -5645,7 +6394,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
|
|
|
5645
6394
|
async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
5646
6395
|
if (!require_logger.state.resume || !evalRecord.persisted) return;
|
|
5647
6396
|
try {
|
|
5648
|
-
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-
|
|
6397
|
+
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-BBJAHAtw.cjs"));
|
|
5649
6398
|
const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: require_logger.state.retryMode });
|
|
5650
6399
|
const originalCount = runEvalOptions.length;
|
|
5651
6400
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -6104,9 +6853,8 @@ var Evaluator = class {
|
|
|
6104
6853
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
|
|
6105
6854
|
}
|
|
6106
6855
|
async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
|
|
6107
|
-
let flushGroupedRows;
|
|
6108
6856
|
try {
|
|
6109
|
-
if (shouldGroupGradingByProvider)
|
|
6857
|
+
if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
|
|
6110
6858
|
checkAbort,
|
|
6111
6859
|
evalStepIndexMap,
|
|
6112
6860
|
groupedRunEvalOptions,
|
|
@@ -6138,7 +6886,6 @@ var Evaluator = class {
|
|
|
6138
6886
|
cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
|
|
6139
6887
|
throw err;
|
|
6140
6888
|
}
|
|
6141
|
-
await flushGroupedRows?.();
|
|
6142
6889
|
if (isEvalTimedOut()) require_logger.logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
6143
6890
|
else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
|
|
6144
6891
|
ciProgressReporter,
|
|
@@ -6363,7 +7110,7 @@ var Evaluator = class {
|
|
|
6363
7110
|
}
|
|
6364
7111
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
6365
7112
|
if (!maxScoreAssertion) return;
|
|
6366
|
-
const maxScoreGradingResults = await
|
|
7113
|
+
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
6367
7114
|
updateComparisonReporterProgress({
|
|
6368
7115
|
ciProgressReporter,
|
|
6369
7116
|
compareCount,
|
|
@@ -8434,47 +9181,11 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
8434
9181
|
//#endregion
|
|
8435
9182
|
//#region src/commands/eval/filterProviders.ts
|
|
8436
9183
|
/**
|
|
8437
|
-
* Checks if a value is a valid provider ID (non-empty string).
|
|
8438
|
-
*/
|
|
8439
|
-
function isValidProviderId(id) {
|
|
8440
|
-
return id !== null && id !== void 0 && typeof id === "string" && id !== "";
|
|
8441
|
-
}
|
|
8442
|
-
/**
|
|
8443
9184
|
* Extracts the id and label from a raw provider config without instantiating it.
|
|
8444
9185
|
* Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
|
|
8445
9186
|
*/
|
|
8446
9187
|
function getProviderIdAndLabel(provider, index) {
|
|
8447
|
-
|
|
8448
|
-
if (typeof provider === "function") {
|
|
8449
|
-
const label = provider.label;
|
|
8450
|
-
return {
|
|
8451
|
-
id: label ?? `custom-function-${index}`,
|
|
8452
|
-
label
|
|
8453
|
-
};
|
|
8454
|
-
}
|
|
8455
|
-
const providerId = provider.id;
|
|
8456
|
-
if ("id" in provider && isValidProviderId(providerId)) return {
|
|
8457
|
-
id: providerId,
|
|
8458
|
-
label: provider.label
|
|
8459
|
-
};
|
|
8460
|
-
const keys = Object.keys(provider);
|
|
8461
|
-
if (keys.length > 0) {
|
|
8462
|
-
const id = keys[0];
|
|
8463
|
-
const value = provider[id];
|
|
8464
|
-
if (typeof value === "object" && value !== null) return {
|
|
8465
|
-
id: value.id || id,
|
|
8466
|
-
label: value.label
|
|
8467
|
-
};
|
|
8468
|
-
}
|
|
8469
|
-
const label = provider.label;
|
|
8470
|
-
if (isValidProviderId(label)) return {
|
|
8471
|
-
id: label,
|
|
8472
|
-
label
|
|
8473
|
-
};
|
|
8474
|
-
return {
|
|
8475
|
-
id: `unknown-${index}`,
|
|
8476
|
-
label
|
|
8477
|
-
};
|
|
9188
|
+
return require_util.normalizeProviderRef(provider, { index });
|
|
8478
9189
|
}
|
|
8479
9190
|
/**
|
|
8480
9191
|
* Filters raw provider configs BEFORE instantiation.
|
|
@@ -10227,7 +10938,7 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
10227
10938
|
const body = {
|
|
10228
10939
|
task,
|
|
10229
10940
|
prompts,
|
|
10230
|
-
version:
|
|
10941
|
+
version: require_version.VERSION,
|
|
10231
10942
|
email: require_accounts.getUserEmail()
|
|
10232
10943
|
};
|
|
10233
10944
|
const response = await require_cache.fetchWithCache(require_server.getRemoteGenerationUrl(), {
|
|
@@ -11179,9 +11890,10 @@ function dedupeTestCases(testCases) {
|
|
|
11179
11890
|
return deduped;
|
|
11180
11891
|
}
|
|
11181
11892
|
function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
|
|
11893
|
+
const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
|
|
11182
11894
|
return dedent.default`
|
|
11183
11895
|
Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
|
|
11184
|
-
The longest rejected prompt was ${
|
|
11896
|
+
The longest rejected prompt was ${longestRejectedPromptText}.
|
|
11185
11897
|
Generate replacement prompts only, and keep every user message within the character limit.
|
|
11186
11898
|
`.trim();
|
|
11187
11899
|
}
|
|
@@ -11238,7 +11950,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
11238
11950
|
n,
|
|
11239
11951
|
purpose,
|
|
11240
11952
|
task: key,
|
|
11241
|
-
version:
|
|
11953
|
+
version: require_version.VERSION,
|
|
11242
11954
|
email: require_accounts.getUserEmail()
|
|
11243
11955
|
});
|
|
11244
11956
|
try {
|
|
@@ -12317,7 +13029,7 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
12317
13029
|
}
|
|
12318
13030
|
function getConfigHash(configPath) {
|
|
12319
13031
|
const content = fs.readFileSync(configPath, "utf8");
|
|
12320
|
-
return (0, crypto$1.createHash)("md5").update(`${
|
|
13032
|
+
return (0, crypto$1.createHash)("md5").update(`${require_version.VERSION}:${content}`).digest("hex");
|
|
12321
13033
|
}
|
|
12322
13034
|
function createHeaderComments({ title, timestampLabel, author, cloudHost, testCasesCount, plugins, strategies, isUpdate = false }) {
|
|
12323
13035
|
const sectionLabel = isUpdate ? "Changes:" : "Test Configuration:";
|
|
@@ -13230,7 +13942,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
13230
13942
|
for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => require_text.ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
|
|
13231
13943
|
text = require_text.ellipsize(text, tableCellMaxLength);
|
|
13232
13944
|
if (pass) return chalk.default.green("[PASS] ") + text;
|
|
13233
|
-
|
|
13945
|
+
return chalk.default.red(failureType === require_types.ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.default.red.bold(c) : c).join("---");
|
|
13234
13946
|
})]);
|
|
13235
13947
|
return table.toString();
|
|
13236
13948
|
}
|
|
@@ -13321,6 +14033,115 @@ function formatDuration(seconds) {
|
|
|
13321
14033
|
}
|
|
13322
14034
|
//#endregion
|
|
13323
14035
|
//#region src/commands/eval/summary.ts
|
|
14036
|
+
function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
|
|
14037
|
+
if (wasAborted) {
|
|
14038
|
+
const idSuffix = writeToDatabase ? ` (ID: ${chalk.default.cyan(evalId)})` : "";
|
|
14039
|
+
return `${chalk.default.red("✗")} ${completionType} aborted${idSuffix}`;
|
|
14040
|
+
}
|
|
14041
|
+
if (writeToDatabase && shareableUrl) return `${chalk.default.green("✓")} ${completionType} complete: ${shareableUrl}`;
|
|
14042
|
+
if (writeToDatabase && activelySharing) return `${chalk.default.green("✓")} ${completionType} complete`;
|
|
14043
|
+
if (writeToDatabase) return `${chalk.default.green("✓")} ${completionType} complete (ID: ${chalk.default.cyan(evalId)})`;
|
|
14044
|
+
return `${chalk.default.green("✓")} ${completionType} complete`;
|
|
14045
|
+
}
|
|
14046
|
+
function getAbortSummaryLines(targetErrorStatus) {
|
|
14047
|
+
if (targetErrorStatus == null) return [];
|
|
14048
|
+
return [
|
|
14049
|
+
"",
|
|
14050
|
+
chalk.default.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
|
|
14051
|
+
chalk.default.red(` Target returned HTTP ${targetErrorStatus}`),
|
|
14052
|
+
"",
|
|
14053
|
+
chalk.default.yellow("Possible causes:"),
|
|
14054
|
+
chalk.default.yellow(" • Invalid API key or authentication (401/403)"),
|
|
14055
|
+
chalk.default.yellow(" • Target endpoint does not exist (404)"),
|
|
14056
|
+
chalk.default.yellow(" • Server does not support the request (501)"),
|
|
14057
|
+
"",
|
|
14058
|
+
chalk.default.cyan("To fix: Check your target configuration and credentials.")
|
|
14059
|
+
];
|
|
14060
|
+
}
|
|
14061
|
+
function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
|
|
14062
|
+
if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
|
|
14063
|
+
const lines = ["", `» View results: ${chalk.default.green.bold("promptfoo view")}`];
|
|
14064
|
+
if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
|
|
14065
|
+
lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
|
|
14066
|
+
return lines;
|
|
14067
|
+
}
|
|
14068
|
+
function buildUsageDetails(usage, total) {
|
|
14069
|
+
const parts = [];
|
|
14070
|
+
if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
|
|
14071
|
+
if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
|
|
14072
|
+
if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
|
|
14073
|
+
if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
14074
|
+
return parts;
|
|
14075
|
+
}
|
|
14076
|
+
function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
|
|
14077
|
+
const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
|
|
14078
|
+
const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
|
|
14079
|
+
if (!hasEvalTokens && !hasGradingTokens) return [];
|
|
14080
|
+
const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
|
|
14081
|
+
const evalTokens = {
|
|
14082
|
+
prompt: tokenUsage.prompt || 0,
|
|
14083
|
+
completion: tokenUsage.completion || 0,
|
|
14084
|
+
total: tokenUsage.total || combinedTotal,
|
|
14085
|
+
cached: tokenUsage.cached || 0,
|
|
14086
|
+
numRequests: tokenUsage.numRequests || 0,
|
|
14087
|
+
completionDetails: tokenUsage.completionDetails || {
|
|
14088
|
+
reasoning: 0,
|
|
14089
|
+
acceptedPrediction: 0,
|
|
14090
|
+
rejectedPrediction: 0
|
|
14091
|
+
}
|
|
14092
|
+
};
|
|
14093
|
+
const lines = [`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
|
|
14094
|
+
if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
|
|
14095
|
+
if (evalTokens.total > 0) {
|
|
14096
|
+
const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
|
|
14097
|
+
lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
|
|
14098
|
+
}
|
|
14099
|
+
if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
|
|
14100
|
+
const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
|
|
14101
|
+
lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
|
|
14102
|
+
}
|
|
14103
|
+
lines.push(...getProviderUsageLines(tracker));
|
|
14104
|
+
return lines;
|
|
14105
|
+
}
|
|
14106
|
+
function getProviderUsageLines(tracker) {
|
|
14107
|
+
const providerIds = tracker.getProviderIds();
|
|
14108
|
+
if (providerIds.length <= 1) return [];
|
|
14109
|
+
const sortedProviders = providerIds.map((id) => ({
|
|
14110
|
+
id,
|
|
14111
|
+
usage: tracker.getProviderUsage(id)
|
|
14112
|
+
})).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
|
|
14113
|
+
const lines = ["", chalk.default.bold("Providers:")];
|
|
14114
|
+
for (const { id, usage } of sortedProviders) {
|
|
14115
|
+
if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
|
|
14116
|
+
const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
|
|
14117
|
+
const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
|
|
14118
|
+
const details = buildUsageDetails(usage, displayTotal);
|
|
14119
|
+
const requestInfo = `${usage.numRequests || 0} requests`;
|
|
14120
|
+
const separator = details.length > 0 ? "; " : "";
|
|
14121
|
+
lines.push(` ${chalk.default.gray(`${displayId}:`)} ${chalk.default.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
|
|
14122
|
+
}
|
|
14123
|
+
return lines;
|
|
14124
|
+
}
|
|
14125
|
+
function formatResultPercentage(count, totalTests) {
|
|
14126
|
+
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
14127
|
+
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
14128
|
+
}
|
|
14129
|
+
function formatResultLine(count, label, icon, iconColor, totalTests) {
|
|
14130
|
+
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
|
|
14131
|
+
}
|
|
14132
|
+
function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
|
|
14133
|
+
const totalTests = successes + failures + errors;
|
|
14134
|
+
const errorLabel = errors === 1 ? "error" : "errors";
|
|
14135
|
+
return [
|
|
14136
|
+
"",
|
|
14137
|
+
chalk.default.bold("Results:"),
|
|
14138
|
+
formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green, totalTests),
|
|
14139
|
+
formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red, totalTests),
|
|
14140
|
+
formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red, totalTests),
|
|
14141
|
+
chalk.default.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
|
|
14142
|
+
""
|
|
14143
|
+
];
|
|
14144
|
+
}
|
|
13324
14145
|
/**
|
|
13325
14146
|
* Generate formatted evaluation summary output for CLI display.
|
|
13326
14147
|
*
|
|
@@ -13359,115 +14180,28 @@ function formatDuration(seconds) {
|
|
|
13359
14180
|
* ```
|
|
13360
14181
|
*/
|
|
13361
14182
|
function generateEvalSummary(params) {
|
|
13362
|
-
|
|
13363
|
-
|
|
13364
|
-
|
|
13365
|
-
|
|
13366
|
-
|
|
13367
|
-
|
|
13368
|
-
|
|
13369
|
-
|
|
13370
|
-
|
|
13371
|
-
|
|
13372
|
-
|
|
13373
|
-
|
|
13374
|
-
|
|
13375
|
-
|
|
13376
|
-
|
|
13377
|
-
|
|
13378
|
-
|
|
13379
|
-
|
|
13380
|
-
|
|
13381
|
-
|
|
13382
|
-
|
|
13383
|
-
|
|
13384
|
-
lines.push("");
|
|
13385
|
-
lines.push(chalk.default.cyan("To fix: Check your target configuration and credentials."));
|
|
13386
|
-
}
|
|
13387
|
-
if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
|
|
13388
|
-
lines.push("");
|
|
13389
|
-
lines.push(`» View results: ${chalk.default.green.bold("promptfoo view")}`);
|
|
13390
|
-
if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}`);
|
|
13391
|
-
else lines.push(`» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
|
|
13392
|
-
lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
|
|
13393
|
-
}
|
|
13394
|
-
lines.push("");
|
|
13395
|
-
const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
|
|
13396
|
-
const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
|
|
13397
|
-
if (hasEvalTokens || hasGradingTokens) {
|
|
13398
|
-
const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
|
|
13399
|
-
const evalTokens = {
|
|
13400
|
-
prompt: tokenUsage.prompt || 0,
|
|
13401
|
-
completion: tokenUsage.completion || 0,
|
|
13402
|
-
total: tokenUsage.total || combinedTotal,
|
|
13403
|
-
cached: tokenUsage.cached || 0,
|
|
13404
|
-
completionDetails: tokenUsage.completionDetails || {
|
|
13405
|
-
reasoning: 0,
|
|
13406
|
-
acceptedPrediction: 0,
|
|
13407
|
-
rejectedPrediction: 0
|
|
13408
|
-
}
|
|
13409
|
-
};
|
|
13410
|
-
const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
|
|
13411
|
-
lines.push(`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold(grandTotal.toLocaleString())}`);
|
|
13412
|
-
if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
|
|
13413
|
-
if (evalTokens.total > 0) {
|
|
13414
|
-
const evalParts = [];
|
|
13415
|
-
if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
|
|
13416
|
-
if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
|
|
13417
|
-
if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
|
|
13418
|
-
else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
|
|
13419
|
-
if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13420
|
-
lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
|
|
13421
|
-
}
|
|
13422
|
-
if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
|
|
13423
|
-
const gradingParts = [];
|
|
13424
|
-
if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
|
|
13425
|
-
if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
|
|
13426
|
-
if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
|
|
13427
|
-
else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
|
|
13428
|
-
if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13429
|
-
lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
|
|
13430
|
-
}
|
|
13431
|
-
const providerIds = tracker.getProviderIds();
|
|
13432
|
-
if (providerIds.length > 1) {
|
|
13433
|
-
lines.push("");
|
|
13434
|
-
lines.push(chalk.default.bold("Providers:"));
|
|
13435
|
-
const sortedProviders = providerIds.map((id) => ({
|
|
13436
|
-
id,
|
|
13437
|
-
usage: tracker.getProviderUsage(id)
|
|
13438
|
-
})).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
|
|
13439
|
-
for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
|
|
13440
|
-
const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
|
|
13441
|
-
const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
|
|
13442
|
-
const details = [];
|
|
13443
|
-
if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
|
|
13444
|
-
if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
|
|
13445
|
-
if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
|
|
13446
|
-
else details.push(`${usage.cached.toLocaleString()} cached`);
|
|
13447
|
-
if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
13448
|
-
const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
|
|
13449
|
-
lines.push(` ${chalk.default.gray(displayId + ":")} ${chalk.default.white(displayTotal.toLocaleString())}${breakdown}`);
|
|
13450
|
-
}
|
|
13451
|
-
}
|
|
13452
|
-
}
|
|
13453
|
-
lines.push("");
|
|
13454
|
-
const totalTests = successes + failures + errors;
|
|
13455
|
-
const formatResultPercentage = (count) => {
|
|
13456
|
-
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
13457
|
-
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
13458
|
-
};
|
|
13459
|
-
const formatResultLine = (count, label, icon, iconColor) => {
|
|
13460
|
-
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
|
|
13461
|
-
};
|
|
13462
|
-
const errorLabel = errors === 1 ? "error" : "errors";
|
|
13463
|
-
lines.push(chalk.default.bold("Results:"));
|
|
13464
|
-
lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
|
|
13465
|
-
lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
|
|
13466
|
-
lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
|
|
13467
|
-
const durationDisplay = formatDuration(duration);
|
|
13468
|
-
lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
|
|
13469
|
-
lines.push("");
|
|
13470
|
-
return lines;
|
|
14183
|
+
return [
|
|
14184
|
+
getCompletionMessage({
|
|
14185
|
+
completionType: params.isRedteam ? "Red team" : "Eval",
|
|
14186
|
+
evalId: params.evalId,
|
|
14187
|
+
shareableUrl: params.shareableUrl,
|
|
14188
|
+
wasAborted: params.targetErrorStatus != null,
|
|
14189
|
+
writeToDatabase: params.writeToDatabase,
|
|
14190
|
+
activelySharing: params.activelySharing ?? false
|
|
14191
|
+
}),
|
|
14192
|
+
...getAbortSummaryLines(params.targetErrorStatus),
|
|
14193
|
+
...getGuidanceLines({
|
|
14194
|
+
writeToDatabase: params.writeToDatabase,
|
|
14195
|
+
shareableUrl: params.shareableUrl,
|
|
14196
|
+
wantsToShare: params.wantsToShare,
|
|
14197
|
+
activelySharing: params.activelySharing ?? false,
|
|
14198
|
+
hasExplicitDisable: params.hasExplicitDisable,
|
|
14199
|
+
cloudEnabled: params.cloudEnabled
|
|
14200
|
+
}),
|
|
14201
|
+
"",
|
|
14202
|
+
...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
|
|
14203
|
+
...getResultsLines(params)
|
|
14204
|
+
];
|
|
13471
14205
|
}
|
|
13472
14206
|
//#endregion
|
|
13473
14207
|
//#region src/commands/retry.ts
|
|
@@ -14237,6 +14971,26 @@ async function doRedteamRun(options) {
|
|
|
14237
14971
|
}
|
|
14238
14972
|
//#endregion
|
|
14239
14973
|
//#region src/index.ts
|
|
14974
|
+
/**
|
|
14975
|
+
* Shallow-clone a test case so the caller can swap in resolved ApiProvider
|
|
14976
|
+
* instances on `options.provider` / `assert[].provider` without leaking those
|
|
14977
|
+
* mutations back to the input. The input may alias the unified config written
|
|
14978
|
+
* to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
|
|
14979
|
+
* Anthropic's client) holds circular references that break drizzle's JSON
|
|
14980
|
+
* serialization on `evalRecord.save()`. Fixes #8687.
|
|
14981
|
+
*
|
|
14982
|
+
* Detaches only `options` and `assert[]`. Other reference fields (`provider`,
|
|
14983
|
+
* `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
|
|
14984
|
+
* those by reference rather than mutating in place. `assert-set` children are
|
|
14985
|
+
* not deep-cloned because the resolve loop skips `assert-set`; if that ever
|
|
14986
|
+
* changes, extend this helper.
|
|
14987
|
+
*/
|
|
14988
|
+
function cloneTestForResolve(test) {
|
|
14989
|
+
const cloned = { ...test };
|
|
14990
|
+
if (test.options) cloned.options = { ...test.options };
|
|
14991
|
+
if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
|
|
14992
|
+
return cloned;
|
|
14993
|
+
}
|
|
14240
14994
|
async function evaluate(testSuite, options = {}) {
|
|
14241
14995
|
if (testSuite.writeLatestResults) await runDbMigrations();
|
|
14242
14996
|
const loadedProviders = await require_providers.loadApiProviders(testSuite.providers, { env: testSuite.env });
|
|
@@ -14256,22 +15010,24 @@ async function evaluate(testSuite, options = {}) {
|
|
|
14256
15010
|
nunjucksFilters: await require_util.readFilters(testSuite.nunjucksFilters || {}),
|
|
14257
15011
|
prompts: await require_graders.processPrompts(testSuite.prompts)
|
|
14258
15012
|
};
|
|
14259
|
-
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
14260
|
-
|
|
15013
|
+
if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
|
|
15014
|
+
constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
|
|
15015
|
+
if (constructedTestSuite.defaultTest.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
14261
15016
|
env: testSuite.env,
|
|
14262
15017
|
basePath: require_logger.state.basePath
|
|
14263
15018
|
});
|
|
14264
|
-
if (constructedTestSuite.defaultTest
|
|
15019
|
+
if (constructedTestSuite.defaultTest.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
14265
15020
|
env: testSuite.env,
|
|
14266
15021
|
basePath: require_logger.state.basePath
|
|
14267
15022
|
});
|
|
14268
15023
|
}
|
|
14269
|
-
|
|
15024
|
+
constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
|
|
15025
|
+
for (const test of constructedTestSuite.tests) {
|
|
14270
15026
|
if (test.options?.provider && !require_types.isApiProvider(test.options.provider)) test.options.provider = await require_providers.resolveProvider(test.options.provider, providerMap, {
|
|
14271
15027
|
env: testSuite.env,
|
|
14272
15028
|
basePath: require_logger.state.basePath
|
|
14273
15029
|
});
|
|
14274
|
-
|
|
15030
|
+
for (const assertion of test.assert || []) {
|
|
14275
15031
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
14276
15032
|
if (assertion.provider && !require_types.isApiProvider(assertion.provider)) assertion.provider = await require_providers.resolveProvider(assertion.provider, providerMap, {
|
|
14277
15033
|
env: testSuite.env,
|