promptfoo 0.121.4 → 0.121.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
- package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
- package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
- package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
- package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
- package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
- package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
- package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
- package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
- package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
- package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
- package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
- package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
- package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
- package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
- package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
- package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
- package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
- package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
- package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
- package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
- package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
- package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
- package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
- package/dist/src/app/index.html +27 -2
- package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
- package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
- package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
- package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
- package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
- package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
- package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
- package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
- package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
- package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
- package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
- package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
- package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
- package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
- package/dist/src/cache-BlC6aeJ0.js +3 -0
- package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
- package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
- package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
- package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
- package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
- package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
- package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
- package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
- package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
- package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
- package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
- package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
- package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
- package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
- package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
- package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
- package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
- package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
- package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
- package/dist/src/cloud-CoD5OacT.js +3 -0
- package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
- package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
- package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
- package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
- package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
- package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
- package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
- package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
- package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
- package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
- package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
- package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
- package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
- package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
- package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
- package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
- package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
- package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
- package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
- package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
- package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
- package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
- package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
- package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
- package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
- package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
- package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
- package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
- package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
- package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
- package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
- package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
- package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
- package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
- package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
- package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
- package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
- package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
- package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
- package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
- package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
- package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
- package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
- package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
- package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
- package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
- package/dist/src/evalResult-BBK58h2B.js +3 -0
- package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
- package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
- package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
- package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
- package/dist/src/evaluator-DgLKaZk8.js +3 -0
- package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
- package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
- package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
- package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
- package/dist/src/fetch-8viavNv8.js +3 -0
- package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
- package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
- package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
- package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
- package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
- package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
- package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
- package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
- package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
- package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
- package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
- package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
- package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
- package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
- package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
- package/dist/src/graders-C84JI-m5.js +2 -0
- package/dist/src/graders-CBbd0K0Q.cjs +2 -0
- package/dist/src/graders-CbQqpHSN.js +3 -0
- package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
- package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
- package/dist/src/graders-DS42d3ZG.js +2 -0
- package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
- package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
- package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
- package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
- package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
- package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
- package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
- package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
- package/dist/src/index.cjs +1052 -296
- package/dist/src/index.d.cts +124 -13
- package/dist/src/index.d.ts +125 -14
- package/dist/src/index.js +1018 -262
- package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
- package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
- package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
- package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
- package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
- package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
- package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
- package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
- package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
- package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
- package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
- package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
- package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
- package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
- package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
- package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
- package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
- package/dist/src/main.js +242 -198
- package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
- package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
- package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
- package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
- package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
- package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
- package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
- package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
- package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
- package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
- package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
- package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
- package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
- package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
- package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
- package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
- package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
- package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
- package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
- package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
- package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
- package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
- package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
- package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
- package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
- package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
- package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
- package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
- package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
- package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
- package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
- package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
- package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
- package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
- package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
- package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
- package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
- package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
- package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
- package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
- package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
- package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
- package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
- package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
- package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
- package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
- package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
- package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
- package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
- package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
- package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
- package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
- package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
- package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
- package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
- package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
- package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
- package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
- package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
- package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
- package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
- package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
- package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
- package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
- package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
- package/dist/src/rubyUtils-4hjGxvju.js +3 -0
- package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
- package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
- package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
- package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
- package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
- package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
- package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
- package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
- package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
- package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
- package/dist/src/server/index.js +1067 -265
- package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
- package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
- package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
- package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
- package/dist/src/server-C_15p79-.js +3 -0
- package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
- package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
- package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
- package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
- package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
- package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
- package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
- package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
- package/dist/src/store-BELqNwvz.js +3 -0
- package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
- package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
- package/dist/src/store-uQZ4AjPe.cjs +2 -0
- package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
- package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
- package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
- package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
- package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
- package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
- package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
- package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
- package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
- package/dist/src/telemetry-ZdPZc0fm.js +3 -0
- package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
- package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
- package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
- package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
- package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
- package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
- package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
- package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
- package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
- package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
- package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
- package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
- package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
- package/dist/src/transform-BQt0BeAW.js +3 -0
- package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
- package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
- package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
- package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
- package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
- package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
- package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
- package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
- package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
- package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
- package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
- package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
- package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
- package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
- package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
- package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
- package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
- package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
- package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
- package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
- package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
- package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
- package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
- package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
- package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
- package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
- package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
- package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
- package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
- package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
- package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
- package/dist/src/version-0frU0UTr.js +16 -0
- package/dist/src/version-CbpiUINz.js +17 -0
- package/dist/src/version-CbuBKu2U.js +16 -0
- package/dist/src/version-D9zu9FWB.cjs +27 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +22 -20
- package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
- package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
- package/dist/src/cache-Cr9oLMUa.js +0 -3
- package/dist/src/cloud-Hphvo8kr.js +0 -3
- package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
- package/dist/src/evalResult-D8MT9p0s.js +0 -3
- package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
- package/dist/src/evaluator-CVessDWe.js +0 -3
- package/dist/src/fetch-C7bGKDlQ.js +0 -3
- package/dist/src/graders-BOAzQEUe.cjs +0 -2
- package/dist/src/graders-D4BTsZdG2.js +0 -3
- package/dist/src/graders-DOJK1XpV.js +0 -2
- package/dist/src/graders-NAv9LcBn.js +0 -2
- package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
- package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
- package/dist/src/server-DCtHUqlp.js +0 -3
- package/dist/src/store-CWOSz6D_.cjs +0 -2
- package/dist/src/store-DCDBhv7B.js +0 -3
- package/dist/src/telemetry-C1IqxcdW.js +0 -3
- package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
- package/dist/src/transform-M6ITAESf.js +0 -3
- /package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
const require_logger = require("./logger-COuQb2xB.cjs");
|
|
2
2
|
const require_invariant = require("./invariant-kfQ8Bu82.cjs");
|
|
3
|
-
const
|
|
4
|
-
const
|
|
5
|
-
const
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const require_chat = require("./chat-
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
-
const require_messages = require("./messages-
|
|
19
|
-
const require_responses = require("./responses-
|
|
20
|
-
const
|
|
21
|
-
const
|
|
22
|
-
const require_accounts = require("./accounts-Dy17bs4D.cjs");
|
|
3
|
+
const require_fetch = require("./fetch-NuqXW1Xb.cjs");
|
|
4
|
+
const require_types = require("./types-CgG2rKiW.cjs");
|
|
5
|
+
const require_accounts = require("./accounts-BIFntVWB.cjs");
|
|
6
|
+
const require_esm = require("./esm-B_rGuPTo.cjs");
|
|
7
|
+
const require_render = require("./render-tG6ir9_g.cjs");
|
|
8
|
+
const require_server = require("./server-BEECpeGG.cjs");
|
|
9
|
+
const require_providers = require("./providers-eDShy16E.cjs");
|
|
10
|
+
const require_pythonUtils = require("./pythonUtils-CoLaCwNY.cjs");
|
|
11
|
+
const require_fileExtensions = require("./fileExtensions-D9h-8Wxg.cjs");
|
|
12
|
+
const require_util = require("./util-DvpHnLt0.cjs");
|
|
13
|
+
const require_tokenUsageUtils = require("./tokenUsageUtils-C9odhsbW.cjs");
|
|
14
|
+
const require_cache = require("./cache-DGg-yTZG.cjs");
|
|
15
|
+
const require_chat = require("./chat-vYqqv1gP.cjs");
|
|
16
|
+
const require_transform = require("./transform-Dg4LcO1Y.cjs");
|
|
17
|
+
const require_embedding = require("./embedding-BXhN5lCH.cjs");
|
|
18
|
+
const require_messages = require("./messages-BnsVHUnm.cjs");
|
|
19
|
+
const require_responses = require("./responses-CF-ayauu.cjs");
|
|
20
|
+
const require_createHash = require("./createHash-VvBIc-AW.cjs");
|
|
21
|
+
const require_utils = require("./utils-DkVeShIB.cjs");
|
|
23
22
|
let fs = require("fs");
|
|
24
23
|
fs = require_logger.__toESM(fs);
|
|
25
24
|
let path = require("path");
|
|
@@ -27,1739 +26,1489 @@ path = require_logger.__toESM(path);
|
|
|
27
26
|
let js_yaml = require("js-yaml");
|
|
28
27
|
js_yaml = require_logger.__toESM(js_yaml);
|
|
29
28
|
let node_async_hooks = require("node:async_hooks");
|
|
29
|
+
let dedent = require("dedent");
|
|
30
|
+
dedent = require_logger.__toESM(dedent);
|
|
31
|
+
let zod = require("zod");
|
|
32
|
+
zod = require_logger.__toESM(zod);
|
|
33
|
+
let fs_promises = require("fs/promises");
|
|
30
34
|
let node_fs_promises = require("node:fs/promises");
|
|
31
35
|
node_fs_promises = require_logger.__toESM(node_fs_promises);
|
|
32
36
|
let node_path = require("node:path");
|
|
33
37
|
node_path = require_logger.__toESM(node_path);
|
|
38
|
+
let csv_parse_sync = require("csv-parse/sync");
|
|
39
|
+
let glob = require("glob");
|
|
34
40
|
let child_process = require("child_process");
|
|
35
41
|
let python_shell = require("python-shell");
|
|
36
42
|
let rfdc = require("rfdc");
|
|
37
43
|
rfdc = require_logger.__toESM(rfdc);
|
|
38
|
-
let dedent = require("dedent");
|
|
39
|
-
dedent = require_logger.__toESM(dedent);
|
|
40
|
-
let fs_promises = require("fs/promises");
|
|
41
|
-
let glob = require("glob");
|
|
42
|
-
let zod = require("zod");
|
|
43
|
-
zod = require_logger.__toESM(zod);
|
|
44
|
-
let csv_parse_sync = require("csv-parse/sync");
|
|
45
44
|
let cli_progress = require("cli-progress");
|
|
46
45
|
cli_progress = require_logger.__toESM(cli_progress);
|
|
47
|
-
//#region src/
|
|
46
|
+
//#region src/scheduler/providerCallExecutionContext.ts
|
|
47
|
+
const providerCallExecutionContext = new node_async_hooks.AsyncLocalStorage();
|
|
48
|
+
function getProviderCallExecutionContext() {
|
|
49
|
+
return providerCallExecutionContext.getStore();
|
|
50
|
+
}
|
|
51
|
+
function withProviderCallExecutionContext(context, fn) {
|
|
52
|
+
return providerCallExecutionContext.run(context, fn);
|
|
53
|
+
}
|
|
54
|
+
//#endregion
|
|
55
|
+
//#region src/matchers/providers.ts
|
|
48
56
|
/**
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
57
|
+
* Helper to call provider with consistent context propagation pattern.
|
|
58
|
+
* Spreads the optional context and merges with prompt label and vars.
|
|
59
|
+
* Also reuses evaluator scheduler context for cancellation, rate limits,
|
|
60
|
+
* and grouped grading provider calls when present.
|
|
52
61
|
*
|
|
53
|
-
*
|
|
54
|
-
*
|
|
55
|
-
*
|
|
56
|
-
* @param prompt - The prompt text
|
|
57
|
-
* @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
|
|
58
|
-
* @param providerResponse - Optional full provider response for contextTransform
|
|
59
|
-
* @returns The resolved context string or array of strings
|
|
60
|
-
* @throws Error if context cannot be resolved or transform fails
|
|
62
|
+
* IMPORTANT: Spread order matters - context is spread first, then prompt/vars
|
|
63
|
+
* override. This ensures originalProvider from context is preserved while
|
|
64
|
+
* allowing this call to specify its own prompt metadata.
|
|
61
65
|
*/
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
66
|
+
function callProviderWithContext(provider, prompt, label, vars, context) {
|
|
67
|
+
const callApiContext = {
|
|
68
|
+
...context,
|
|
69
|
+
prompt: {
|
|
70
|
+
raw: prompt,
|
|
71
|
+
label
|
|
72
|
+
},
|
|
73
|
+
vars
|
|
74
|
+
};
|
|
75
|
+
const executionContext = getProviderCallExecutionContext();
|
|
76
|
+
const callApiOptions = executionContext?.abortSignal ? { abortSignal: executionContext.abortSignal } : void 0;
|
|
77
|
+
const callApi = () => callApiOptions ? provider.callApi(prompt, callApiContext, callApiOptions) : provider.callApi(prompt, callApiContext);
|
|
78
|
+
const executeCall = () => {
|
|
79
|
+
if (executionContext?.rateLimitRegistry && !require_providers.isRateLimitWrapped(provider)) return executionContext.rateLimitRegistry.execute(provider, callApi, require_providers.createProviderRateLimitOptions());
|
|
80
|
+
return callApi();
|
|
81
|
+
};
|
|
82
|
+
if (executionContext?.providerCallQueue) return executionContext.providerCallQueue.enqueue(provider.id(), executeCall);
|
|
83
|
+
return executeCall();
|
|
84
|
+
}
|
|
85
|
+
async function loadFromProviderOptions(provider) {
|
|
86
|
+
require_invariant.invariant(typeof provider === "object", `Provider must be an object, but received a ${typeof provider}: ${provider}`);
|
|
87
|
+
require_invariant.invariant(!Array.isArray(provider), `Provider must be an object, but received an array: ${JSON.stringify(provider)}`);
|
|
88
|
+
require_invariant.invariant(provider.id, "Provider supplied to assertion must have an id");
|
|
89
|
+
return require_providers.loadApiProvider(provider.id, {
|
|
90
|
+
options: provider,
|
|
91
|
+
basePath: require_logger.state.basePath
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
function isSimulatedUserProviderConfig(provider) {
|
|
95
|
+
if (typeof provider === "string") return provider === "promptfoo:simulated-user";
|
|
96
|
+
if (!provider || typeof provider !== "object" || Array.isArray(provider)) return false;
|
|
97
|
+
if (typeof provider.id === "function") return provider.id() === "promptfoo:simulated-user";
|
|
98
|
+
const providerId = provider.id;
|
|
99
|
+
if (typeof providerId === "string") return providerId === "promptfoo:simulated-user";
|
|
100
|
+
return Object.values(provider).some((providerTypeConfig) => isSimulatedUserProviderConfig(providerTypeConfig));
|
|
101
|
+
}
|
|
102
|
+
async function getGradingProvider(type, provider, defaultProvider) {
|
|
103
|
+
let finalProvider;
|
|
104
|
+
if (typeof provider === "string") finalProvider = await require_providers.loadApiProvider(provider, { basePath: require_logger.state.basePath });
|
|
105
|
+
else if (provider != null && typeof provider === "object" && typeof provider.id === "function") finalProvider = provider;
|
|
106
|
+
else if (provider != null && typeof provider === "object") {
|
|
107
|
+
const typeValue = provider[type];
|
|
108
|
+
if (typeValue) finalProvider = await getGradingProvider(type, typeValue, defaultProvider);
|
|
109
|
+
else if (provider.id) finalProvider = await loadFromProviderOptions(provider);
|
|
110
|
+
else if (Array.isArray(provider)) throw new Error(`Provider must be an object or string, but received an array.\n\nCheck that the provider ${JSON.stringify(provider[0], null, 2)} is not nested in an array.`);
|
|
111
|
+
else throw new Error(`Invalid provider definition for output type '${type}': ${JSON.stringify(provider, null, 2)}`);
|
|
112
|
+
} else {
|
|
113
|
+
const defaultTest = require_logger.state.config?.defaultTest;
|
|
114
|
+
const defaultTestObj = typeof defaultTest === "object" ? defaultTest : null;
|
|
115
|
+
const cfg = [
|
|
116
|
+
defaultTestObj?.provider || void 0,
|
|
117
|
+
defaultTestObj?.options?.provider?.text || void 0,
|
|
118
|
+
defaultTestObj?.options?.provider || void 0
|
|
119
|
+
].find((candidateProvider) => {
|
|
120
|
+
if (!candidateProvider) return false;
|
|
121
|
+
if (isSimulatedUserProviderConfig(candidateProvider)) {
|
|
122
|
+
require_logger.logger.debug("[Grading] Skipping promptfoo:simulated-user as an implicit grader fallback");
|
|
123
|
+
return false;
|
|
71
124
|
}
|
|
72
|
-
|
|
73
|
-
}
|
|
74
|
-
} else if (fallbackContext) contextValue = fallbackContext;
|
|
75
|
-
if (assertion.contextTransform) try {
|
|
76
|
-
const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
|
|
77
|
-
const transformed = await require_transform.transform(assertion.contextTransform, outputForTransform, {
|
|
78
|
-
vars: test.vars,
|
|
79
|
-
prompt: { label: prompt },
|
|
80
|
-
...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
|
|
125
|
+
return true;
|
|
81
126
|
});
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
127
|
+
if (cfg) {
|
|
128
|
+
finalProvider = await getGradingProvider(type, cfg, defaultProvider);
|
|
129
|
+
if (finalProvider) require_logger.logger.debug("[Grading] Using provider from defaultTest fallback", { providerId: finalProvider.id() });
|
|
130
|
+
} else finalProvider = defaultProvider;
|
|
86
131
|
}
|
|
87
|
-
|
|
88
|
-
return contextValue;
|
|
132
|
+
return finalProvider;
|
|
89
133
|
}
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
134
|
+
async function getAndCheckProvider(type, provider, defaultProvider, checkName) {
|
|
135
|
+
const matchedProvider = await getGradingProvider(type, provider, defaultProvider);
|
|
136
|
+
if (!matchedProvider) if (defaultProvider) {
|
|
137
|
+
require_logger.logger.warn("[Grading] Falling back to default provider", {
|
|
138
|
+
checkName,
|
|
139
|
+
type
|
|
140
|
+
});
|
|
141
|
+
return defaultProvider;
|
|
142
|
+
} else throw new Error(`No provider of type ${type} found for '${checkName}'`);
|
|
143
|
+
let isValidProviderType = true;
|
|
144
|
+
if (type === "embedding") isValidProviderType = "callEmbeddingApi" in matchedProvider || "callSimilarityApi" in matchedProvider;
|
|
145
|
+
else if (type === "classification") isValidProviderType = "callClassificationApi" in matchedProvider;
|
|
146
|
+
else if (type === "moderation") isValidProviderType = "callModerationApi" in matchedProvider;
|
|
147
|
+
if (!isValidProviderType) {
|
|
148
|
+
if (provider) throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
|
|
149
|
+
if (defaultProvider) {
|
|
150
|
+
require_logger.logger.warn("[Grading] Falling back to default provider after type check failed", {
|
|
151
|
+
checkName,
|
|
152
|
+
providerId: matchedProvider.id(),
|
|
153
|
+
type
|
|
154
|
+
});
|
|
155
|
+
return defaultProvider;
|
|
156
|
+
}
|
|
157
|
+
throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
|
|
158
|
+
}
|
|
159
|
+
return matchedProvider;
|
|
96
160
|
}
|
|
97
161
|
//#endregion
|
|
98
|
-
//#region src/
|
|
99
|
-
const
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
return Object.freeze(ret);
|
|
114
|
-
}
|
|
115
|
-
async function loadFromJavaScriptFile(filePath, functionName, args) {
|
|
116
|
-
const requiredModule = await require_esm.importModule(filePath, functionName);
|
|
117
|
-
if (functionName && typeof requiredModule[functionName] === "function") return requiredModule[functionName](...args);
|
|
118
|
-
else if (typeof requiredModule === "function") return requiredModule(...args);
|
|
119
|
-
else if (requiredModule.default && typeof requiredModule.default === "function") return requiredModule.default(...args);
|
|
120
|
-
else throw new Error(`Assertion malformed: ${filePath} must export a function or have a default export as a function`);
|
|
121
|
-
}
|
|
122
|
-
function processFileReference(fileRef) {
|
|
123
|
-
const basePath = require_logger.state.basePath || "";
|
|
124
|
-
const filePath = path.default.resolve(basePath, fileRef.slice(7));
|
|
125
|
-
const fileContent = fs.default.readFileSync(filePath, "utf8");
|
|
126
|
-
const extension = path.default.extname(filePath);
|
|
127
|
-
if ([
|
|
128
|
-
".json",
|
|
129
|
-
".yaml",
|
|
130
|
-
".yml"
|
|
131
|
-
].includes(extension)) return js_yaml.default.load(fileContent);
|
|
132
|
-
else if (extension === ".txt") return fileContent.trim();
|
|
133
|
-
else throw new Error(`Unsupported file type: ${filePath}`);
|
|
162
|
+
//#region src/providers/anthropic/defaults.ts
|
|
163
|
+
const DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-6";
|
|
164
|
+
/**
|
|
165
|
+
* Helper function to create a lazy-loaded provider. This allows the .env file to be
|
|
166
|
+
* loaded first before the provider is initialized.
|
|
167
|
+
* @param factory Factory function that creates provider instance with optional env
|
|
168
|
+
* @returns Object with getter that lazily initializes the provider with the latest env
|
|
169
|
+
*/
|
|
170
|
+
function createLazyProvider(factory) {
|
|
171
|
+
const instances = /* @__PURE__ */ new Map();
|
|
172
|
+
return { getInstance(env) {
|
|
173
|
+
const cacheKey = env ? JSON.stringify(env) : "";
|
|
174
|
+
if (!instances.has(cacheKey)) instances.set(cacheKey, factory(env));
|
|
175
|
+
return instances.get(cacheKey);
|
|
176
|
+
} };
|
|
134
177
|
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
178
|
+
var AnthropicLlmRubricProvider = class extends require_messages.AnthropicMessagesProvider {
|
|
179
|
+
constructor(modelName, options = {}) {
|
|
180
|
+
const { env, config = {} } = options;
|
|
181
|
+
super(modelName, {
|
|
182
|
+
env,
|
|
183
|
+
config: {
|
|
184
|
+
tool_choice: {
|
|
185
|
+
type: "tool",
|
|
186
|
+
name: "grade_output"
|
|
187
|
+
},
|
|
188
|
+
tools: [{
|
|
189
|
+
name: "grade_output",
|
|
190
|
+
description: "Grade the given output based on specific criteria",
|
|
191
|
+
input_schema: {
|
|
192
|
+
type: "object",
|
|
193
|
+
properties: {
|
|
194
|
+
pass: {
|
|
195
|
+
type: "boolean",
|
|
196
|
+
description: "Whether the output passes the criteria"
|
|
197
|
+
},
|
|
198
|
+
score: {
|
|
199
|
+
type: "number",
|
|
200
|
+
description: "The score assigned to the output"
|
|
201
|
+
},
|
|
202
|
+
reason: {
|
|
203
|
+
type: "string",
|
|
204
|
+
description: "The reason for the given grade"
|
|
205
|
+
}
|
|
206
|
+
},
|
|
207
|
+
required: [
|
|
208
|
+
"pass",
|
|
209
|
+
"score",
|
|
210
|
+
"reason"
|
|
211
|
+
]
|
|
212
|
+
}
|
|
213
|
+
}],
|
|
214
|
+
...config
|
|
215
|
+
}
|
|
216
|
+
});
|
|
217
|
+
}
|
|
218
|
+
async callApi(prompt) {
|
|
219
|
+
const result = await super.callApi(prompt);
|
|
220
|
+
if (typeof result.output !== "string") return { error: `Anthropic LLM rubric grader - malformed non-string output\n\n${JSON.stringify(result.output)}` };
|
|
221
|
+
try {
|
|
222
|
+
return { output: JSON.parse(result.output).input };
|
|
223
|
+
} catch (err) {
|
|
224
|
+
return { error: `Anthropic LLM rubric grader - invalid JSON: ${err}\n\n${result.output}` };
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
};
|
|
228
|
+
const gradingProviderFactory = createLazyProvider((env) => new require_messages.AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
229
|
+
const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
230
|
+
const webSearchProviderFactory = createLazyProvider((env) => new require_messages.AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, {
|
|
231
|
+
env,
|
|
232
|
+
config: { tools: [{
|
|
233
|
+
type: "web_search_20250305",
|
|
234
|
+
name: "web_search",
|
|
235
|
+
max_uses: 5
|
|
236
|
+
}] }
|
|
237
|
+
}));
|
|
238
|
+
/**
|
|
239
|
+
* Gets all default Anthropic providers with the given environment overrides
|
|
240
|
+
* @param env - Optional environment overrides
|
|
241
|
+
* @returns Anthropic provider implementations for various functions
|
|
242
|
+
*/
|
|
243
|
+
function getAnthropicProviders(env) {
|
|
244
|
+
const gradingProvider = gradingProviderFactory.getInstance(env);
|
|
245
|
+
return {
|
|
246
|
+
gradingJsonProvider: gradingProvider,
|
|
247
|
+
gradingProvider,
|
|
248
|
+
llmRubricProvider: llmRubricProviderFactory.getInstance(env),
|
|
249
|
+
suggestionsProvider: gradingProvider,
|
|
250
|
+
synthesizeProvider: gradingProvider,
|
|
251
|
+
webSearchProvider: webSearchProviderFactory.getInstance(env)
|
|
252
|
+
};
|
|
138
253
|
}
|
|
139
254
|
//#endregion
|
|
140
|
-
//#region src/
|
|
141
|
-
const
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
|
|
155
|
-
3. He published 4 papers in 1905. There is no mention about papers he wrote in given the context. So [Not Attributed]
|
|
156
|
-
4. Einstein moved to Switzerland in 1895. There is not supporting evidence for this in the given the context. So [Not Attributed]
|
|
157
|
-
|
|
158
|
-
context:{{context}}
|
|
159
|
-
answer:{{groundTruth}}
|
|
160
|
-
classification:
|
|
161
|
-
`;
|
|
162
|
-
const CONTEXT_RECALL_ATTRIBUTED_TOKEN = "[Attributed]";
|
|
163
|
-
const CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN = "[Not Attributed]";
|
|
164
|
-
const CONTEXT_RELEVANCE = `Please extract relevant sentences from the provided context that is absolutely required answer the following query. If no relevant sentences are found, or if you believe the query cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.
|
|
165
|
-
|
|
166
|
-
query: {{query}}
|
|
167
|
-
context: {{context}}
|
|
168
|
-
candidate sentences:
|
|
169
|
-
`;
|
|
170
|
-
const CONTEXT_RELEVANCE_BAD = "Insufficient Information";
|
|
171
|
-
const CONTEXT_FAITHFULNESS_LONGFORM = `Given a question and answer, create one or more statements from each sentence in the given answer.
|
|
172
|
-
question: Who was Albert Einstein and what is he best known for?
|
|
173
|
-
answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
|
|
174
|
-
statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity.
|
|
175
|
-
question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
|
|
176
|
-
answer: alcohol
|
|
177
|
-
statements:\nCadmium Chloride is slightly soluble in alcohol.
|
|
178
|
-
question: Were Shahul and Jithin of the same nationality?
|
|
179
|
-
answer: They were from different countries.
|
|
180
|
-
statements:\nShahul and Jithin were from different countries.
|
|
181
|
-
question:{{question}}
|
|
182
|
-
answer: {{answer}}
|
|
183
|
-
statements:\n`;
|
|
184
|
-
const CONTEXT_FAITHFULNESS_NLI_STATEMENTS = `Prompt: Natural language inference
|
|
185
|
-
Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format.
|
|
186
|
-
|
|
187
|
-
Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.
|
|
188
|
-
statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n
|
|
189
|
-
Answer:
|
|
190
|
-
1. John is majoring in Biology.
|
|
191
|
-
Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No.
|
|
192
|
-
2. John is taking a course on Artificial Intelligence.
|
|
193
|
-
Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No.
|
|
194
|
-
3. John is a dedicated student.
|
|
195
|
-
Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes.
|
|
196
|
-
4. John has a part-time job.
|
|
197
|
-
Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No.
|
|
198
|
-
5. John is interested in computer programming.
|
|
199
|
-
Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes.
|
|
200
|
-
Final verdict for each statement in order: No. No. Yes. No. Yes.
|
|
201
|
-
context:\n{{context}}
|
|
202
|
-
statements:\n{{statements|join("\\n")}}
|
|
203
|
-
Answer:
|
|
204
|
-
`;
|
|
205
|
-
//#endregion
|
|
206
|
-
//#region src/prompts/grading.ts
|
|
207
|
-
const DEFAULT_GRADING_PROMPT = JSON.stringify([{
|
|
208
|
-
role: "system",
|
|
209
|
-
content: dedent.default`You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
210
|
-
|
|
211
|
-
Examples:
|
|
212
|
-
|
|
213
|
-
<Output>Hello world</Output>
|
|
214
|
-
<Rubric>Content contains a greeting</Rubric>
|
|
215
|
-
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
|
|
216
|
-
|
|
217
|
-
<Output>Avast ye swabs, repel the invaders!</Output>
|
|
218
|
-
<Rubric>Does not speak like a pirate</Rubric>
|
|
219
|
-
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}`
|
|
220
|
-
}, {
|
|
221
|
-
role: "user",
|
|
222
|
-
content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
|
|
223
|
-
}]);
|
|
224
|
-
const PROMPTFOO_FACTUALITY_PROMPT = JSON.stringify([{
|
|
225
|
-
role: "system",
|
|
226
|
-
content: dedent.default`
|
|
227
|
-
You are a precise factuality evaluator that compares a submitted answer to an expert answer.
|
|
228
|
-
|
|
229
|
-
Your task is to analyze the factual content while ignoring differences in style, grammar, or punctuation.
|
|
230
|
-
You must categorize the submission into one of these options:
|
|
231
|
-
|
|
232
|
-
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
|
|
233
|
-
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
|
|
234
|
-
(C) The submitted answer contains all the same details as the expert answer.
|
|
235
|
-
(D) There is a disagreement between the submitted answer and the expert answer.
|
|
236
|
-
(E) The answers differ, but these differences don't matter from the perspective of factuality.
|
|
237
|
-
|
|
238
|
-
Respond ONLY with a JSON object in this format:
|
|
239
|
-
{
|
|
240
|
-
"category": "[LETTER]",
|
|
241
|
-
"reason": "[DETAILED EXPLANATION]"
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
- The "category" must be a single letter A, B, C, D, or E.
|
|
245
|
-
- Provide a clear, detailed explanation in the "reason" field.
|
|
246
|
-
- Your response must be valid JSON with no additional text.`
|
|
247
|
-
}, {
|
|
248
|
-
role: "user",
|
|
249
|
-
content: dedent.default`
|
|
250
|
-
I need you to compare these answers:
|
|
251
|
-
|
|
252
|
-
<question>
|
|
253
|
-
{{input}}
|
|
254
|
-
</question>
|
|
255
|
-
|
|
256
|
-
<expert_answer>
|
|
257
|
-
{{ideal}}
|
|
258
|
-
</expert_answer>
|
|
259
|
-
|
|
260
|
-
<submitted_answer>
|
|
261
|
-
{{completion}}
|
|
262
|
-
</submitted_answer>
|
|
263
|
-
|
|
264
|
-
Please analyze the factual relationship between these answers according to the categories you've been given.`
|
|
265
|
-
}]);
|
|
266
|
-
const OPENAI_CLOSED_QA_PROMPT = JSON.stringify([{
|
|
267
|
-
role: "system",
|
|
268
|
-
content: `You are assessing a submitted answer on a given task based on a criterion. Here is the data:
|
|
269
|
-
[BEGIN DATA]
|
|
270
|
-
***
|
|
271
|
-
[Task]: {{input}}
|
|
272
|
-
***
|
|
273
|
-
[Submission]: {{completion}}
|
|
274
|
-
***
|
|
275
|
-
[Criterion]: {{criteria}}
|
|
276
|
-
***
|
|
277
|
-
[END DATA]
|
|
278
|
-
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
|
|
279
|
-
|
|
280
|
-
Reasoning:`
|
|
281
|
-
}]);
|
|
282
|
-
const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
|
|
283
|
-
role: "system",
|
|
284
|
-
content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
|
|
285
|
-
|
|
286
|
-
Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
|
|
287
|
-
|
|
288
|
-
Your output is going to be copied directly into the program. It should contain the prompt ONLY`
|
|
289
|
-
};
|
|
290
|
-
const SELECT_BEST_PROMPT = JSON.stringify([{
|
|
291
|
-
role: "system",
|
|
292
|
-
content: `You are comparing multiple pieces of text to see which best fits the following criteria: {{criteria}}
|
|
293
|
-
|
|
294
|
-
Here are the pieces of text:
|
|
295
|
-
|
|
296
|
-
{% for output in outputs %}
|
|
297
|
-
<Text index="{{ loop.index0 }}">
|
|
298
|
-
{{ output }}
|
|
299
|
-
</Text>
|
|
300
|
-
{% endfor %}
|
|
301
|
-
|
|
302
|
-
Output the index of the text that best fits the criteria. You must output a single integer.`
|
|
303
|
-
}]);
|
|
304
|
-
const DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([{
|
|
305
|
-
role: "system",
|
|
306
|
-
content: dedent.default`You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
307
|
-
|
|
308
|
-
You MUST search the web when:
|
|
309
|
-
- The rubric asks about current information (prices, weather, news, etc.)
|
|
310
|
-
- Facts need to be verified against recent data
|
|
311
|
-
- The rubric references time-sensitive information
|
|
312
|
-
|
|
313
|
-
Examples:
|
|
314
|
-
|
|
315
|
-
<Output>The current CEO of Microsoft is Satya Nadella</Output>
|
|
316
|
-
<Rubric>Contains accurate information about Microsoft's leadership</Rubric>
|
|
317
|
-
{"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
|
|
318
|
-
|
|
319
|
-
<Output>Bitcoin is trading at $45,000</Output>
|
|
320
|
-
<Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
|
|
321
|
-
{"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`
|
|
322
|
-
}, {
|
|
323
|
-
role: "user",
|
|
324
|
-
content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
|
|
325
|
-
}]);
|
|
326
|
-
const TRAJECTORY_GOAL_SUCCESS_PROMPT = JSON.stringify([{
|
|
327
|
-
role: "system",
|
|
328
|
-
content: dedent.default`You are grading whether an AI agent successfully completed a goal based on its final output and a summarized execution trajectory. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
329
|
-
|
|
330
|
-
Judge end-to-end success, not stylistic perfection.
|
|
331
|
-
Use the trajectory as evidence for what the agent actually did.
|
|
332
|
-
Give partial credit when the agent made progress but did not fully achieve the goal.
|
|
333
|
-
|
|
334
|
-
Examples:
|
|
335
|
-
|
|
336
|
-
<Goal>Find the order status and tell the user whether it has shipped</Goal>
|
|
337
|
-
<Trajectory>{"stepCount":2,"steps":[{"index":1,"type":"tool","name":"search_orders"},{"index":2,"type":"message","name":"agent response"}]}</Trajectory>
|
|
338
|
-
<Output>Your order shipped yesterday and should arrive on Tuesday.</Output>
|
|
339
|
-
{"reason":"The agent used the order lookup tool and gave the user the shipping status, so the goal was achieved.","pass":true,"score":1.0}
|
|
340
|
-
|
|
341
|
-
<Goal>Find the order status and tell the user whether it has shipped</Goal>
|
|
342
|
-
<Trajectory>{"stepCount":1,"steps":[{"index":1,"type":"message","name":"agent response"}]}</Trajectory>
|
|
343
|
-
<Output>I cannot check your order right now.</Output>
|
|
344
|
-
{"reason":"The agent did not show evidence of checking the order and did not provide the requested status.","pass":false,"score":0.0}`
|
|
345
|
-
}, {
|
|
346
|
-
role: "user",
|
|
347
|
-
content: dedent.default`<Goal>
|
|
348
|
-
{{ goal }}
|
|
349
|
-
</Goal>
|
|
350
|
-
<Trajectory>
|
|
351
|
-
{{ trajectory }}
|
|
352
|
-
</Trajectory>
|
|
353
|
-
<Output>
|
|
354
|
-
{{ output }}
|
|
355
|
-
</Output>`
|
|
356
|
-
}]);
|
|
357
|
-
//#endregion
|
|
358
|
-
//#region src/prompts/processors/csv.ts
|
|
359
|
-
/**
|
|
360
|
-
* Process a CSV file containing prompts
|
|
361
|
-
*
|
|
362
|
-
* CSV format can be either:
|
|
363
|
-
* 1. Single column with prompt text per line
|
|
364
|
-
* 2. CSV with a 'prompt' column and optional 'label' column
|
|
365
|
-
*
|
|
366
|
-
* @param filePath Path to the CSV file
|
|
367
|
-
* @param basePrompt Base prompt properties to include
|
|
368
|
-
* @returns Array of processed prompts
|
|
369
|
-
*/
|
|
370
|
-
async function processCsvPrompts(filePath, basePrompt) {
|
|
371
|
-
const content = fs.default.readFileSync(filePath, "utf8");
|
|
372
|
-
const delimiter = require_logger.getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
373
|
-
const enforceStrict = require_logger.getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
374
|
-
if (!content.includes(delimiter)) {
|
|
375
|
-
const lines = content.split(/\r?\n/).filter((line) => line.trim());
|
|
376
|
-
const startIndex = lines[0]?.toLowerCase().trim() === "prompt" ? 1 : 0;
|
|
377
|
-
return lines.slice(startIndex).map((line, index) => ({
|
|
378
|
-
...basePrompt,
|
|
379
|
-
raw: line,
|
|
380
|
-
label: basePrompt.label || `Prompt ${index + 1} - ${line}`
|
|
381
|
-
}));
|
|
382
|
-
}
|
|
383
|
-
try {
|
|
384
|
-
return (0, csv_parse_sync.parse)(content, {
|
|
385
|
-
columns: true,
|
|
386
|
-
bom: true,
|
|
387
|
-
delimiter,
|
|
388
|
-
relax_quotes: !enforceStrict,
|
|
389
|
-
skip_empty_lines: true,
|
|
390
|
-
trim: true
|
|
391
|
-
}).filter((row) => row.prompt).map((row, index) => {
|
|
392
|
-
return {
|
|
393
|
-
...basePrompt,
|
|
394
|
-
raw: row.prompt,
|
|
395
|
-
label: row.label || basePrompt.label || `Prompt ${index + 1} - ${row.prompt}`
|
|
396
|
-
};
|
|
397
|
-
});
|
|
398
|
-
} catch {
|
|
399
|
-
const lines = content.split(/\r?\n/).filter((line) => line.trim());
|
|
400
|
-
const startIndex = lines[0]?.toLowerCase().trim() === "prompt" ? 1 : 0;
|
|
401
|
-
return lines.slice(startIndex).map((line, index) => ({
|
|
402
|
-
...basePrompt,
|
|
403
|
-
raw: line,
|
|
404
|
-
label: basePrompt.label || `Prompt ${index + 1} - ${line}`
|
|
405
|
-
}));
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
//#endregion
|
|
409
|
-
//#region src/prompts/processors/executable.ts
|
|
410
|
-
const ANSI_ESCAPE = /\x1b(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])/g;
|
|
411
|
-
function stripText(text) {
|
|
412
|
-
return text.replace(ANSI_ESCAPE, "");
|
|
413
|
-
}
|
|
414
|
-
/**
|
|
415
|
-
* Executable prompt function. Executes any script/binary and returns its output as the prompt.
|
|
416
|
-
* The script receives context as JSON in its arguments.
|
|
417
|
-
* @param scriptPath - Path to the executable script.
|
|
418
|
-
* @param context - Context for the prompt.
|
|
419
|
-
* @returns The prompt output from the script.
|
|
420
|
-
*/
|
|
421
|
-
const executablePromptFunction = async (scriptPath, context) => {
|
|
422
|
-
require_invariant.invariant(context.provider?.id, "provider.id is required");
|
|
423
|
-
const transformedContext = {
|
|
424
|
-
vars: context.vars,
|
|
425
|
-
provider: {
|
|
426
|
-
id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
|
|
427
|
-
label: context.provider?.label
|
|
428
|
-
},
|
|
429
|
-
config: context.config ?? {}
|
|
430
|
-
};
|
|
431
|
-
const scriptParts = require_providers.parseScriptParts(scriptPath);
|
|
432
|
-
const fileHashes = require_providers.getFileHashes(scriptParts);
|
|
433
|
-
const cacheKey = `exec-prompt:${scriptPath}:${fileHashes.join(":")}:${require_logger.safeJsonStringify(transformedContext)}`;
|
|
434
|
-
let cachedResult;
|
|
435
|
-
if (fileHashes.length > 0 && require_cache.isCacheEnabled()) {
|
|
436
|
-
cachedResult = await require_cache.getCache().get(cacheKey);
|
|
437
|
-
if (cachedResult) {
|
|
438
|
-
require_logger.logger.debug(`Returning cached result for executable prompt ${scriptPath}`);
|
|
439
|
-
return cachedResult;
|
|
440
|
-
}
|
|
441
|
-
}
|
|
442
|
-
return new Promise((resolve, reject) => {
|
|
443
|
-
const command = scriptParts.shift();
|
|
444
|
-
require_invariant.invariant(command, "No command found in script path");
|
|
445
|
-
const scriptArgs = scriptParts.concat([require_logger.safeJsonStringify(transformedContext)]);
|
|
446
|
-
const options = {
|
|
447
|
-
cwd: context.config?.basePath,
|
|
448
|
-
timeout: context.config?.timeout || 6e4
|
|
449
|
-
};
|
|
450
|
-
require_logger.logger.debug(`Executing prompt script: ${command} ${scriptArgs.join(" ")}`);
|
|
451
|
-
(0, child_process.execFile)(command, scriptArgs, options, async (error, stdout, stderr) => {
|
|
452
|
-
if (error) {
|
|
453
|
-
require_logger.logger.error(`Error running executable prompt ${scriptPath}: ${error.message}`);
|
|
454
|
-
reject(error);
|
|
455
|
-
return;
|
|
456
|
-
}
|
|
457
|
-
const standardOutput = stripText(Buffer.from(stdout).toString("utf8").trim());
|
|
458
|
-
const errorOutput = stripText(Buffer.from(stderr).toString("utf8").trim());
|
|
459
|
-
if (errorOutput) {
|
|
460
|
-
require_logger.logger.debug(`Error output from executable prompt ${scriptPath}: ${errorOutput}`);
|
|
461
|
-
if (!standardOutput) {
|
|
462
|
-
reject(new Error(errorOutput));
|
|
463
|
-
return;
|
|
464
|
-
}
|
|
465
|
-
}
|
|
466
|
-
require_logger.logger.debug(`Output from executable prompt ${scriptPath}: ${standardOutput}`);
|
|
467
|
-
if (fileHashes.length > 0 && require_cache.isCacheEnabled()) await require_cache.getCache().set(cacheKey, standardOutput);
|
|
468
|
-
resolve(standardOutput);
|
|
469
|
-
});
|
|
470
|
-
});
|
|
471
|
-
};
|
|
472
|
-
/**
|
|
473
|
-
* Processes an executable file to generate prompts.
|
|
474
|
-
* The executable can be any script or binary that outputs prompt text to stdout.
|
|
475
|
-
* It receives the context as JSON in its first argument.
|
|
476
|
-
*
|
|
477
|
-
* @param filePath - Path to the executable file (can include arguments).
|
|
478
|
-
* @param prompt - The raw prompt data.
|
|
479
|
-
* @param functionName - Not used for executables, but kept for interface consistency.
|
|
480
|
-
* @returns Array of prompts generated from the executable.
|
|
481
|
-
*/
|
|
482
|
-
async function processExecutableFile(filePath, prompt, _functionName) {
|
|
483
|
-
let rawContent = filePath;
|
|
484
|
-
const firstPart = require_providers.parseScriptParts(filePath)[0];
|
|
485
|
-
if (firstPart) try {
|
|
486
|
-
const stats = await (0, fs_promises.stat)(firstPart);
|
|
487
|
-
if (stats.isFile() && stats.size < 1024 * 100) {
|
|
488
|
-
const content = await (0, fs_promises.readFile)(firstPart, "utf-8");
|
|
489
|
-
if (!/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(content.substring(0, 1e3))) rawContent = content;
|
|
490
|
-
}
|
|
491
|
-
} catch (_e) {}
|
|
492
|
-
const label = prompt.label ?? filePath;
|
|
493
|
-
return [{
|
|
494
|
-
raw: rawContent,
|
|
495
|
-
label,
|
|
496
|
-
function: (context) => executablePromptFunction(filePath, {
|
|
497
|
-
...context,
|
|
498
|
-
config: prompt.config
|
|
499
|
-
}),
|
|
500
|
-
config: prompt.config
|
|
501
|
-
}];
|
|
502
|
-
}
|
|
503
|
-
//#endregion
|
|
504
|
-
//#region src/prompts/processors/javascript.ts
|
|
505
|
-
const transformContext = (context) => {
|
|
506
|
-
require_invariant.invariant(context.provider, "Provider is required");
|
|
507
|
-
return {
|
|
508
|
-
vars: context.vars,
|
|
509
|
-
provider: {
|
|
510
|
-
id: context.provider.id(),
|
|
511
|
-
label: context.provider.label
|
|
512
|
-
},
|
|
513
|
-
config: context.config ?? {}
|
|
514
|
-
};
|
|
515
|
-
};
|
|
516
|
-
/**
|
|
517
|
-
* Processes a JavaScript file to import and execute a module function as a prompt.
|
|
518
|
-
* @param filePath - Path to the JavaScript file.
|
|
519
|
-
* @param functionName - Optional function name to execute.
|
|
520
|
-
* @returns Promise resolving to an array of prompts.
|
|
521
|
-
*/
|
|
522
|
-
async function processJsFile(filePath, prompt, functionName) {
|
|
523
|
-
const promptFunction = await require_esm.importModule(filePath, functionName);
|
|
524
|
-
return [{
|
|
525
|
-
raw: String(promptFunction),
|
|
526
|
-
label: prompt.label ? prompt.label : functionName ? `${filePath}:${functionName}` : filePath,
|
|
527
|
-
function: (context) => promptFunction(transformContext({
|
|
528
|
-
...context,
|
|
529
|
-
config: prompt.config ?? {}
|
|
530
|
-
})),
|
|
531
|
-
config: prompt.config ?? {}
|
|
532
|
-
}];
|
|
533
|
-
}
|
|
534
|
-
//#endregion
|
|
535
|
-
//#region src/prompts/processors/jinja.ts
|
|
536
|
-
/**
|
|
537
|
-
* Processes a Jinja2 template file to extract prompts.
|
|
538
|
-
* Similar to markdown files, each Jinja2 file is treated as a single prompt.
|
|
539
|
-
*
|
|
540
|
-
* @param filePath - Path to the Jinja2 template file.
|
|
541
|
-
* @param prompt - The raw prompt data.
|
|
542
|
-
* @returns Array of one `Prompt` object.
|
|
543
|
-
*/
|
|
544
|
-
function processJinjaFile(filePath, prompt) {
|
|
545
|
-
const content = fs.readFileSync(filePath, "utf8");
|
|
546
|
-
return [{
|
|
547
|
-
raw: content,
|
|
548
|
-
label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`,
|
|
549
|
-
config: prompt.config
|
|
550
|
-
}];
|
|
551
|
-
}
|
|
552
|
-
//#endregion
|
|
553
|
-
//#region src/prompts/processors/json.ts
|
|
554
|
-
/**
|
|
555
|
-
* Processes a JSON file to extract prompts.
|
|
556
|
-
* This function reads a JSON file and converts it to a `Prompt` object.
|
|
557
|
-
* Any file:// references within the JSON content are recursively resolved.
|
|
558
|
-
*
|
|
559
|
-
* @param filePath - The path to the JSON file.
|
|
560
|
-
* @param prompt - The raw prompt data, used for labeling.
|
|
561
|
-
* @returns An array of one `Prompt` object.
|
|
562
|
-
* @throws Will throw an error if the file cannot be read.
|
|
563
|
-
*/
|
|
564
|
-
function processJsonFile(filePath, prompt) {
|
|
565
|
-
const fileContents = fs.readFileSync(filePath, "utf8");
|
|
566
|
-
let processedContents = fileContents;
|
|
567
|
-
try {
|
|
568
|
-
const resolved = require_util.maybeLoadConfigFromExternalFile(JSON.parse(fileContents));
|
|
569
|
-
processedContents = JSON.stringify(resolved);
|
|
570
|
-
} catch {}
|
|
571
|
-
return [{
|
|
572
|
-
raw: processedContents,
|
|
573
|
-
label: prompt.label || `${filePath}: ${processedContents}`,
|
|
574
|
-
config: prompt.config
|
|
575
|
-
}];
|
|
576
|
-
}
|
|
577
|
-
//#endregion
|
|
578
|
-
//#region src/prompts/processors/jsonl.ts
|
|
579
|
-
/**
|
|
580
|
-
* Processes a JSONL file to extract prompts.
|
|
581
|
-
* @param filePath - Path to the JSONL file.
|
|
582
|
-
* @param prompt - The raw prompt data.
|
|
583
|
-
* @returns Array of prompts extracted from the file.
|
|
584
|
-
*/
|
|
585
|
-
function processJsonlFile(filePath, prompt) {
|
|
586
|
-
const jsonLines = fs.readFileSync(filePath, "utf-8").split(/\r?\n/).filter((line) => line.length > 0);
|
|
587
|
-
const containsMultiple = jsonLines.length > 1;
|
|
588
|
-
return jsonLines.map((json) => ({
|
|
589
|
-
raw: json,
|
|
590
|
-
label: containsMultiple ? prompt.label ? `${prompt.label}: ${json}` : `${filePath}: ${json}` : prompt.label || `${filePath}`,
|
|
591
|
-
config: prompt.config
|
|
592
|
-
}));
|
|
593
|
-
}
|
|
255
|
+
//#region src/providers/github/defaults.ts
|
|
256
|
+
const githubConfig = {
|
|
257
|
+
apiBaseUrl: "https://models.github.ai/inference",
|
|
258
|
+
apiKeyEnvar: "GITHUB_TOKEN"
|
|
259
|
+
};
|
|
260
|
+
const DefaultGitHubGradingProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
|
|
261
|
+
const DefaultGitHubGradingJsonProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: {
|
|
262
|
+
...githubConfig,
|
|
263
|
+
response_format: { type: "json_object" }
|
|
264
|
+
} });
|
|
265
|
+
const DefaultGitHubSuggestionsProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
|
|
266
|
+
new require_chat.OpenAiChatCompletionProvider("openai/gpt-5-nano", { config: githubConfig });
|
|
267
|
+
new require_chat.OpenAiChatCompletionProvider("openai/gpt-5-mini", { config: githubConfig });
|
|
268
|
+
new require_chat.OpenAiChatCompletionProvider("openai/o4-mini", { config: githubConfig });
|
|
594
269
|
//#endregion
|
|
595
|
-
//#region src/
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
}];
|
|
602
|
-
}
|
|
270
|
+
//#region src/providers/mistral/defaults.ts
|
|
271
|
+
const DefaultEmbeddingProvider$1 = new require_providers.MistralEmbeddingProvider();
|
|
272
|
+
const DefaultGradingProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
273
|
+
const DefaultGradingJsonProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest", { config: { response_format: { type: "json_object" } } });
|
|
274
|
+
const DefaultSuggestionsProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
275
|
+
const DefaultSynthesizeProvider = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
603
276
|
//#endregion
|
|
604
|
-
//#region src/
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
|
|
635
|
-
label: context.provider?.label
|
|
636
|
-
},
|
|
637
|
-
config: context.config ?? {}
|
|
638
|
-
};
|
|
639
|
-
const options = {
|
|
640
|
-
mode: "text",
|
|
641
|
-
pythonPath: require_logger.getEnvString("PROMPTFOO_PYTHON", "python"),
|
|
642
|
-
args: [require_logger.safeJsonStringify(transformedContext)]
|
|
277
|
+
//#region src/providers/openai/defaults.ts
|
|
278
|
+
const DEFAULT_OPENAI_GRADING_MODEL = "gpt-5.4-2026-03-05";
|
|
279
|
+
const DefaultEmbeddingProvider = new require_embedding.OpenAiEmbeddingProvider("text-embedding-3-large");
|
|
280
|
+
const DefaultGradingProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
|
|
281
|
+
const DefaultGradingJsonProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL, { config: { response_format: { type: "json_object" } } });
|
|
282
|
+
const DefaultSuggestionsProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
|
|
283
|
+
const DefaultModerationProvider = new require_providers.OpenAiModerationProvider("omni-moderation-latest");
|
|
284
|
+
const DefaultWebSearchProvider = new require_responses.OpenAiResponsesProvider("gpt-5.4-2026-03-05", { config: { tools: [{ type: "web_search_preview" }] } });
|
|
285
|
+
async function getDefaultProviderPreferences(env) {
|
|
286
|
+
const hasAnthropicCredentials = Boolean(require_logger.getEnvString("ANTHROPIC_API_KEY") || env?.ANTHROPIC_API_KEY);
|
|
287
|
+
const hasOpenAiCredentials = Boolean(require_logger.getEnvString("OPENAI_API_KEY") || env?.OPENAI_API_KEY);
|
|
288
|
+
const hasGitHubCredentials = Boolean(require_logger.getEnvString("GITHUB_TOKEN") || env?.GITHUB_TOKEN);
|
|
289
|
+
const hasGoogleAiStudioCredentials = Boolean(require_logger.getEnvString("GEMINI_API_KEY") || env?.GEMINI_API_KEY || require_logger.getEnvString("GOOGLE_API_KEY") || env?.GOOGLE_API_KEY || require_logger.getEnvString("PALM_API_KEY") || env?.PALM_API_KEY);
|
|
290
|
+
const hasAzureApiKey = require_logger.getEnvString("AZURE_OPENAI_API_KEY") || env?.AZURE_OPENAI_API_KEY || require_logger.getEnvString("AZURE_API_KEY") || env?.AZURE_API_KEY;
|
|
291
|
+
const hasAzureClientCreds = (require_logger.getEnvString("AZURE_CLIENT_ID") || env?.AZURE_CLIENT_ID) && (require_logger.getEnvString("AZURE_CLIENT_SECRET") || env?.AZURE_CLIENT_SECRET) && (require_logger.getEnvString("AZURE_TENANT_ID") || env?.AZURE_TENANT_ID);
|
|
292
|
+
const hasMistralCredentials = Boolean(require_logger.getEnvString("MISTRAL_API_KEY") || env?.MISTRAL_API_KEY);
|
|
293
|
+
const preferAzure = Boolean(!hasOpenAiCredentials && (hasAzureApiKey || hasAzureClientCreds) && (require_logger.getEnvString("AZURE_DEPLOYMENT_NAME") || env?.AZURE_DEPLOYMENT_NAME) && (require_logger.getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME));
|
|
294
|
+
const preferAnthropic = !hasOpenAiCredentials && hasAnthropicCredentials;
|
|
295
|
+
const shouldUseFallbackDefaults = !preferAzure && !hasOpenAiCredentials && !hasAnthropicCredentials && !hasGoogleAiStudioCredentials;
|
|
296
|
+
const useGoogleVertexDefaults = shouldUseFallbackDefaults ? await require_transform.hasGoogleDefaultCredentials() : false;
|
|
297
|
+
const useNonGoogleFallbackDefaults = shouldUseFallbackDefaults && !useGoogleVertexDefaults;
|
|
298
|
+
const hasCodexCredentials = useNonGoogleFallbackDefaults && !hasMistralCredentials && require_server.hasCodexDefaultCredentials(env);
|
|
299
|
+
return {
|
|
300
|
+
preferAnthropic,
|
|
301
|
+
preferAzure,
|
|
302
|
+
useCodexDefaults: hasCodexCredentials,
|
|
303
|
+
useGitHubDefaults: useNonGoogleFallbackDefaults && !hasMistralCredentials && !hasCodexCredentials && hasGitHubCredentials,
|
|
304
|
+
useGoogleAiStudioDefaults: !hasOpenAiCredentials && !hasAnthropicCredentials && hasGoogleAiStudioCredentials,
|
|
305
|
+
useGoogleVertexDefaults,
|
|
306
|
+
useMistralDefaults: useNonGoogleFallbackDefaults && hasMistralCredentials
|
|
643
307
|
};
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
308
|
+
}
|
|
309
|
+
async function getDefaultProviders(env) {
|
|
310
|
+
const { preferAnthropic, preferAzure, useCodexDefaults, useGitHubDefaults, useGoogleAiStudioDefaults, useGoogleVertexDefaults, useMistralDefaults } = await getDefaultProviderPreferences(env);
|
|
311
|
+
let providers;
|
|
312
|
+
if (preferAzure) {
|
|
313
|
+
require_logger.logger.debug("Using Azure OpenAI default providers");
|
|
314
|
+
const deploymentName = require_logger.getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME;
|
|
315
|
+
if (!deploymentName) throw new Error("AZURE_OPENAI_DEPLOYMENT_NAME must be set when using Azure OpenAI");
|
|
316
|
+
const embeddingDeploymentName = require_logger.getEnvString("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME || deploymentName;
|
|
317
|
+
const azureProvider = new require_providers.AzureChatCompletionProvider(deploymentName, { env });
|
|
318
|
+
providers = {
|
|
319
|
+
embeddingProvider: new require_providers.AzureEmbeddingProvider(embeddingDeploymentName, { env }),
|
|
320
|
+
gradingJsonProvider: azureProvider,
|
|
321
|
+
gradingProvider: azureProvider,
|
|
322
|
+
moderationProvider: DefaultModerationProvider,
|
|
323
|
+
suggestionsProvider: azureProvider,
|
|
324
|
+
synthesizeProvider: azureProvider
|
|
325
|
+
};
|
|
326
|
+
} else if (preferAnthropic) {
|
|
327
|
+
require_logger.logger.debug("Using Anthropic default providers");
|
|
328
|
+
const anthropicProviders = getAnthropicProviders(env);
|
|
329
|
+
providers = {
|
|
330
|
+
embeddingProvider: DefaultEmbeddingProvider,
|
|
331
|
+
gradingJsonProvider: anthropicProviders.gradingJsonProvider,
|
|
332
|
+
gradingProvider: anthropicProviders.gradingProvider,
|
|
333
|
+
llmRubricProvider: anthropicProviders.llmRubricProvider,
|
|
334
|
+
moderationProvider: DefaultModerationProvider,
|
|
335
|
+
suggestionsProvider: anthropicProviders.suggestionsProvider,
|
|
336
|
+
synthesizeProvider: anthropicProviders.synthesizeProvider,
|
|
337
|
+
webSearchProvider: anthropicProviders.webSearchProvider
|
|
338
|
+
};
|
|
339
|
+
} else if (useGoogleAiStudioDefaults) {
|
|
340
|
+
require_logger.logger.debug("Using Google AI Studio default providers");
|
|
341
|
+
providers = {
|
|
342
|
+
embeddingProvider: require_providers.DefaultEmbeddingProvider,
|
|
343
|
+
gradingJsonProvider: require_providers.DefaultGradingJsonProvider,
|
|
344
|
+
gradingProvider: require_providers.DefaultGradingProvider$1,
|
|
345
|
+
llmRubricProvider: require_providers.DefaultLlmRubricProvider,
|
|
346
|
+
moderationProvider: DefaultModerationProvider,
|
|
347
|
+
suggestionsProvider: require_providers.DefaultSuggestionsProvider,
|
|
348
|
+
synthesizeProvider: require_providers.DefaultSynthesizeProvider
|
|
349
|
+
};
|
|
350
|
+
} else if (useGoogleVertexDefaults) {
|
|
351
|
+
require_logger.logger.debug("Using Google Vertex default providers");
|
|
352
|
+
providers = {
|
|
353
|
+
embeddingProvider: require_providers.DefaultEmbeddingProvider,
|
|
354
|
+
gradingJsonProvider: require_providers.DefaultGradingProvider,
|
|
355
|
+
gradingProvider: require_providers.DefaultGradingProvider,
|
|
356
|
+
moderationProvider: DefaultModerationProvider,
|
|
357
|
+
suggestionsProvider: require_providers.DefaultGradingProvider,
|
|
358
|
+
synthesizeProvider: require_providers.DefaultGradingProvider
|
|
359
|
+
};
|
|
360
|
+
} else if (useMistralDefaults) {
|
|
361
|
+
require_logger.logger.debug("Using Mistral default providers");
|
|
362
|
+
providers = {
|
|
363
|
+
embeddingProvider: DefaultEmbeddingProvider$1,
|
|
364
|
+
gradingJsonProvider: DefaultGradingJsonProvider$1,
|
|
365
|
+
gradingProvider: DefaultGradingProvider$1,
|
|
366
|
+
moderationProvider: DefaultModerationProvider,
|
|
367
|
+
suggestionsProvider: DefaultSuggestionsProvider$1,
|
|
368
|
+
synthesizeProvider: DefaultSynthesizeProvider
|
|
369
|
+
};
|
|
370
|
+
} else if (useCodexDefaults) {
|
|
371
|
+
require_logger.logger.debug("Using Codex SDK default providers from ChatGPT/Codex credentials");
|
|
372
|
+
providers = {
|
|
373
|
+
embeddingProvider: DefaultEmbeddingProvider,
|
|
374
|
+
moderationProvider: DefaultModerationProvider,
|
|
375
|
+
...require_server.getCodexDefaultProviders(env)
|
|
376
|
+
};
|
|
377
|
+
} else if (useGitHubDefaults) {
|
|
378
|
+
require_logger.logger.debug("Using GitHub Models default providers");
|
|
379
|
+
providers = {
|
|
380
|
+
embeddingProvider: DefaultEmbeddingProvider,
|
|
381
|
+
gradingJsonProvider: DefaultGitHubGradingJsonProvider,
|
|
382
|
+
gradingProvider: DefaultGitHubGradingProvider,
|
|
383
|
+
moderationProvider: DefaultModerationProvider,
|
|
384
|
+
suggestionsProvider: DefaultGitHubSuggestionsProvider,
|
|
385
|
+
synthesizeProvider: DefaultGitHubGradingJsonProvider
|
|
386
|
+
};
|
|
387
|
+
} else {
|
|
388
|
+
require_logger.logger.debug("Using OpenAI default providers");
|
|
389
|
+
providers = {
|
|
390
|
+
embeddingProvider: DefaultEmbeddingProvider,
|
|
391
|
+
gradingJsonProvider: DefaultGradingJsonProvider,
|
|
392
|
+
gradingProvider: DefaultGradingProvider,
|
|
393
|
+
moderationProvider: DefaultModerationProvider,
|
|
394
|
+
suggestionsProvider: DefaultSuggestionsProvider,
|
|
395
|
+
synthesizeProvider: DefaultGradingJsonProvider,
|
|
396
|
+
webSearchProvider: DefaultWebSearchProvider
|
|
397
|
+
};
|
|
398
|
+
}
|
|
399
|
+
if (require_logger.getEnvString("AZURE_CONTENT_SAFETY_ENDPOINT") || env?.AZURE_CONTENT_SAFETY_ENDPOINT) providers.moderationProvider = new require_providers.AzureModerationProvider("text-content-safety", { env });
|
|
400
|
+
return providers;
|
|
670
401
|
}
|
|
671
402
|
//#endregion
|
|
672
|
-
//#region src/
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
403
|
+
//#region src/assertions/utils.ts
|
|
404
|
+
const clone = (0, rfdc.default)();
|
|
405
|
+
function getFinalTest(test, assertion) {
|
|
406
|
+
const ret = clone({
|
|
407
|
+
...test,
|
|
408
|
+
...test.options && test.options.provider && { options: {
|
|
409
|
+
...test.options,
|
|
410
|
+
provider: void 0
|
|
411
|
+
} },
|
|
412
|
+
...test.provider && { provider: void 0 }
|
|
413
|
+
});
|
|
414
|
+
ret.options = ret.options || {};
|
|
415
|
+
if (test.provider) ret.provider = test.provider;
|
|
416
|
+
ret.options.provider = assertion.provider || test?.options?.provider;
|
|
417
|
+
ret.options.rubricPrompt = assertion.rubricPrompt || ret.options.rubricPrompt;
|
|
418
|
+
return Object.freeze(ret);
|
|
419
|
+
}
|
|
420
|
+
async function loadFromJavaScriptFile(filePath, functionName, args) {
|
|
421
|
+
const requiredModule = await require_esm.importModule(filePath, functionName);
|
|
422
|
+
if (functionName && typeof requiredModule[functionName] === "function") return requiredModule[functionName](...args);
|
|
423
|
+
else if (typeof requiredModule === "function") return requiredModule(...args);
|
|
424
|
+
else if (requiredModule.default && typeof requiredModule.default === "function") return requiredModule.default(...args);
|
|
425
|
+
else throw new Error(`Assertion malformed: ${filePath} must export a function or have a default export as a function`);
|
|
426
|
+
}
|
|
427
|
+
function processFileReference(fileRef) {
|
|
428
|
+
const basePath = require_logger.state.basePath || "";
|
|
429
|
+
const filePath = path.default.resolve(basePath, fileRef.slice(7));
|
|
430
|
+
const fileContent = fs.default.readFileSync(filePath, "utf8");
|
|
431
|
+
const extension = path.default.extname(filePath);
|
|
432
|
+
if ([
|
|
433
|
+
".json",
|
|
434
|
+
".yaml",
|
|
435
|
+
".yml"
|
|
436
|
+
].includes(extension)) return js_yaml.default.load(fileContent);
|
|
437
|
+
else if (extension === ".txt") return fileContent.trim();
|
|
438
|
+
else throw new Error(`Unsupported file type: ${filePath}`);
|
|
439
|
+
}
|
|
440
|
+
function coerceString(value) {
|
|
441
|
+
if (typeof value === "string") return value;
|
|
442
|
+
return JSON.stringify(value);
|
|
686
443
|
}
|
|
687
444
|
//#endregion
|
|
688
|
-
//#region src/
|
|
445
|
+
//#region src/matchers/shared.ts
|
|
689
446
|
/**
|
|
690
|
-
*
|
|
691
|
-
*
|
|
692
|
-
*
|
|
693
|
-
*
|
|
447
|
+
* Normalize token usage for matcher results. Unlike the evaluator-level
|
|
448
|
+
* normalizeTokenUsage, this excludes the `assertions` field and preserves
|
|
449
|
+
* the existing completionDetails shape (passing through whatever the
|
|
450
|
+
* provider returned, or undefined if not present).
|
|
694
451
|
*/
|
|
695
|
-
function
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
452
|
+
function normalizeMatcherTokenUsage(tokenUsage) {
|
|
453
|
+
return {
|
|
454
|
+
total: tokenUsage?.total || 0,
|
|
455
|
+
prompt: tokenUsage?.prompt || 0,
|
|
456
|
+
completion: tokenUsage?.completion || 0,
|
|
457
|
+
cached: tokenUsage?.cached || 0,
|
|
458
|
+
numRequests: tokenUsage?.numRequests || 0,
|
|
459
|
+
completionDetails: tokenUsage?.completionDetails || {
|
|
460
|
+
reasoning: 0,
|
|
461
|
+
acceptedPrediction: 0,
|
|
462
|
+
rejectedPrediction: 0
|
|
463
|
+
}
|
|
706
464
|
};
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
465
|
+
}
|
|
466
|
+
function fail(reason, tokensUsed) {
|
|
467
|
+
return {
|
|
468
|
+
pass: false,
|
|
469
|
+
reason,
|
|
470
|
+
score: 0,
|
|
471
|
+
tokensUsed: normalizeMatcherTokenUsage(tokensUsed)
|
|
472
|
+
};
|
|
473
|
+
}
|
|
474
|
+
function cosineSimilarity(vecA, vecB) {
|
|
475
|
+
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
476
|
+
const dotProduct = vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
|
|
477
|
+
const vecAMagnitude = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
|
|
478
|
+
const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
|
|
479
|
+
if (vecAMagnitude === 0 || vecBMagnitude === 0) return 0;
|
|
480
|
+
return dotProduct / (vecAMagnitude * vecBMagnitude);
|
|
481
|
+
}
|
|
482
|
+
function dotProduct(vecA, vecB) {
|
|
483
|
+
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
484
|
+
return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
|
|
485
|
+
}
|
|
486
|
+
function euclideanDistance(vecA, vecB) {
|
|
487
|
+
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
488
|
+
const sumSquaredDiff = vecA.reduce((acc, val, idx) => {
|
|
489
|
+
const diff = val - vecB[idx];
|
|
490
|
+
return acc + diff * diff;
|
|
491
|
+
}, 0);
|
|
492
|
+
return Math.sqrt(sumSquaredDiff);
|
|
493
|
+
}
|
|
494
|
+
function tryParse(content) {
|
|
495
|
+
try {
|
|
496
|
+
return JSON.parse(content);
|
|
497
|
+
} catch {}
|
|
498
|
+
return content;
|
|
499
|
+
}
|
|
500
|
+
function splitIntoSentences(text) {
|
|
501
|
+
return text.split("\n").filter((sentence) => sentence.trim() !== "");
|
|
711
502
|
}
|
|
712
503
|
//#endregion
|
|
713
|
-
//#region src/
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
504
|
+
//#region src/matchers/rubric.ts
|
|
505
|
+
const nunjucks = require_render.getNunjucksEngine(void 0, false, true);
|
|
506
|
+
var LlmRubricProviderError = class extends Error {
|
|
507
|
+
constructor(message) {
|
|
508
|
+
super(message);
|
|
509
|
+
this.name = "LlmRubricProviderError";
|
|
510
|
+
}
|
|
511
|
+
};
|
|
512
|
+
async function loadRubricPrompt(rubricPrompt, defaultPrompt) {
|
|
513
|
+
if (!rubricPrompt) return defaultPrompt;
|
|
514
|
+
if (typeof rubricPrompt === "object" && Object.keys(rubricPrompt).length === 0) return defaultPrompt;
|
|
515
|
+
if (typeof rubricPrompt === "string" && rubricPrompt.startsWith("file://")) {
|
|
516
|
+
const basePath = require_logger.state.basePath || "";
|
|
517
|
+
const { filePath, functionName } = require_util.parseFileUrl(require_util.getNunjucksEngineForFilePath().renderString(rubricPrompt, {}));
|
|
518
|
+
const resolvedPath = path.default.resolve(basePath, filePath);
|
|
519
|
+
if (require_fileExtensions.isJavascriptFile(filePath)) rubricPrompt = await loadFromJavaScriptFile(resolvedPath, functionName, []);
|
|
520
|
+
else {
|
|
521
|
+
if (!fs.existsSync(resolvedPath)) throw new Error(`File does not exist: ${resolvedPath}`);
|
|
522
|
+
rubricPrompt = fs.readFileSync(resolvedPath, "utf8");
|
|
523
|
+
}
|
|
524
|
+
} else rubricPrompt = require_util.maybeLoadFromExternalFile(rubricPrompt);
|
|
525
|
+
if (typeof rubricPrompt === "object") rubricPrompt = JSON.stringify(rubricPrompt);
|
|
526
|
+
require_invariant.invariant(typeof rubricPrompt === "string", "rubricPrompt must be a string");
|
|
527
|
+
return rubricPrompt;
|
|
528
|
+
}
|
|
529
|
+
function processContextForTemplating(context, enableObjectAccess) {
|
|
530
|
+
if (enableObjectAccess) return context;
|
|
531
|
+
return Object.fromEntries(Object.entries(context).map(([key, value]) => {
|
|
532
|
+
if (value && typeof value === "object") {
|
|
533
|
+
if (Array.isArray(value)) return [key, value.map((item) => item && typeof item === "object" ? JSON.stringify(item) : item)];
|
|
534
|
+
return [key, JSON.stringify(value)];
|
|
535
|
+
}
|
|
536
|
+
return [key, value];
|
|
537
|
+
}));
|
|
538
|
+
}
|
|
539
|
+
async function renderLlmRubricPrompt(rubricPrompt, context) {
|
|
540
|
+
const processedContext = processContextForTemplating(context, require_logger.getEnvBool("PROMPTFOO_DISABLE_OBJECT_STRINGIFY", false));
|
|
728
541
|
try {
|
|
729
|
-
const
|
|
730
|
-
|
|
731
|
-
} catch (
|
|
732
|
-
require_logger.logger.debug(`
|
|
542
|
+
const parsed = JSON.parse(rubricPrompt, (_k, v) => typeof v === "string" ? nunjucks.renderString(v, processedContext) : v);
|
|
543
|
+
return JSON.stringify(parsed);
|
|
544
|
+
} catch (err) {
|
|
545
|
+
require_logger.logger.debug(`[Rubric] Rubric prompt is not valid JSON, using Nunjucks rendering: ${err.message}`);
|
|
733
546
|
}
|
|
734
|
-
return
|
|
735
|
-
raw: maybeParsed,
|
|
736
|
-
label: prompt.label || `${filePath}: ${maybeParsed?.slice(0, 80)}`,
|
|
737
|
-
config: prompt.config
|
|
738
|
-
}];
|
|
547
|
+
return nunjucks.renderString(rubricPrompt, processedContext);
|
|
739
548
|
}
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
*/
|
|
748
|
-
function readProviderPromptMap(config, parsedPrompts) {
|
|
749
|
-
const ret = {};
|
|
750
|
-
if (!config.providers) return ret;
|
|
751
|
-
const allPrompts = [];
|
|
752
|
-
for (const prompt of parsedPrompts) allPrompts.push(prompt.label);
|
|
753
|
-
if (typeof config.providers === "string") return { [config.providers]: allPrompts };
|
|
754
|
-
if (typeof config.providers === "function") return { "Custom function": allPrompts };
|
|
755
|
-
for (const provider of config.providers) if (typeof provider === "object") if (provider.id) {
|
|
756
|
-
const rawProvider = provider;
|
|
757
|
-
require_invariant.invariant(rawProvider.id, "You must specify an `id` on the Provider when you override options.prompts");
|
|
758
|
-
ret[rawProvider.id] = rawProvider.prompts || allPrompts;
|
|
759
|
-
if (rawProvider.label) ret[rawProvider.label] = rawProvider.prompts || allPrompts;
|
|
760
|
-
} else {
|
|
761
|
-
const rawProvider = provider;
|
|
762
|
-
const originalId = Object.keys(rawProvider)[0];
|
|
763
|
-
const id = rawProvider[originalId].id || originalId;
|
|
764
|
-
ret[id] = rawProvider[originalId].prompts || allPrompts;
|
|
549
|
+
function parseJsonGradingResponse(label, resp) {
|
|
550
|
+
let jsonObjects = [];
|
|
551
|
+
if (typeof resp.output === "string") try {
|
|
552
|
+
jsonObjects = require_logger.extractJsonObjects(resp.output);
|
|
553
|
+
if (jsonObjects.length === 0) return { failure: fail(`Could not extract JSON from ${label} response`, resp.tokenUsage) };
|
|
554
|
+
} catch (err) {
|
|
555
|
+
return { failure: fail(`${label} produced malformed response: ${err}\n\n${resp.output}`, resp.tokenUsage) };
|
|
765
556
|
}
|
|
766
|
-
|
|
557
|
+
else if (typeof resp.output === "object" && resp.output !== null && !Array.isArray(resp.output)) jsonObjects = [resp.output];
|
|
558
|
+
else return { failure: fail(`${label} produced malformed response - output must be string or object. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
|
|
559
|
+
const parsed = jsonObjects[0];
|
|
560
|
+
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return { failure: fail(`${label} produced malformed response. We were not able to parse the response as JSON. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
|
|
561
|
+
return { parsed };
|
|
767
562
|
}
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
if (
|
|
779
|
-
|
|
780
|
-
|
|
563
|
+
async function runJsonGradingPrompt({ assertion, checkName, defaultPrompt, grading, label, providerCallContext, throwOnError, vars }) {
|
|
564
|
+
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading.rubricPrompt, defaultPrompt), vars);
|
|
565
|
+
const defaultProviders = await getDefaultProviders();
|
|
566
|
+
const defaultProvider = defaultProviders.llmRubricProvider || defaultProviders.gradingJsonProvider;
|
|
567
|
+
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, defaultProvider, checkName), prompt, label, vars, providerCallContext);
|
|
568
|
+
if (resp.error || !resp.output) {
|
|
569
|
+
if (throwOnError) throw new Error(resp.error || "No output");
|
|
570
|
+
return fail(resp.error || "No output", resp.tokenUsage);
|
|
571
|
+
}
|
|
572
|
+
const { parsed, failure } = parseJsonGradingResponse(label, resp);
|
|
573
|
+
if (!parsed) return failure;
|
|
574
|
+
let pass = parsed.pass ?? true;
|
|
575
|
+
if (typeof pass !== "boolean") pass = /^(true|yes|pass|y)$/i.test(String(pass));
|
|
576
|
+
let score = parsed.score;
|
|
577
|
+
if (typeof score !== "number") score = Number.isFinite(Number(score)) ? Number(score) : Number(pass);
|
|
578
|
+
const threshold = typeof assertion?.threshold === "string" ? Number(assertion.threshold) : assertion?.threshold;
|
|
579
|
+
if (typeof threshold === "number" && Number.isFinite(threshold)) pass = pass && score >= threshold;
|
|
580
|
+
const reason = parsed.reason || (pass ? "Grading passed" : `Score ${score} below threshold ${threshold}`);
|
|
581
|
+
let responseMetadata = {};
|
|
582
|
+
if (resp.metadata && typeof resp.metadata === "object" && !Array.isArray(resp.metadata)) {
|
|
583
|
+
const serializedMetadata = require_logger.safeJsonStringify(resp.metadata);
|
|
584
|
+
responseMetadata = serializedMetadata ? JSON.parse(serializedMetadata) : {};
|
|
781
585
|
}
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
prompts.push(...processString(prompt));
|
|
586
|
+
return {
|
|
587
|
+
assertion,
|
|
588
|
+
pass,
|
|
589
|
+
score,
|
|
590
|
+
reason,
|
|
591
|
+
tokensUsed: normalizeMatcherTokenUsage({
|
|
592
|
+
...resp.tokenUsage,
|
|
593
|
+
completionDetails: resp.tokenUsage?.completionDetails || parsed.tokensUsed?.completionDetails
|
|
594
|
+
}),
|
|
595
|
+
metadata: {
|
|
596
|
+
...responseMetadata,
|
|
597
|
+
renderedGradingPrompt: prompt
|
|
795
598
|
}
|
|
796
|
-
|
|
797
|
-
}
|
|
798
|
-
if (extension === ".csv") return processCsvPrompts(filePath, prompt);
|
|
799
|
-
if (extension === ".j2") return processJinjaFile(filePath, prompt);
|
|
800
|
-
if (extension === ".json") return processJsonFile(filePath, prompt);
|
|
801
|
-
if (extension === ".jsonl") return processJsonlFile(filePath, prompt);
|
|
802
|
-
if (extension && require_fileExtensions.isJavascriptFile(extension)) return processJsFile(filePath, prompt, functionName);
|
|
803
|
-
if (extension === ".md") return processMarkdownFile(filePath, prompt);
|
|
804
|
-
if (extension === ".py") return processPythonFile(filePath, prompt, functionName);
|
|
805
|
-
if (extension === ".txt") return processTxtFile(filePath, prompt);
|
|
806
|
-
if (extension && [".yml", ".yaml"].includes(extension)) return processYamlFile(filePath, prompt);
|
|
807
|
-
if (extension && [
|
|
808
|
-
".sh",
|
|
809
|
-
".bash",
|
|
810
|
-
".exe",
|
|
811
|
-
".bat",
|
|
812
|
-
".cmd",
|
|
813
|
-
".ps1",
|
|
814
|
-
".rb",
|
|
815
|
-
".pl"
|
|
816
|
-
].includes(extension)) return await processExecutableFile(filePath, prompt, functionName);
|
|
817
|
-
try {
|
|
818
|
-
const stats = await (0, fs_promises.stat)(filePath);
|
|
819
|
-
if (stats.isFile() && (stats.mode & 73) !== 0) return await processExecutableFile(filePath, prompt, functionName);
|
|
820
|
-
} catch (_e) {}
|
|
821
|
-
return [];
|
|
599
|
+
};
|
|
822
600
|
}
|
|
601
|
+
//#endregion
|
|
602
|
+
//#region src/prompts/processors/csv.ts
|
|
823
603
|
/**
|
|
824
|
-
*
|
|
825
|
-
*
|
|
826
|
-
*
|
|
827
|
-
*
|
|
604
|
+
* Process a CSV file containing prompts
|
|
605
|
+
*
|
|
606
|
+
* CSV format can be either:
|
|
607
|
+
* 1. Single column with prompt text per line
|
|
608
|
+
* 2. CSV with a 'prompt' column and optional 'label' column
|
|
609
|
+
*
|
|
610
|
+
* @param filePath Path to the CSV file
|
|
611
|
+
* @param basePrompt Base prompt properties to include
|
|
612
|
+
* @returns Array of processed prompts
|
|
828
613
|
*/
|
|
829
|
-
async function
|
|
830
|
-
|
|
831
|
-
const
|
|
832
|
-
const
|
|
833
|
-
|
|
834
|
-
const
|
|
835
|
-
|
|
836
|
-
|
|
614
|
+
async function processCsvPrompts(filePath, basePrompt) {
|
|
615
|
+
const content = fs.default.readFileSync(filePath, "utf8");
|
|
616
|
+
const delimiter = require_logger.getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
617
|
+
const enforceStrict = require_logger.getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
618
|
+
if (!content.includes(delimiter)) {
|
|
619
|
+
const lines = content.split(/\r?\n/).filter((line) => line.trim());
|
|
620
|
+
const startIndex = lines[0]?.toLowerCase().trim() === "prompt" ? 1 : 0;
|
|
621
|
+
return lines.slice(startIndex).map((line, index) => ({
|
|
622
|
+
...basePrompt,
|
|
623
|
+
raw: line,
|
|
624
|
+
label: basePrompt.label || `Prompt ${index + 1} - ${line}`
|
|
625
|
+
}));
|
|
837
626
|
}
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
}
|
|
847
|
-
else if (typeof promptInput === "string") return readPrompts(promptInput);
|
|
848
|
-
try {
|
|
849
|
-
return require_types.PromptSchema.parse(promptInput);
|
|
850
|
-
} catch (error) {
|
|
851
|
-
require_logger.logger.warn(`Prompt input is not a valid prompt schema: ${error}\nFalling back to serialized JSON as raw prompt.`);
|
|
627
|
+
try {
|
|
628
|
+
return (0, csv_parse_sync.parse)(content, {
|
|
629
|
+
columns: true,
|
|
630
|
+
bom: true,
|
|
631
|
+
delimiter,
|
|
632
|
+
relax_quotes: !enforceStrict,
|
|
633
|
+
skip_empty_lines: true,
|
|
634
|
+
trim: true
|
|
635
|
+
}).filter((row) => row.prompt).map((row, index) => {
|
|
852
636
|
return {
|
|
853
|
-
|
|
854
|
-
|
|
637
|
+
...basePrompt,
|
|
638
|
+
raw: row.prompt,
|
|
639
|
+
label: row.label || basePrompt.label || `Prompt ${index + 1} - ${row.prompt}`
|
|
855
640
|
};
|
|
856
|
-
}
|
|
857
|
-
}
|
|
641
|
+
});
|
|
642
|
+
} catch {
|
|
643
|
+
const lines = content.split(/\r?\n/).filter((line) => line.trim());
|
|
644
|
+
const startIndex = lines[0]?.toLowerCase().trim() === "prompt" ? 1 : 0;
|
|
645
|
+
return lines.slice(startIndex).map((line, index) => ({
|
|
646
|
+
...basePrompt,
|
|
647
|
+
raw: line,
|
|
648
|
+
label: basePrompt.label || `Prompt ${index + 1} - ${line}`
|
|
649
|
+
}));
|
|
650
|
+
}
|
|
858
651
|
}
|
|
859
|
-
const GEVAL_PROMPT_STEPS = `
|
|
860
|
-
Given evaluation criteria that outline how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to check whether the criteria are satisfied by the text.
|
|
861
|
-
|
|
862
|
-
**EVALUATION CRITERIA**
|
|
863
|
-
{{criteria}}
|
|
864
|
-
|
|
865
|
-
**OUTPUT FORMAT**
|
|
866
|
-
IMPORTANT:
|
|
867
|
-
- Return output ONLY as a minified JSON object (no code fences).
|
|
868
|
-
- The JSON object must contain a single key, "steps", whose value is a list of strings.
|
|
869
|
-
- Each string must represent one evaluation step.
|
|
870
|
-
- Do NOT include any explanations, commentary, extra text, or additional formatting.
|
|
871
|
-
|
|
872
|
-
Format:
|
|
873
|
-
{"steps": <list_of_strings>}
|
|
874
|
-
|
|
875
|
-
Example:
|
|
876
|
-
{"steps":["<Evaluation Step 1>","<Evaluation Step 2>","<Evaluation Step 3>","<Evaluation Step 4>"]}
|
|
877
|
-
|
|
878
|
-
Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
|
|
879
|
-
JSON:
|
|
880
|
-
`;
|
|
881
|
-
const GEVAL_PROMPT_EVALUATE = `
|
|
882
|
-
You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
|
|
883
|
-
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
|
884
|
-
|
|
885
|
-
**Evaluation Criteria**
|
|
886
|
-
{{criteria}}
|
|
887
|
-
|
|
888
|
-
**Evaluation Steps**
|
|
889
|
-
- {{steps}}
|
|
890
|
-
Given the evaluation steps, return a JSON with two keys:
|
|
891
|
-
1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the condition described by the Evaluation Criteria is fully and clearly observed in the Reply according to the Evaluation Steps, and 0 indicates that it is not observed at all;
|
|
892
|
-
2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
|
|
893
|
-
|
|
894
|
-
**Prompt**
|
|
895
|
-
{{input}}
|
|
896
|
-
|
|
897
|
-
**Reply**
|
|
898
|
-
{{output}}
|
|
899
|
-
|
|
900
|
-
**OUTPUT FORMAT**
|
|
901
|
-
IMPORTANT:
|
|
902
|
-
- Return output ONLY as a minified JSON object (no code fences).
|
|
903
|
-
- The JSON object must contain exactly two keys: "score" and "reason".
|
|
904
|
-
- No additional words, explanations, or formatting are needed.
|
|
905
|
-
- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
|
|
906
|
-
|
|
907
|
-
Example JSON:
|
|
908
|
-
{"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
|
|
909
|
-
|
|
910
|
-
Here is the final evaluation in the required minified JSON format:
|
|
911
|
-
JSON:
|
|
912
|
-
`;
|
|
913
652
|
//#endregion
|
|
914
|
-
//#region src/
|
|
915
|
-
const
|
|
653
|
+
//#region src/prompts/processors/executable.ts
|
|
654
|
+
const ANSI_ESCAPE = /\x1b(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])/g;
|
|
655
|
+
function stripText(text) {
|
|
656
|
+
return text.replace(ANSI_ESCAPE, "");
|
|
657
|
+
}
|
|
916
658
|
/**
|
|
917
|
-
*
|
|
918
|
-
*
|
|
919
|
-
* @param
|
|
920
|
-
* @
|
|
659
|
+
* Executable prompt function. Executes any script/binary and returns its output as the prompt.
|
|
660
|
+
* The script receives context as JSON in its arguments.
|
|
661
|
+
* @param scriptPath - Path to the executable script.
|
|
662
|
+
* @param context - Context for the prompt.
|
|
663
|
+
* @returns The prompt output from the script.
|
|
921
664
|
*/
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
name: "grade_output",
|
|
942
|
-
description: "Grade the given output based on specific criteria",
|
|
943
|
-
input_schema: {
|
|
944
|
-
type: "object",
|
|
945
|
-
properties: {
|
|
946
|
-
pass: {
|
|
947
|
-
type: "boolean",
|
|
948
|
-
description: "Whether the output passes the criteria"
|
|
949
|
-
},
|
|
950
|
-
score: {
|
|
951
|
-
type: "number",
|
|
952
|
-
description: "The score assigned to the output"
|
|
953
|
-
},
|
|
954
|
-
reason: {
|
|
955
|
-
type: "string",
|
|
956
|
-
description: "The reason for the given grade"
|
|
957
|
-
}
|
|
958
|
-
},
|
|
959
|
-
required: [
|
|
960
|
-
"pass",
|
|
961
|
-
"score",
|
|
962
|
-
"reason"
|
|
963
|
-
]
|
|
964
|
-
}
|
|
965
|
-
}],
|
|
966
|
-
...config
|
|
967
|
-
}
|
|
968
|
-
});
|
|
969
|
-
}
|
|
970
|
-
async callApi(prompt) {
|
|
971
|
-
const result = await super.callApi(prompt);
|
|
972
|
-
if (typeof result.output !== "string") return { error: `Anthropic LLM rubric grader - malformed non-string output\n\n${JSON.stringify(result.output)}` };
|
|
973
|
-
try {
|
|
974
|
-
return { output: JSON.parse(result.output).input };
|
|
975
|
-
} catch (err) {
|
|
976
|
-
return { error: `Anthropic LLM rubric grader - invalid JSON: ${err}\n\n${result.output}` };
|
|
665
|
+
const executablePromptFunction = async (scriptPath, context) => {
|
|
666
|
+
require_invariant.invariant(context.provider?.id, "provider.id is required");
|
|
667
|
+
const transformedContext = {
|
|
668
|
+
vars: context.vars,
|
|
669
|
+
provider: {
|
|
670
|
+
id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
|
|
671
|
+
label: context.provider?.label
|
|
672
|
+
},
|
|
673
|
+
config: context.config ?? {}
|
|
674
|
+
};
|
|
675
|
+
const scriptParts = require_providers.parseScriptParts(scriptPath);
|
|
676
|
+
const fileHashes = require_providers.getFileHashes(scriptParts);
|
|
677
|
+
const cacheKey = `exec-prompt:${scriptPath}:${fileHashes.join(":")}:${require_logger.safeJsonStringify(transformedContext)}`;
|
|
678
|
+
let cachedResult;
|
|
679
|
+
if (fileHashes.length > 0 && require_cache.isCacheEnabled()) {
|
|
680
|
+
cachedResult = await require_cache.getCache().get(cacheKey);
|
|
681
|
+
if (cachedResult) {
|
|
682
|
+
require_logger.logger.debug(`Returning cached result for executable prompt ${scriptPath}`);
|
|
683
|
+
return cachedResult;
|
|
977
684
|
}
|
|
978
685
|
}
|
|
686
|
+
return new Promise((resolve, reject) => {
|
|
687
|
+
const command = scriptParts.shift();
|
|
688
|
+
require_invariant.invariant(command, "No command found in script path");
|
|
689
|
+
const scriptArgs = scriptParts.concat([require_logger.safeJsonStringify(transformedContext)]);
|
|
690
|
+
const options = {
|
|
691
|
+
cwd: context.config?.basePath,
|
|
692
|
+
timeout: context.config?.timeout || 6e4
|
|
693
|
+
};
|
|
694
|
+
require_logger.logger.debug(`Executing prompt script: ${command} ${scriptArgs.join(" ")}`);
|
|
695
|
+
(0, child_process.execFile)(command, scriptArgs, options, async (error, stdout, stderr) => {
|
|
696
|
+
if (error) {
|
|
697
|
+
require_logger.logger.error(`Error running executable prompt ${scriptPath}: ${error.message}`);
|
|
698
|
+
reject(error);
|
|
699
|
+
return;
|
|
700
|
+
}
|
|
701
|
+
const standardOutput = stripText(Buffer.from(stdout).toString("utf8").trim());
|
|
702
|
+
const errorOutput = stripText(Buffer.from(stderr).toString("utf8").trim());
|
|
703
|
+
if (errorOutput) {
|
|
704
|
+
require_logger.logger.debug(`Error output from executable prompt ${scriptPath}: ${errorOutput}`);
|
|
705
|
+
if (!standardOutput) {
|
|
706
|
+
reject(new Error(errorOutput));
|
|
707
|
+
return;
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
require_logger.logger.debug(`Output from executable prompt ${scriptPath}: ${standardOutput}`);
|
|
711
|
+
if (fileHashes.length > 0 && require_cache.isCacheEnabled()) await require_cache.getCache().set(cacheKey, standardOutput);
|
|
712
|
+
resolve(standardOutput);
|
|
713
|
+
});
|
|
714
|
+
});
|
|
979
715
|
};
|
|
980
|
-
const gradingProviderFactory = createLazyProvider((env) => new require_messages.AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
981
|
-
const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
982
|
-
const webSearchProviderFactory = createLazyProvider((env) => new require_messages.AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, {
|
|
983
|
-
env,
|
|
984
|
-
config: { tools: [{
|
|
985
|
-
type: "web_search_20250305",
|
|
986
|
-
name: "web_search",
|
|
987
|
-
max_uses: 5
|
|
988
|
-
}] }
|
|
989
|
-
}));
|
|
990
716
|
/**
|
|
991
|
-
*
|
|
992
|
-
*
|
|
993
|
-
*
|
|
717
|
+
* Processes an executable file to generate prompts.
|
|
718
|
+
* The executable can be any script or binary that outputs prompt text to stdout.
|
|
719
|
+
* It receives the context as JSON in its first argument.
|
|
720
|
+
*
|
|
721
|
+
* @param filePath - Path to the executable file (can include arguments).
|
|
722
|
+
* @param prompt - The raw prompt data.
|
|
723
|
+
* @param functionName - Not used for executables, but kept for interface consistency.
|
|
724
|
+
* @returns Array of prompts generated from the executable.
|
|
994
725
|
*/
|
|
995
|
-
function
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
}
|
|
726
|
+
async function processExecutableFile(filePath, prompt, _functionName) {
|
|
727
|
+
let rawContent = filePath;
|
|
728
|
+
const firstPart = require_providers.parseScriptParts(filePath)[0];
|
|
729
|
+
if (firstPart) try {
|
|
730
|
+
const stats = await (0, fs_promises.stat)(firstPart);
|
|
731
|
+
if (stats.isFile() && stats.size < 1024 * 100) {
|
|
732
|
+
const content = await (0, fs_promises.readFile)(firstPart, "utf-8");
|
|
733
|
+
if (!/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(content.substring(0, 1e3))) rawContent = content;
|
|
734
|
+
}
|
|
735
|
+
} catch (_e) {}
|
|
736
|
+
const label = prompt.label ?? filePath;
|
|
737
|
+
return [{
|
|
738
|
+
raw: rawContent,
|
|
739
|
+
label,
|
|
740
|
+
function: (context) => executablePromptFunction(filePath, {
|
|
741
|
+
...context,
|
|
742
|
+
config: prompt.config
|
|
743
|
+
}),
|
|
744
|
+
config: prompt.config
|
|
745
|
+
}];
|
|
1005
746
|
}
|
|
1006
747
|
//#endregion
|
|
1007
|
-
//#region src/
|
|
1008
|
-
const
|
|
1009
|
-
|
|
1010
|
-
apiKeyEnvar: "GITHUB_TOKEN"
|
|
1011
|
-
};
|
|
1012
|
-
const DefaultGitHubGradingProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
|
|
1013
|
-
const DefaultGitHubGradingJsonProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: {
|
|
1014
|
-
...githubConfig,
|
|
1015
|
-
response_format: { type: "json_object" }
|
|
1016
|
-
} });
|
|
1017
|
-
const DefaultGitHubSuggestionsProvider = new require_chat.OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
|
|
1018
|
-
new require_chat.OpenAiChatCompletionProvider("openai/gpt-5-nano", { config: githubConfig });
|
|
1019
|
-
new require_chat.OpenAiChatCompletionProvider("openai/gpt-5-mini", { config: githubConfig });
|
|
1020
|
-
new require_chat.OpenAiChatCompletionProvider("openai/o4-mini", { config: githubConfig });
|
|
1021
|
-
//#endregion
|
|
1022
|
-
//#region src/providers/mistral/defaults.ts
|
|
1023
|
-
const DefaultEmbeddingProvider$1 = new require_providers.MistralEmbeddingProvider();
|
|
1024
|
-
const DefaultGradingProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
1025
|
-
const DefaultGradingJsonProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest", { config: { response_format: { type: "json_object" } } });
|
|
1026
|
-
const DefaultSuggestionsProvider$1 = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
1027
|
-
const DefaultSynthesizeProvider = new require_providers.MistralChatCompletionProvider("mistral-large-latest");
|
|
1028
|
-
//#endregion
|
|
1029
|
-
//#region src/providers/openai/defaults.ts
|
|
1030
|
-
const DEFAULT_OPENAI_GRADING_MODEL = "gpt-5.4-2026-03-05";
|
|
1031
|
-
const DefaultEmbeddingProvider = new require_embedding.OpenAiEmbeddingProvider("text-embedding-3-large");
|
|
1032
|
-
const DefaultGradingProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
|
|
1033
|
-
const DefaultGradingJsonProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL, { config: { response_format: { type: "json_object" } } });
|
|
1034
|
-
const DefaultSuggestionsProvider = new require_chat.OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
|
|
1035
|
-
const DefaultModerationProvider = new require_providers.OpenAiModerationProvider("omni-moderation-latest");
|
|
1036
|
-
const DefaultWebSearchProvider = new require_responses.OpenAiResponsesProvider("gpt-5.4-2026-03-05", { config: { tools: [{ type: "web_search_preview" }] } });
|
|
1037
|
-
async function getDefaultProviderPreferences(env) {
|
|
1038
|
-
const hasAnthropicCredentials = Boolean(require_logger.getEnvString("ANTHROPIC_API_KEY") || env?.ANTHROPIC_API_KEY);
|
|
1039
|
-
const hasOpenAiCredentials = Boolean(require_logger.getEnvString("OPENAI_API_KEY") || env?.OPENAI_API_KEY);
|
|
1040
|
-
const hasGitHubCredentials = Boolean(require_logger.getEnvString("GITHUB_TOKEN") || env?.GITHUB_TOKEN);
|
|
1041
|
-
const hasGoogleAiStudioCredentials = Boolean(require_logger.getEnvString("GEMINI_API_KEY") || env?.GEMINI_API_KEY || require_logger.getEnvString("GOOGLE_API_KEY") || env?.GOOGLE_API_KEY || require_logger.getEnvString("PALM_API_KEY") || env?.PALM_API_KEY);
|
|
1042
|
-
const hasAzureApiKey = require_logger.getEnvString("AZURE_OPENAI_API_KEY") || env?.AZURE_OPENAI_API_KEY || require_logger.getEnvString("AZURE_API_KEY") || env?.AZURE_API_KEY;
|
|
1043
|
-
const hasAzureClientCreds = (require_logger.getEnvString("AZURE_CLIENT_ID") || env?.AZURE_CLIENT_ID) && (require_logger.getEnvString("AZURE_CLIENT_SECRET") || env?.AZURE_CLIENT_SECRET) && (require_logger.getEnvString("AZURE_TENANT_ID") || env?.AZURE_TENANT_ID);
|
|
1044
|
-
const hasMistralCredentials = Boolean(require_logger.getEnvString("MISTRAL_API_KEY") || env?.MISTRAL_API_KEY);
|
|
1045
|
-
const preferAzure = Boolean(!hasOpenAiCredentials && (hasAzureApiKey || hasAzureClientCreds) && (require_logger.getEnvString("AZURE_DEPLOYMENT_NAME") || env?.AZURE_DEPLOYMENT_NAME) && (require_logger.getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME));
|
|
1046
|
-
const preferAnthropic = !hasOpenAiCredentials && hasAnthropicCredentials;
|
|
1047
|
-
const shouldUseFallbackDefaults = !preferAzure && !hasOpenAiCredentials && !hasAnthropicCredentials && !hasGoogleAiStudioCredentials;
|
|
1048
|
-
const useGoogleVertexDefaults = shouldUseFallbackDefaults ? await require_transform$1.hasGoogleDefaultCredentials() : false;
|
|
1049
|
-
const useNonGoogleFallbackDefaults = shouldUseFallbackDefaults && !useGoogleVertexDefaults;
|
|
1050
|
-
const hasCodexCredentials = useNonGoogleFallbackDefaults && !hasMistralCredentials && require_server.hasCodexDefaultCredentials(env);
|
|
748
|
+
//#region src/prompts/processors/javascript.ts
|
|
749
|
+
const transformContext = (context) => {
|
|
750
|
+
require_invariant.invariant(context.provider, "Provider is required");
|
|
1051
751
|
return {
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
useMistralDefaults: useNonGoogleFallbackDefaults && hasMistralCredentials
|
|
752
|
+
vars: context.vars,
|
|
753
|
+
provider: {
|
|
754
|
+
id: context.provider.id(),
|
|
755
|
+
label: context.provider.label
|
|
756
|
+
},
|
|
757
|
+
config: context.config ?? {}
|
|
1059
758
|
};
|
|
1060
|
-
}
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
} else if (preferAnthropic) {
|
|
1079
|
-
require_logger.logger.debug("Using Anthropic default providers");
|
|
1080
|
-
const anthropicProviders = getAnthropicProviders(env);
|
|
1081
|
-
providers = {
|
|
1082
|
-
embeddingProvider: DefaultEmbeddingProvider,
|
|
1083
|
-
gradingJsonProvider: anthropicProviders.gradingJsonProvider,
|
|
1084
|
-
gradingProvider: anthropicProviders.gradingProvider,
|
|
1085
|
-
llmRubricProvider: anthropicProviders.llmRubricProvider,
|
|
1086
|
-
moderationProvider: DefaultModerationProvider,
|
|
1087
|
-
suggestionsProvider: anthropicProviders.suggestionsProvider,
|
|
1088
|
-
synthesizeProvider: anthropicProviders.synthesizeProvider,
|
|
1089
|
-
webSearchProvider: anthropicProviders.webSearchProvider
|
|
1090
|
-
};
|
|
1091
|
-
} else if (useGoogleAiStudioDefaults) {
|
|
1092
|
-
require_logger.logger.debug("Using Google AI Studio default providers");
|
|
1093
|
-
providers = {
|
|
1094
|
-
embeddingProvider: require_providers.DefaultEmbeddingProvider,
|
|
1095
|
-
gradingJsonProvider: require_providers.DefaultGradingJsonProvider,
|
|
1096
|
-
gradingProvider: require_providers.DefaultGradingProvider$1,
|
|
1097
|
-
llmRubricProvider: require_providers.DefaultLlmRubricProvider,
|
|
1098
|
-
moderationProvider: DefaultModerationProvider,
|
|
1099
|
-
suggestionsProvider: require_providers.DefaultSuggestionsProvider,
|
|
1100
|
-
synthesizeProvider: require_providers.DefaultSynthesizeProvider
|
|
1101
|
-
};
|
|
1102
|
-
} else if (useGoogleVertexDefaults) {
|
|
1103
|
-
require_logger.logger.debug("Using Google Vertex default providers");
|
|
1104
|
-
providers = {
|
|
1105
|
-
embeddingProvider: require_providers.DefaultEmbeddingProvider,
|
|
1106
|
-
gradingJsonProvider: require_providers.DefaultGradingProvider,
|
|
1107
|
-
gradingProvider: require_providers.DefaultGradingProvider,
|
|
1108
|
-
moderationProvider: DefaultModerationProvider,
|
|
1109
|
-
suggestionsProvider: require_providers.DefaultGradingProvider,
|
|
1110
|
-
synthesizeProvider: require_providers.DefaultGradingProvider
|
|
1111
|
-
};
|
|
1112
|
-
} else if (useMistralDefaults) {
|
|
1113
|
-
require_logger.logger.debug("Using Mistral default providers");
|
|
1114
|
-
providers = {
|
|
1115
|
-
embeddingProvider: DefaultEmbeddingProvider$1,
|
|
1116
|
-
gradingJsonProvider: DefaultGradingJsonProvider$1,
|
|
1117
|
-
gradingProvider: DefaultGradingProvider$1,
|
|
1118
|
-
moderationProvider: DefaultModerationProvider,
|
|
1119
|
-
suggestionsProvider: DefaultSuggestionsProvider$1,
|
|
1120
|
-
synthesizeProvider: DefaultSynthesizeProvider
|
|
1121
|
-
};
|
|
1122
|
-
} else if (useCodexDefaults) {
|
|
1123
|
-
require_logger.logger.debug("Using Codex SDK default providers from ChatGPT/Codex credentials");
|
|
1124
|
-
providers = {
|
|
1125
|
-
embeddingProvider: DefaultEmbeddingProvider,
|
|
1126
|
-
moderationProvider: DefaultModerationProvider,
|
|
1127
|
-
...require_server.getCodexDefaultProviders(env)
|
|
1128
|
-
};
|
|
1129
|
-
} else if (useGitHubDefaults) {
|
|
1130
|
-
require_logger.logger.debug("Using GitHub Models default providers");
|
|
1131
|
-
providers = {
|
|
1132
|
-
embeddingProvider: DefaultEmbeddingProvider,
|
|
1133
|
-
gradingJsonProvider: DefaultGitHubGradingJsonProvider,
|
|
1134
|
-
gradingProvider: DefaultGitHubGradingProvider,
|
|
1135
|
-
moderationProvider: DefaultModerationProvider,
|
|
1136
|
-
suggestionsProvider: DefaultGitHubSuggestionsProvider,
|
|
1137
|
-
synthesizeProvider: DefaultGitHubGradingJsonProvider
|
|
1138
|
-
};
|
|
1139
|
-
} else {
|
|
1140
|
-
require_logger.logger.debug("Using OpenAI default providers");
|
|
1141
|
-
providers = {
|
|
1142
|
-
embeddingProvider: DefaultEmbeddingProvider,
|
|
1143
|
-
gradingJsonProvider: DefaultGradingJsonProvider,
|
|
1144
|
-
gradingProvider: DefaultGradingProvider,
|
|
1145
|
-
moderationProvider: DefaultModerationProvider,
|
|
1146
|
-
suggestionsProvider: DefaultSuggestionsProvider,
|
|
1147
|
-
synthesizeProvider: DefaultGradingJsonProvider,
|
|
1148
|
-
webSearchProvider: DefaultWebSearchProvider
|
|
1149
|
-
};
|
|
1150
|
-
}
|
|
1151
|
-
if (require_logger.getEnvString("AZURE_CONTENT_SAFETY_ENDPOINT") || env?.AZURE_CONTENT_SAFETY_ENDPOINT) providers.moderationProvider = new require_providers.AzureModerationProvider("text-content-safety", { env });
|
|
1152
|
-
return providers;
|
|
759
|
+
};
|
|
760
|
+
/**
|
|
761
|
+
* Processes a JavaScript file to import and execute a module function as a prompt.
|
|
762
|
+
* @param filePath - Path to the JavaScript file.
|
|
763
|
+
* @param functionName - Optional function name to execute.
|
|
764
|
+
* @returns Promise resolving to an array of prompts.
|
|
765
|
+
*/
|
|
766
|
+
async function processJsFile(filePath, prompt, functionName) {
|
|
767
|
+
const promptFunction = await require_esm.importModule(filePath, functionName);
|
|
768
|
+
return [{
|
|
769
|
+
raw: String(promptFunction),
|
|
770
|
+
label: prompt.label ? prompt.label : functionName ? `${filePath}:${functionName}` : filePath,
|
|
771
|
+
function: (context) => promptFunction(transformContext({
|
|
772
|
+
...context,
|
|
773
|
+
config: prompt.config ?? {}
|
|
774
|
+
})),
|
|
775
|
+
config: prompt.config ?? {}
|
|
776
|
+
}];
|
|
1153
777
|
}
|
|
1154
778
|
//#endregion
|
|
1155
|
-
//#region src/
|
|
1156
|
-
|
|
1157
|
-
|
|
779
|
+
//#region src/prompts/processors/jinja.ts
|
|
780
|
+
/**
|
|
781
|
+
* Processes a Jinja2 template file to extract prompts.
|
|
782
|
+
* Similar to markdown files, each Jinja2 file is treated as a single prompt.
|
|
783
|
+
*
|
|
784
|
+
* @param filePath - Path to the Jinja2 template file.
|
|
785
|
+
* @param prompt - The raw prompt data.
|
|
786
|
+
* @returns Array of one `Prompt` object.
|
|
787
|
+
*/
|
|
788
|
+
function processJinjaFile(filePath, prompt) {
|
|
789
|
+
const content = fs.readFileSync(filePath, "utf8");
|
|
790
|
+
return [{
|
|
791
|
+
raw: content,
|
|
792
|
+
label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`,
|
|
793
|
+
config: prompt.config
|
|
794
|
+
}];
|
|
1158
795
|
}
|
|
1159
|
-
|
|
1160
|
-
|
|
796
|
+
//#endregion
|
|
797
|
+
//#region src/prompts/processors/json.ts
|
|
798
|
+
/**
|
|
799
|
+
* Processes a JSON file to extract prompts.
|
|
800
|
+
* This function reads a JSON file and converts it to a `Prompt` object.
|
|
801
|
+
* Any file:// references within the JSON content are recursively resolved.
|
|
802
|
+
*
|
|
803
|
+
* @param filePath - The path to the JSON file.
|
|
804
|
+
* @param prompt - The raw prompt data, used for labeling.
|
|
805
|
+
* @returns An array of one `Prompt` object.
|
|
806
|
+
* @throws Will throw an error if the file cannot be read.
|
|
807
|
+
*/
|
|
808
|
+
function processJsonFile(filePath, prompt) {
|
|
809
|
+
const fileContents = fs.readFileSync(filePath, "utf8");
|
|
810
|
+
let processedContents = fileContents;
|
|
1161
811
|
try {
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
}
|
|
1168
|
-
|
|
1169
|
-
|
|
812
|
+
const resolved = require_util.maybeLoadConfigFromExternalFile(JSON.parse(fileContents));
|
|
813
|
+
processedContents = JSON.stringify(resolved);
|
|
814
|
+
} catch {}
|
|
815
|
+
return [{
|
|
816
|
+
raw: processedContents,
|
|
817
|
+
label: prompt.label || `${filePath}: ${processedContents}`,
|
|
818
|
+
config: prompt.config
|
|
819
|
+
}];
|
|
1170
820
|
}
|
|
821
|
+
//#endregion
|
|
822
|
+
//#region src/prompts/processors/jsonl.ts
|
|
1171
823
|
/**
|
|
1172
|
-
*
|
|
1173
|
-
* @param
|
|
1174
|
-
* @
|
|
824
|
+
* Processes a JSONL file to extract prompts.
|
|
825
|
+
* @param filePath - Path to the JSONL file.
|
|
826
|
+
* @param prompt - The raw prompt data.
|
|
827
|
+
* @returns Array of prompts extracted from the file.
|
|
1175
828
|
*/
|
|
1176
|
-
function
|
|
1177
|
-
|
|
1178
|
-
const
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
829
|
+
function processJsonlFile(filePath, prompt) {
|
|
830
|
+
const jsonLines = fs.readFileSync(filePath, "utf-8").split(/\r?\n/).filter((line) => line.length > 0);
|
|
831
|
+
const containsMultiple = jsonLines.length > 1;
|
|
832
|
+
return jsonLines.map((json) => ({
|
|
833
|
+
raw: json,
|
|
834
|
+
label: containsMultiple ? prompt.label ? `${prompt.label}: ${json}` : `${filePath}: ${json}` : prompt.label || `${filePath}`,
|
|
835
|
+
config: prompt.config
|
|
836
|
+
}));
|
|
837
|
+
}
|
|
838
|
+
//#endregion
|
|
839
|
+
//#region src/prompts/processors/markdown.ts
|
|
840
|
+
function processMarkdownFile(filePath, prompt) {
|
|
841
|
+
const content = fs.default.readFileSync(filePath, "utf8");
|
|
842
|
+
return [{
|
|
843
|
+
raw: content,
|
|
844
|
+
label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`
|
|
845
|
+
}];
|
|
1187
846
|
}
|
|
847
|
+
//#endregion
|
|
848
|
+
//#region src/prompts/processors/python.ts
|
|
1188
849
|
/**
|
|
1189
|
-
*
|
|
1190
|
-
*
|
|
1191
|
-
*
|
|
1192
|
-
*
|
|
1193
|
-
* @
|
|
1194
|
-
* @returns A provider with web search capabilities or null
|
|
850
|
+
* Python prompt function. Runs a specific function from the python file.
|
|
851
|
+
* @param promptPath - Path to the Python file.
|
|
852
|
+
* @param functionName - Function name to execute.
|
|
853
|
+
* @param context - Context for the prompt.
|
|
854
|
+
* @returns The prompts
|
|
1195
855
|
*/
|
|
1196
|
-
async
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
}
|
|
1221
|
-
|
|
1222
|
-
return null;
|
|
1223
|
-
}
|
|
1224
|
-
};
|
|
1225
|
-
const loadGoogleWebSearch = async () => {
|
|
1226
|
-
try {
|
|
1227
|
-
return await require_providers.loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
1228
|
-
} catch (err) {
|
|
1229
|
-
require_logger.logger.debug(`Failed to load Google web search provider: ${err}`);
|
|
1230
|
-
return null;
|
|
1231
|
-
}
|
|
1232
|
-
};
|
|
1233
|
-
const loadVertexWebSearch = async () => {
|
|
1234
|
-
try {
|
|
1235
|
-
return await require_providers.loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
1236
|
-
} catch (err) {
|
|
1237
|
-
require_logger.logger.debug(`Failed to load Vertex web search provider: ${err}`);
|
|
1238
|
-
return null;
|
|
1239
|
-
}
|
|
856
|
+
const pythonPromptFunction = async (filePath, functionName, context) => {
|
|
857
|
+
require_invariant.invariant(context.provider?.id, "provider.id is required");
|
|
858
|
+
return require_pythonUtils.runPython(filePath, functionName, [{
|
|
859
|
+
vars: context.vars,
|
|
860
|
+
provider: {
|
|
861
|
+
id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
|
|
862
|
+
label: context.provider?.label
|
|
863
|
+
},
|
|
864
|
+
config: context.config ?? {}
|
|
865
|
+
}]);
|
|
866
|
+
};
|
|
867
|
+
/**
|
|
868
|
+
* Legacy Python prompt function. Runs the whole python file.
|
|
869
|
+
* @param filePath - Path to the Python file.
|
|
870
|
+
* @param context - Context for the prompt.
|
|
871
|
+
* @returns The prompts
|
|
872
|
+
*/
|
|
873
|
+
const pythonPromptFunctionLegacy = async (filePath, context) => {
|
|
874
|
+
require_invariant.invariant(context?.provider?.id, "provider.id is required");
|
|
875
|
+
const transformedContext = {
|
|
876
|
+
vars: context.vars,
|
|
877
|
+
provider: {
|
|
878
|
+
id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
|
|
879
|
+
label: context.provider?.label
|
|
880
|
+
},
|
|
881
|
+
config: context.config ?? {}
|
|
1240
882
|
};
|
|
1241
|
-
const
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
require_logger.logger.debug(`Failed to load xAI web search provider: ${err}`);
|
|
1246
|
-
return null;
|
|
1247
|
-
}
|
|
883
|
+
const options = {
|
|
884
|
+
mode: "text",
|
|
885
|
+
pythonPath: require_logger.getEnvString("PROMPTFOO_PYTHON", "python"),
|
|
886
|
+
args: [require_logger.safeJsonStringify(transformedContext)]
|
|
1248
887
|
};
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
}
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
888
|
+
require_logger.logger.debug(`Executing python prompt script ${filePath}`);
|
|
889
|
+
const results = (await python_shell.PythonShell.run(filePath, options)).join("\n");
|
|
890
|
+
require_logger.logger.debug(`Python prompt script ${filePath} returned: ${results}`);
|
|
891
|
+
return results;
|
|
892
|
+
};
|
|
893
|
+
/**
|
|
894
|
+
* Processes a Python file to extract or execute a function as a prompt.
|
|
895
|
+
* @param filePath - Path to the Python file.
|
|
896
|
+
* @param prompt - The raw prompt data.
|
|
897
|
+
* @param functionName - Optional function name to execute.
|
|
898
|
+
* @returns Array of prompts extracted or executed from the file.
|
|
899
|
+
*/
|
|
900
|
+
function processPythonFile(filePath, prompt, functionName) {
|
|
901
|
+
const fileContent = fs.readFileSync(filePath, "utf-8");
|
|
902
|
+
return [{
|
|
903
|
+
raw: fileContent,
|
|
904
|
+
label: prompt.label ?? (functionName ? `${filePath}:${functionName}` : `${filePath}: ${fileContent}`),
|
|
905
|
+
function: functionName ? (context) => pythonPromptFunction(filePath, functionName, {
|
|
906
|
+
...context,
|
|
907
|
+
config: prompt.config
|
|
908
|
+
}) : (context) => pythonPromptFunctionLegacy(filePath, {
|
|
909
|
+
...context,
|
|
910
|
+
config: prompt.config
|
|
911
|
+
}),
|
|
912
|
+
config: prompt.config
|
|
913
|
+
}];
|
|
1273
914
|
}
|
|
1274
915
|
//#endregion
|
|
1275
|
-
//#region src/
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
if (!result || result.pass === void 0) throw new Error(`Remote grading failed. Response data is invalid: ${JSON.stringify(data)}`);
|
|
1290
|
-
return {
|
|
1291
|
-
pass: result.pass,
|
|
1292
|
-
score: result.score,
|
|
1293
|
-
reason: result.reason,
|
|
1294
|
-
tokensUsed: result.tokensUsed
|
|
1295
|
-
};
|
|
1296
|
-
} catch (error) {
|
|
1297
|
-
throw new Error(`Could not perform remote grading: ${error}`);
|
|
1298
|
-
}
|
|
916
|
+
//#region src/prompts/processors/string.ts
|
|
917
|
+
/**
|
|
918
|
+
* Processes a string as a literal prompt.
|
|
919
|
+
* @param prompt - The raw prompt data.
|
|
920
|
+
* @returns Array of prompts created from the string.
|
|
921
|
+
*/
|
|
922
|
+
function processString(prompt) {
|
|
923
|
+
require_invariant.invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
|
|
924
|
+
return [{
|
|
925
|
+
id: prompt.id,
|
|
926
|
+
raw: prompt.raw,
|
|
927
|
+
label: prompt.label ?? `${prompt.raw}`,
|
|
928
|
+
config: prompt.config
|
|
929
|
+
}];
|
|
1299
930
|
}
|
|
1300
931
|
//#endregion
|
|
1301
|
-
//#region src/
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
932
|
+
//#region src/prompts/processors/text.ts
|
|
933
|
+
/**
|
|
934
|
+
* Processes a text file to extract prompts, splitting by a delimiter.
|
|
935
|
+
* @param filePath - Path to the text file.
|
|
936
|
+
* @param prompt - The raw prompt data.
|
|
937
|
+
* @returns Array of prompts extracted from the file.
|
|
938
|
+
*/
|
|
939
|
+
function processTxtFile(filePath, { label }) {
|
|
940
|
+
const lines = fs.readFileSync(filePath, "utf-8").split(/\r?\n/);
|
|
941
|
+
const prompts = [];
|
|
942
|
+
let buffer = [];
|
|
943
|
+
const flush = () => {
|
|
944
|
+
const raw = buffer.join("\n").trim();
|
|
945
|
+
if (raw.length > 0) prompts.push({
|
|
946
|
+
raw,
|
|
947
|
+
label: label ? `${label}: ${filePath}: ${raw}` : `${filePath}: ${raw}`
|
|
948
|
+
});
|
|
949
|
+
buffer = [];
|
|
1312
950
|
};
|
|
951
|
+
for (const line of lines) if (line.trim() === require_utils.PROMPT_DELIMITER) flush();
|
|
952
|
+
else buffer.push(line);
|
|
953
|
+
flush();
|
|
954
|
+
return prompts;
|
|
1313
955
|
}
|
|
1314
|
-
|
|
1315
|
-
|
|
956
|
+
//#endregion
|
|
957
|
+
//#region src/prompts/processors/yaml.ts
|
|
958
|
+
/**
|
|
959
|
+
* Processes a YAML file to extract prompts.
|
|
960
|
+
* This function reads a YAML file, parses it, and maps each entry to a `Prompt` object.
|
|
961
|
+
* Each prompt is labeled with the file path and the YAML content.
|
|
962
|
+
* Any file:// references within the YAML content are recursively resolved.
|
|
963
|
+
*
|
|
964
|
+
* @param filePath - The path to the YAML file.
|
|
965
|
+
* @param prompt - The raw prompt data, used for labeling.
|
|
966
|
+
* @returns An array of `Prompt` objects extracted from the YAML file.
|
|
967
|
+
* @throws Will throw an error if the file cannot be read or parsed.
|
|
968
|
+
*/
|
|
969
|
+
function processYamlFile(filePath, prompt) {
|
|
970
|
+
const fileContents = fs.readFileSync(filePath, "utf8");
|
|
971
|
+
let maybeParsed = fileContents;
|
|
1316
972
|
try {
|
|
1317
|
-
const
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
const { data } = await require_cache.fetchWithCache(WITHPI_API_URL, {
|
|
1322
|
-
method: "POST",
|
|
1323
|
-
headers: {
|
|
1324
|
-
"Content-Type": "application/json",
|
|
1325
|
-
"x-api-key": apiKey
|
|
1326
|
-
},
|
|
1327
|
-
body
|
|
1328
|
-
}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
1329
|
-
return convertPiResultToGradingResult(data, passThreshold);
|
|
1330
|
-
} else throw new Error(`Env var WITHPI_API_KEY must be set. Visit https://docs.withpi.ai for more information.`);
|
|
1331
|
-
} catch (error) {
|
|
1332
|
-
throw new Error(`Could not perform remote grading: ${error}`);
|
|
973
|
+
const resolved = require_util.maybeLoadConfigFromExternalFile(js_yaml.default.load(fileContents));
|
|
974
|
+
maybeParsed = JSON.stringify(resolved);
|
|
975
|
+
} catch (e) {
|
|
976
|
+
require_logger.logger.debug(`Error parsing YAML file ${filePath}: ${e}`);
|
|
1333
977
|
}
|
|
978
|
+
return [{
|
|
979
|
+
raw: maybeParsed,
|
|
980
|
+
label: prompt.label || `${filePath}: ${maybeParsed?.slice(0, 80)}`,
|
|
981
|
+
config: prompt.config
|
|
982
|
+
}];
|
|
1334
983
|
}
|
|
1335
984
|
//#endregion
|
|
1336
|
-
//#region src/
|
|
1337
|
-
const
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
985
|
+
//#region src/external/prompts/ragas.ts
|
|
986
|
+
const ANSWER_RELEVANCY_GENERATE = `Generate question for the given answer.
|
|
987
|
+
Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
|
|
988
|
+
Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
|
|
989
|
+
|
|
990
|
+
Answer:{{answer}}
|
|
991
|
+
Question:`;
|
|
992
|
+
const CONTEXT_RECALL = `Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
|
|
993
|
+
Think in steps and reason before coming to conclusion.
|
|
994
|
+
|
|
995
|
+
context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
|
|
996
|
+
answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
|
|
997
|
+
classification
|
|
998
|
+
1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. The date of birth of Einstein is mentioned clearly in the context. So [Attributed]
|
|
999
|
+
2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
|
|
1000
|
+
3. He published 4 papers in 1905. There is no mention about papers he wrote in given the context. So [Not Attributed]
|
|
1001
|
+
4. Einstein moved to Switzerland in 1895. There is not supporting evidence for this in the given the context. So [Not Attributed]
|
|
1002
|
+
|
|
1003
|
+
context:{{context}}
|
|
1004
|
+
answer:{{groundTruth}}
|
|
1005
|
+
classification:
|
|
1006
|
+
`;
|
|
1007
|
+
const CONTEXT_RECALL_ATTRIBUTED_TOKEN = "[Attributed]";
|
|
1008
|
+
const CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN = "[Not Attributed]";
|
|
1009
|
+
const CONTEXT_RELEVANCE = `Please extract relevant sentences from the provided context that is absolutely required answer the following query. If no relevant sentences are found, or if you believe the query cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.
|
|
1010
|
+
|
|
1011
|
+
query: {{query}}
|
|
1012
|
+
context: {{context}}
|
|
1013
|
+
candidate sentences:
|
|
1014
|
+
`;
|
|
1015
|
+
const CONTEXT_RELEVANCE_BAD = "Insufficient Information";
|
|
1016
|
+
const CONTEXT_FAITHFULNESS_LONGFORM = `Given a question and answer, create one or more statements from each sentence in the given answer.
|
|
1017
|
+
question: Who was Albert Einstein and what is he best known for?
|
|
1018
|
+
answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
|
|
1019
|
+
statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity.
|
|
1020
|
+
question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
|
|
1021
|
+
answer: alcohol
|
|
1022
|
+
statements:\nCadmium Chloride is slightly soluble in alcohol.
|
|
1023
|
+
question: Were Shahul and Jithin of the same nationality?
|
|
1024
|
+
answer: They were from different countries.
|
|
1025
|
+
statements:\nShahul and Jithin were from different countries.
|
|
1026
|
+
question:{{question}}
|
|
1027
|
+
answer: {{answer}}
|
|
1028
|
+
statements:\n`;
|
|
1029
|
+
const CONTEXT_FAITHFULNESS_NLI_STATEMENTS = `Prompt: Natural language inference
|
|
1030
|
+
Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format.
|
|
1031
|
+
|
|
1032
|
+
Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.
|
|
1033
|
+
statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n
|
|
1034
|
+
Answer:
|
|
1035
|
+
1. John is majoring in Biology.
|
|
1036
|
+
Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No.
|
|
1037
|
+
2. John is taking a course on Artificial Intelligence.
|
|
1038
|
+
Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No.
|
|
1039
|
+
3. John is a dedicated student.
|
|
1040
|
+
Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes.
|
|
1041
|
+
4. John has a part-time job.
|
|
1042
|
+
Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No.
|
|
1043
|
+
5. John is interested in computer programming.
|
|
1044
|
+
Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes.
|
|
1045
|
+
Final verdict for each statement in order: No. No. Yes. No. Yes.
|
|
1046
|
+
context:\n{{context}}
|
|
1047
|
+
statements:\n{{statements|join("\\n")}}
|
|
1048
|
+
Answer:
|
|
1049
|
+
`;
|
|
1050
|
+
//#endregion
|
|
1051
|
+
//#region src/prompts/grading.ts
|
|
1052
|
+
const DEFAULT_GRADING_PROMPT = JSON.stringify([{
|
|
1053
|
+
role: "system",
|
|
1054
|
+
content: dedent.default`You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
1055
|
+
|
|
1056
|
+
Examples:
|
|
1057
|
+
|
|
1058
|
+
<Output>Hello world</Output>
|
|
1059
|
+
<Rubric>Content contains a greeting</Rubric>
|
|
1060
|
+
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
|
|
1061
|
+
|
|
1062
|
+
<Output>Avast ye swabs, repel the invaders!</Output>
|
|
1063
|
+
<Rubric>Does not speak like a pirate</Rubric>
|
|
1064
|
+
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}`
|
|
1065
|
+
}, {
|
|
1066
|
+
role: "user",
|
|
1067
|
+
content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
|
|
1068
|
+
}]);
|
|
1069
|
+
const PROMPTFOO_FACTUALITY_PROMPT = JSON.stringify([{
|
|
1070
|
+
role: "system",
|
|
1071
|
+
content: dedent.default`
|
|
1072
|
+
You are a precise factuality evaluator that compares a submitted answer to an expert answer.
|
|
1073
|
+
|
|
1074
|
+
Your task is to analyze the factual content while ignoring differences in style, grammar, or punctuation.
|
|
1075
|
+
You must categorize the submission into one of these options:
|
|
1076
|
+
|
|
1077
|
+
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
|
|
1078
|
+
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
|
|
1079
|
+
(C) The submitted answer contains all the same details as the expert answer.
|
|
1080
|
+
(D) There is a disagreement between the submitted answer and the expert answer.
|
|
1081
|
+
(E) The answers differ, but these differences don't matter from the perspective of factuality.
|
|
1082
|
+
|
|
1083
|
+
Respond ONLY with a JSON object in this format:
|
|
1084
|
+
{
|
|
1085
|
+
"category": "[LETTER]",
|
|
1086
|
+
"reason": "[DETAILED EXPLANATION]"
|
|
1087
|
+
}
|
|
1088
|
+
|
|
1089
|
+
- The "category" must be a single letter A, B, C, D, or E.
|
|
1090
|
+
- Provide a clear, detailed explanation in the "reason" field.
|
|
1091
|
+
- Your response must be valid JSON with no additional text.`
|
|
1092
|
+
}, {
|
|
1093
|
+
role: "user",
|
|
1094
|
+
content: dedent.default`
|
|
1095
|
+
I need you to compare these answers:
|
|
1096
|
+
|
|
1097
|
+
<question>
|
|
1098
|
+
{{input}}
|
|
1099
|
+
</question>
|
|
1100
|
+
|
|
1101
|
+
<expert_answer>
|
|
1102
|
+
{{ideal}}
|
|
1103
|
+
</expert_answer>
|
|
1104
|
+
|
|
1105
|
+
<submitted_answer>
|
|
1106
|
+
{{completion}}
|
|
1107
|
+
</submitted_answer>
|
|
1108
|
+
|
|
1109
|
+
Please analyze the factual relationship between these answers according to the categories you've been given.`
|
|
1110
|
+
}]);
|
|
1111
|
+
const OPENAI_CLOSED_QA_PROMPT = JSON.stringify([{
|
|
1112
|
+
role: "system",
|
|
1113
|
+
content: `You are assessing a submitted answer on a given task based on a criterion. Here is the data:
|
|
1114
|
+
[BEGIN DATA]
|
|
1115
|
+
***
|
|
1116
|
+
[Task]: {{input}}
|
|
1117
|
+
***
|
|
1118
|
+
[Submission]: {{completion}}
|
|
1119
|
+
***
|
|
1120
|
+
[Criterion]: {{criteria}}
|
|
1121
|
+
***
|
|
1122
|
+
[END DATA]
|
|
1123
|
+
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
|
|
1124
|
+
|
|
1125
|
+
Reasoning:`
|
|
1126
|
+
}]);
|
|
1127
|
+
const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
|
|
1128
|
+
role: "system",
|
|
1129
|
+
content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
|
|
1130
|
+
|
|
1131
|
+
Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
|
|
1132
|
+
|
|
1133
|
+
Your output is going to be copied directly into the program. It should contain the prompt ONLY`
|
|
1134
|
+
};
|
|
1135
|
+
const SELECT_BEST_PROMPT = JSON.stringify([{
|
|
1136
|
+
role: "system",
|
|
1137
|
+
content: `You are comparing multiple pieces of text to see which best fits the following criteria: {{criteria}}
|
|
1138
|
+
|
|
1139
|
+
Here are the pieces of text:
|
|
1140
|
+
|
|
1141
|
+
{% for output in outputs %}
|
|
1142
|
+
<Text index="{{ loop.index0 }}">
|
|
1143
|
+
{{ output }}
|
|
1144
|
+
</Text>
|
|
1145
|
+
{% endfor %}
|
|
1146
|
+
|
|
1147
|
+
Output the index of the text that best fits the criteria. You must output a single integer.`
|
|
1148
|
+
}]);
|
|
1149
|
+
const DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([{
|
|
1150
|
+
role: "system",
|
|
1151
|
+
content: dedent.default`You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
1152
|
+
|
|
1153
|
+
You MUST search the web when:
|
|
1154
|
+
- The rubric asks about current information (prices, weather, news, etc.)
|
|
1155
|
+
- Facts need to be verified against recent data
|
|
1156
|
+
- The rubric references time-sensitive information
|
|
1157
|
+
|
|
1158
|
+
Examples:
|
|
1159
|
+
|
|
1160
|
+
<Output>The current CEO of Microsoft is Satya Nadella</Output>
|
|
1161
|
+
<Rubric>Contains accurate information about Microsoft's leadership</Rubric>
|
|
1162
|
+
{"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
|
|
1163
|
+
|
|
1164
|
+
<Output>Bitcoin is trading at $45,000</Output>
|
|
1165
|
+
<Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
|
|
1166
|
+
{"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`
|
|
1167
|
+
}, {
|
|
1168
|
+
role: "user",
|
|
1169
|
+
content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
|
|
1170
|
+
}]);
|
|
1171
|
+
const TRAJECTORY_GOAL_SUCCESS_PROMPT = JSON.stringify([{
|
|
1172
|
+
role: "system",
|
|
1173
|
+
content: dedent.default`You are grading whether an AI agent successfully completed a goal based on its final output and a summarized execution trajectory. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
1174
|
+
|
|
1175
|
+
Judge end-to-end success, not stylistic perfection.
|
|
1176
|
+
Use the trajectory as evidence for what the agent actually did.
|
|
1177
|
+
Give partial credit when the agent made progress but did not fully achieve the goal.
|
|
1178
|
+
|
|
1179
|
+
Examples:
|
|
1180
|
+
|
|
1181
|
+
<Goal>Find the order status and tell the user whether it has shipped</Goal>
|
|
1182
|
+
<Trajectory>{"stepCount":2,"steps":[{"index":1,"type":"tool","name":"search_orders"},{"index":2,"type":"message","name":"agent response"}]}</Trajectory>
|
|
1183
|
+
<Output>Your order shipped yesterday and should arrive on Tuesday.</Output>
|
|
1184
|
+
{"reason":"The agent used the order lookup tool and gave the user the shipping status, so the goal was achieved.","pass":true,"score":1.0}
|
|
1185
|
+
|
|
1186
|
+
<Goal>Find the order status and tell the user whether it has shipped</Goal>
|
|
1187
|
+
<Trajectory>{"stepCount":1,"steps":[{"index":1,"type":"message","name":"agent response"}]}</Trajectory>
|
|
1188
|
+
<Output>I cannot check your order right now.</Output>
|
|
1189
|
+
{"reason":"The agent did not show evidence of checking the order and did not provide the requested status.","pass":false,"score":0.0}`
|
|
1190
|
+
}, {
|
|
1191
|
+
role: "user",
|
|
1192
|
+
content: dedent.default`<Goal>
|
|
1193
|
+
{{ goal }}
|
|
1194
|
+
</Goal>
|
|
1195
|
+
<Trajectory>
|
|
1196
|
+
{{ trajectory }}
|
|
1197
|
+
</Trajectory>
|
|
1198
|
+
<Output>
|
|
1199
|
+
{{ output }}
|
|
1200
|
+
</Output>`
|
|
1201
|
+
}]);
|
|
1344
1202
|
//#endregion
|
|
1345
|
-
//#region src/
|
|
1346
|
-
var LlmRubricProviderError = class extends Error {
|
|
1347
|
-
constructor(message) {
|
|
1348
|
-
super(message);
|
|
1349
|
-
this.name = "LlmRubricProviderError";
|
|
1350
|
-
}
|
|
1351
|
-
};
|
|
1352
|
-
const nunjucks = require_render.getNunjucksEngine(void 0, false, true);
|
|
1353
|
-
const FACTUALITY_CATEGORY_DESCRIPTIONS = {
|
|
1354
|
-
A: "The submitted answer is a subset of the expert answer and is fully consistent with it.",
|
|
1355
|
-
B: "The submitted answer is a superset of the expert answer and is fully consistent with it.",
|
|
1356
|
-
C: "The submitted answer contains all the same details as the expert answer.",
|
|
1357
|
-
D: "There is a disagreement between the submitted answer and the expert answer.",
|
|
1358
|
-
E: "The answers differ, but these differences don't matter from the perspective of factuality."
|
|
1359
|
-
};
|
|
1360
|
-
function cosineSimilarity(vecA, vecB) {
|
|
1361
|
-
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
1362
|
-
return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0) / (Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0)) * Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0)));
|
|
1363
|
-
}
|
|
1364
|
-
function dotProduct(vecA, vecB) {
|
|
1365
|
-
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
1366
|
-
return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
|
|
1367
|
-
}
|
|
1368
|
-
function euclideanDistance(vecA, vecB) {
|
|
1369
|
-
if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
|
|
1370
|
-
const sumSquaredDiff = vecA.reduce((acc, val, idx) => {
|
|
1371
|
-
const diff = val - vecB[idx];
|
|
1372
|
-
return acc + diff * diff;
|
|
1373
|
-
}, 0);
|
|
1374
|
-
return Math.sqrt(sumSquaredDiff);
|
|
1375
|
-
}
|
|
1203
|
+
//#region src/prompts/index.ts
|
|
1376
1204
|
/**
|
|
1377
|
-
*
|
|
1378
|
-
*
|
|
1379
|
-
*
|
|
1380
|
-
*
|
|
1381
|
-
* override. This ensures originalProvider from context is preserved while
|
|
1382
|
-
* allowing this call to specify its own prompt metadata.
|
|
1205
|
+
* Reads and maps provider prompts based on the configuration and parsed prompts.
|
|
1206
|
+
* @param config - The configuration object.
|
|
1207
|
+
* @param parsedPrompts - Array of parsed prompts.
|
|
1208
|
+
* @returns A map of provider IDs to their respective prompts.
|
|
1383
1209
|
*/
|
|
1384
|
-
function
|
|
1385
|
-
const
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
const executeCall = () => {
|
|
1397
|
-
if (executionContext?.rateLimitRegistry && !require_providers.isRateLimitWrapped(provider)) return executionContext.rateLimitRegistry.execute(provider, callApi, require_providers.createProviderRateLimitOptions());
|
|
1398
|
-
return callApi();
|
|
1399
|
-
};
|
|
1400
|
-
if (executionContext?.providerCallQueue) return executionContext.providerCallQueue.enqueue(provider.id(), executeCall);
|
|
1401
|
-
return executeCall();
|
|
1402
|
-
}
|
|
1403
|
-
async function loadFromProviderOptions(provider) {
|
|
1404
|
-
require_invariant.invariant(typeof provider === "object", `Provider must be an object, but received a ${typeof provider}: ${provider}`);
|
|
1405
|
-
require_invariant.invariant(!Array.isArray(provider), `Provider must be an object, but received an array: ${JSON.stringify(provider)}`);
|
|
1406
|
-
require_invariant.invariant(provider.id, "Provider supplied to assertion must have an id");
|
|
1407
|
-
return require_providers.loadApiProvider(provider.id, {
|
|
1408
|
-
options: provider,
|
|
1409
|
-
basePath: require_logger.state.basePath
|
|
1410
|
-
});
|
|
1411
|
-
}
|
|
1412
|
-
function isSimulatedUserProviderConfig(provider) {
|
|
1413
|
-
if (typeof provider === "string") return provider === "promptfoo:simulated-user";
|
|
1414
|
-
if (!provider || typeof provider !== "object" || Array.isArray(provider)) return false;
|
|
1415
|
-
if (typeof provider.id === "function") return provider.id() === "promptfoo:simulated-user";
|
|
1416
|
-
const providerId = provider.id;
|
|
1417
|
-
if (typeof providerId === "string") return providerId === "promptfoo:simulated-user";
|
|
1418
|
-
return Object.values(provider).some((providerTypeConfig) => isSimulatedUserProviderConfig(providerTypeConfig));
|
|
1419
|
-
}
|
|
1420
|
-
async function getGradingProvider(type, provider, defaultProvider) {
|
|
1421
|
-
let finalProvider;
|
|
1422
|
-
if (typeof provider === "string") finalProvider = await require_providers.loadApiProvider(provider, { basePath: require_logger.state.basePath });
|
|
1423
|
-
else if (typeof provider === "object" && typeof provider.id === "function") finalProvider = provider;
|
|
1424
|
-
else if (typeof provider === "object") {
|
|
1425
|
-
const typeValue = provider[type];
|
|
1426
|
-
if (typeValue) finalProvider = await getGradingProvider(type, typeValue, defaultProvider);
|
|
1427
|
-
else if (provider.id) finalProvider = await loadFromProviderOptions(provider);
|
|
1428
|
-
else if (Array.isArray(provider)) throw new Error(`Provider must be an object or string, but received an array.\n\nCheck that the provider ${JSON.stringify(provider[0], null, 2)} is not nested in an array.`);
|
|
1429
|
-
else throw new Error(`Invalid provider definition for output type '${type}': ${JSON.stringify(provider, null, 2)}`);
|
|
1210
|
+
function readProviderPromptMap(config, parsedPrompts) {
|
|
1211
|
+
const ret = {};
|
|
1212
|
+
if (!config.providers) return ret;
|
|
1213
|
+
const allPrompts = [];
|
|
1214
|
+
for (const prompt of parsedPrompts) allPrompts.push(prompt.label);
|
|
1215
|
+
if (typeof config.providers === "string") return { [config.providers]: allPrompts };
|
|
1216
|
+
if (typeof config.providers === "function") return { "Custom function": allPrompts };
|
|
1217
|
+
for (const provider of config.providers) if (typeof provider === "object") if (provider.id) {
|
|
1218
|
+
const rawProvider = provider;
|
|
1219
|
+
require_invariant.invariant(rawProvider.id, "You must specify an `id` on the Provider when you override options.prompts");
|
|
1220
|
+
ret[rawProvider.id] = rawProvider.prompts || allPrompts;
|
|
1221
|
+
if (rawProvider.label) ret[rawProvider.label] = rawProvider.prompts || allPrompts;
|
|
1430
1222
|
} else {
|
|
1431
|
-
const
|
|
1432
|
-
const
|
|
1433
|
-
const
|
|
1434
|
-
|
|
1435
|
-
defaultTestObj?.options?.provider?.text || void 0,
|
|
1436
|
-
defaultTestObj?.options?.provider || void 0
|
|
1437
|
-
].find((candidateProvider) => {
|
|
1438
|
-
if (!candidateProvider) return false;
|
|
1439
|
-
if (isSimulatedUserProviderConfig(candidateProvider)) {
|
|
1440
|
-
require_logger.logger.debug("[Grading] Skipping promptfoo:simulated-user as an implicit grader fallback");
|
|
1441
|
-
return false;
|
|
1442
|
-
}
|
|
1443
|
-
return true;
|
|
1444
|
-
});
|
|
1445
|
-
if (cfg) {
|
|
1446
|
-
finalProvider = await getGradingProvider(type, cfg, defaultProvider);
|
|
1447
|
-
if (finalProvider) require_logger.logger.debug(`[Grading] Using provider from defaultTest fallback: ${finalProvider.id()}`);
|
|
1448
|
-
} else finalProvider = defaultProvider;
|
|
1223
|
+
const rawProvider = provider;
|
|
1224
|
+
const originalId = Object.keys(rawProvider)[0];
|
|
1225
|
+
const id = rawProvider[originalId].id || originalId;
|
|
1226
|
+
ret[id] = rawProvider[originalId].prompts || allPrompts;
|
|
1449
1227
|
}
|
|
1450
|
-
return
|
|
1451
|
-
}
|
|
1452
|
-
async function getAndCheckProvider(type, provider, defaultProvider, checkName) {
|
|
1453
|
-
const matchedProvider = await getGradingProvider(type, provider, defaultProvider);
|
|
1454
|
-
if (!matchedProvider) if (defaultProvider) {
|
|
1455
|
-
require_logger.logger.warn(`No provider of type ${type} found for '${checkName}', falling back to default`);
|
|
1456
|
-
return defaultProvider;
|
|
1457
|
-
} else throw new Error(`No provider of type ${type} found for '${checkName}'`);
|
|
1458
|
-
let isValidProviderType = true;
|
|
1459
|
-
if (type === "embedding") isValidProviderType = "callEmbeddingApi" in matchedProvider || "callSimilarityApi" in matchedProvider;
|
|
1460
|
-
else if (type === "classification") isValidProviderType = "callClassificationApi" in matchedProvider;
|
|
1461
|
-
else if (type === "moderation") isValidProviderType = "callModerationApi" in matchedProvider;
|
|
1462
|
-
if (!isValidProviderType) if (defaultProvider) {
|
|
1463
|
-
require_logger.logger.warn(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}', falling back to default`);
|
|
1464
|
-
return defaultProvider;
|
|
1465
|
-
} else throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
|
|
1466
|
-
return matchedProvider;
|
|
1467
|
-
}
|
|
1468
|
-
function fail(reason, tokensUsed) {
|
|
1469
|
-
return {
|
|
1470
|
-
pass: false,
|
|
1471
|
-
reason,
|
|
1472
|
-
score: 0,
|
|
1473
|
-
tokensUsed: {
|
|
1474
|
-
total: tokensUsed?.total || 0,
|
|
1475
|
-
prompt: tokensUsed?.prompt || 0,
|
|
1476
|
-
completion: tokensUsed?.completion || 0,
|
|
1477
|
-
cached: tokensUsed?.cached || 0,
|
|
1478
|
-
numRequests: tokensUsed?.numRequests || 0,
|
|
1479
|
-
completionDetails: tokensUsed?.completionDetails
|
|
1480
|
-
}
|
|
1481
|
-
};
|
|
1228
|
+
return ret;
|
|
1482
1229
|
}
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1230
|
+
/**
|
|
1231
|
+
* Processes a raw prompt based on its content type and path.
|
|
1232
|
+
* @param prompt - The raw prompt data.
|
|
1233
|
+
* @param basePath - Base path for file resolution.
|
|
1234
|
+
* @param maxRecursionDepth - Maximum recursion depth for globbing.
|
|
1235
|
+
* @returns Promise resolving to an array of processed prompts.
|
|
1236
|
+
*/
|
|
1237
|
+
async function processPrompt(prompt, basePath = "", maxRecursionDepth = 1) {
|
|
1238
|
+
require_invariant.invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
|
|
1239
|
+
if (prompt.function) return [prompt];
|
|
1240
|
+
if (prompt.raw.startsWith("exec:")) {
|
|
1241
|
+
const { filePath, functionName } = require_util.parsePathOrGlob(basePath, prompt.raw.substring(5));
|
|
1242
|
+
return await processExecutableFile(filePath, prompt, functionName);
|
|
1243
|
+
}
|
|
1244
|
+
if (!require_utils.maybeFilePath(prompt.raw)) return processString(prompt);
|
|
1245
|
+
const { extension, functionName, isPathPattern, filePath } = require_util.parsePathOrGlob(basePath, prompt.raw);
|
|
1246
|
+
if (isPathPattern && maxRecursionDepth > 0) {
|
|
1247
|
+
const globbedPath = (0, glob.globSync)(filePath.replace(/\\/g, "/"), { windowsPathsNoEscape: true });
|
|
1248
|
+
require_logger.logger.debug(`Expanded prompt ${prompt.raw} to ${filePath} and then to ${JSON.stringify(globbedPath)}`);
|
|
1249
|
+
const prompts = [];
|
|
1250
|
+
for (const globbedFilePath of globbedPath) {
|
|
1251
|
+
const processedPrompts = await processPrompt({ raw: functionName ? `${globbedFilePath}:${functionName}` : globbedFilePath }, basePath, maxRecursionDepth - 1);
|
|
1252
|
+
prompts.push(...processedPrompts);
|
|
1494
1253
|
}
|
|
1495
|
-
|
|
1496
|
-
}
|
|
1497
|
-
|
|
1498
|
-
return {
|
|
1499
|
-
total: 0,
|
|
1500
|
-
prompt: 0,
|
|
1501
|
-
completion: 0,
|
|
1502
|
-
cached: 0,
|
|
1503
|
-
numRequests: 0,
|
|
1504
|
-
completionDetails: {
|
|
1505
|
-
reasoning: 0,
|
|
1506
|
-
acceptedPrediction: 0,
|
|
1507
|
-
rejectedPrediction: 0
|
|
1254
|
+
if (prompts.length === 0) {
|
|
1255
|
+
require_logger.logger.debug(`Attempted to load file at "${prompt.raw}", but no file found. Using raw string.`);
|
|
1256
|
+
prompts.push(...processString(prompt));
|
|
1508
1257
|
}
|
|
1509
|
-
|
|
1258
|
+
return prompts;
|
|
1259
|
+
}
|
|
1260
|
+
if (extension === ".csv") return processCsvPrompts(filePath, prompt);
|
|
1261
|
+
if (extension === ".j2") return processJinjaFile(filePath, prompt);
|
|
1262
|
+
if (extension === ".json") return processJsonFile(filePath, prompt);
|
|
1263
|
+
if (extension === ".jsonl") return processJsonlFile(filePath, prompt);
|
|
1264
|
+
if (extension && require_fileExtensions.isJavascriptFile(extension)) return processJsFile(filePath, prompt, functionName);
|
|
1265
|
+
if (extension === ".md") return processMarkdownFile(filePath, prompt);
|
|
1266
|
+
if (extension === ".py") return processPythonFile(filePath, prompt, functionName);
|
|
1267
|
+
if (extension === ".txt") return processTxtFile(filePath, prompt);
|
|
1268
|
+
if (extension && [".yml", ".yaml"].includes(extension)) return processYamlFile(filePath, prompt);
|
|
1269
|
+
if (extension && [
|
|
1270
|
+
".sh",
|
|
1271
|
+
".bash",
|
|
1272
|
+
".exe",
|
|
1273
|
+
".bat",
|
|
1274
|
+
".cmd",
|
|
1275
|
+
".ps1",
|
|
1276
|
+
".rb",
|
|
1277
|
+
".pl"
|
|
1278
|
+
].includes(extension)) return await processExecutableFile(filePath, prompt, functionName);
|
|
1279
|
+
try {
|
|
1280
|
+
const stats = await (0, fs_promises.stat)(filePath);
|
|
1281
|
+
if (stats.isFile() && (stats.mode & 73) !== 0) return await processExecutableFile(filePath, prompt, functionName);
|
|
1282
|
+
} catch (_e) {}
|
|
1283
|
+
return [];
|
|
1510
1284
|
}
|
|
1511
|
-
|
|
1512
|
-
|
|
1285
|
+
/**
|
|
1286
|
+
* Reads and processes prompts from a specified path or glob pattern.
|
|
1287
|
+
* @param promptPathOrGlobs - The path or glob pattern.
|
|
1288
|
+
* @param basePath - Base path for file resolution.
|
|
1289
|
+
* @returns Promise resolving to an array of processed prompts.
|
|
1290
|
+
*/
|
|
1291
|
+
async function readPrompts(promptPathOrGlobs, basePath = "") {
|
|
1292
|
+
require_logger.logger.debug(`Reading prompts from ${JSON.stringify(promptPathOrGlobs)}`);
|
|
1293
|
+
const promptPartials = require_utils.normalizeInput(promptPathOrGlobs);
|
|
1294
|
+
const prompts = [];
|
|
1295
|
+
for (const prompt of promptPartials) {
|
|
1296
|
+
const promptBatch = await processPrompt(prompt, basePath);
|
|
1297
|
+
if (promptBatch.length === 0) throw new Error(`There are no prompts in ${JSON.stringify(prompt.raw)}`);
|
|
1298
|
+
prompts.push(...promptBatch);
|
|
1299
|
+
}
|
|
1300
|
+
return prompts;
|
|
1513
1301
|
}
|
|
1514
|
-
function
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1302
|
+
async function processPrompts(prompts) {
|
|
1303
|
+
return (await Promise.all(prompts.map(async (promptInput) => {
|
|
1304
|
+
if (typeof promptInput === "function") return {
|
|
1305
|
+
raw: promptInput.toString(),
|
|
1306
|
+
label: promptInput?.name ?? promptInput.toString(),
|
|
1307
|
+
function: promptInput
|
|
1308
|
+
};
|
|
1309
|
+
else if (typeof promptInput === "string") return readPrompts(promptInput);
|
|
1310
|
+
try {
|
|
1311
|
+
return require_types.PromptSchema.parse(promptInput);
|
|
1312
|
+
} catch (error) {
|
|
1313
|
+
require_logger.logger.warn(`Prompt input is not a valid prompt schema: ${error}\nFalling back to serialized JSON as raw prompt.`);
|
|
1314
|
+
return {
|
|
1315
|
+
raw: JSON.stringify(promptInput),
|
|
1316
|
+
label: JSON.stringify(promptInput)
|
|
1317
|
+
};
|
|
1527
1318
|
}
|
|
1528
|
-
};
|
|
1529
|
-
}
|
|
1530
|
-
function accumulateTokens(target, update) {
|
|
1531
|
-
require_tokenUsageUtils.accumulateTokenUsage(target, update);
|
|
1319
|
+
}))).flat();
|
|
1532
1320
|
}
|
|
1533
|
-
|
|
1534
|
-
|
|
1321
|
+
const GEVAL_PROMPT_STEPS = `
|
|
1322
|
+
Given evaluation criteria that outline how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to check whether the criteria are satisfied by the text.
|
|
1323
|
+
|
|
1324
|
+
**EVALUATION CRITERIA**
|
|
1325
|
+
{{criteria}}
|
|
1326
|
+
|
|
1327
|
+
**OUTPUT FORMAT**
|
|
1328
|
+
IMPORTANT:
|
|
1329
|
+
- Return output ONLY as a minified JSON object (no code fences).
|
|
1330
|
+
- The JSON object must contain a single key, "steps", whose value is a list of strings.
|
|
1331
|
+
- Each string must represent one evaluation step.
|
|
1332
|
+
- Do NOT include any explanations, commentary, extra text, or additional formatting.
|
|
1333
|
+
|
|
1334
|
+
Format:
|
|
1335
|
+
{"steps": <list_of_strings>}
|
|
1336
|
+
|
|
1337
|
+
Example:
|
|
1338
|
+
{"steps":["<Evaluation Step 1>","<Evaluation Step 2>","<Evaluation Step 3>","<Evaluation Step 4>"]}
|
|
1339
|
+
|
|
1340
|
+
Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
|
|
1341
|
+
JSON:
|
|
1342
|
+
`;
|
|
1343
|
+
const GEVAL_PROMPT_EVALUATE = `
|
|
1344
|
+
You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
|
|
1345
|
+
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
|
1346
|
+
|
|
1347
|
+
**Evaluation Criteria**
|
|
1348
|
+
{{criteria}}
|
|
1349
|
+
|
|
1350
|
+
**Evaluation Steps**
|
|
1351
|
+
- {{steps}}
|
|
1352
|
+
Given the evaluation steps, return a JSON with two keys:
|
|
1353
|
+
1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the condition described by the Evaluation Criteria is fully and clearly observed in the Reply according to the Evaluation Steps, and 0 indicates that it is not observed at all;
|
|
1354
|
+
2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
|
|
1355
|
+
|
|
1356
|
+
**Prompt**
|
|
1357
|
+
{{input}}
|
|
1358
|
+
|
|
1359
|
+
**Reply**
|
|
1360
|
+
{{output}}
|
|
1361
|
+
|
|
1362
|
+
**OUTPUT FORMAT**
|
|
1363
|
+
IMPORTANT:
|
|
1364
|
+
- Return output ONLY as a minified JSON object (no code fences).
|
|
1365
|
+
- The JSON object must contain exactly two keys: "score" and "reason".
|
|
1366
|
+
- No additional words, explanations, or formatting are needed.
|
|
1367
|
+
- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
|
|
1368
|
+
|
|
1369
|
+
Example JSON:
|
|
1370
|
+
{"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
|
|
1371
|
+
|
|
1372
|
+
Here is the final evaluation in the required minified JSON format:
|
|
1373
|
+
JSON:
|
|
1374
|
+
`;
|
|
1375
|
+
//#endregion
|
|
1376
|
+
//#region src/remoteGrading.ts
|
|
1377
|
+
async function doRemoteGrading(payload) {
|
|
1535
1378
|
try {
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1379
|
+
payload.email = require_accounts.getUserEmail();
|
|
1380
|
+
const body = JSON.stringify(payload);
|
|
1381
|
+
require_logger.logger.debug(`Performing remote grading: ${body}`);
|
|
1382
|
+
const { data, status, statusText } = await require_cache.fetchWithCache(require_server.getRemoteGenerationUrl(), {
|
|
1383
|
+
method: "POST",
|
|
1384
|
+
headers: { "Content-Type": "application/json" },
|
|
1385
|
+
body
|
|
1386
|
+
}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
1387
|
+
require_logger.logger.debug(`Remote grading result: status=${status}, statusText=${statusText}, data=${JSON.stringify(data)}`);
|
|
1388
|
+
if (status !== 200) throw new Error(`Remote grading failed with status ${status}: ${statusText} ${JSON.stringify(data)}`);
|
|
1389
|
+
const { result } = data;
|
|
1390
|
+
if (!result || result.pass === void 0) throw new Error(`Remote grading failed. Response data is invalid: ${JSON.stringify(data)}`);
|
|
1391
|
+
return {
|
|
1392
|
+
pass: result.pass,
|
|
1393
|
+
score: result.score,
|
|
1394
|
+
reason: result.reason,
|
|
1395
|
+
tokensUsed: result.tokensUsed
|
|
1396
|
+
};
|
|
1543
1397
|
} catch (error) {
|
|
1544
|
-
|
|
1398
|
+
throw new Error(`Could not perform remote grading: ${error}`);
|
|
1545
1399
|
}
|
|
1546
1400
|
}
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
async function computeSimilarityFromNativeProvider(provider, expected, output, metric) {
|
|
1553
|
-
const tokensUsed = createMatcherTokenUsage();
|
|
1554
|
-
if (metric !== "cosine") return { failure: fail(`Provider ${provider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed) };
|
|
1555
|
-
const similarityResp = await provider.callSimilarityApi(expected, output);
|
|
1556
|
-
copySimilarityTokenUsage(tokensUsed, similarityResp);
|
|
1557
|
-
if (similarityResp.error) return { failure: fail(similarityResp.error, tokensUsed) };
|
|
1558
|
-
if (similarityResp.similarity == null) return { failure: fail("Unknown error fetching similarity", tokensUsed) };
|
|
1559
|
-
return {
|
|
1560
|
-
similarity: similarityResp.similarity,
|
|
1561
|
-
tokensUsed
|
|
1562
|
-
};
|
|
1401
|
+
//#endregion
|
|
1402
|
+
//#region src/remoteScoring.ts
|
|
1403
|
+
function getWithPiApiKey() {
|
|
1404
|
+
const withPiApiKey = require_logger.getEnvString("WITHPI_API_KEY");
|
|
1405
|
+
if (withPiApiKey) return withPiApiKey;
|
|
1563
1406
|
}
|
|
1564
|
-
|
|
1565
|
-
const expectedEmbedding = await provider.callEmbeddingApi(expected);
|
|
1566
|
-
const outputEmbedding = await provider.callEmbeddingApi(output);
|
|
1567
|
-
const tokensUsed = combineEmbeddingTokenUsage(expectedEmbedding, outputEmbedding);
|
|
1568
|
-
if (expectedEmbedding.error || outputEmbedding.error) return { failure: fail(expectedEmbedding.error || outputEmbedding.error || "Unknown error fetching embeddings", tokensUsed) };
|
|
1569
|
-
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) return { failure: fail("Embedding not found", tokensUsed) };
|
|
1407
|
+
function convertPiResultToGradingResult(result, threshold) {
|
|
1570
1408
|
return {
|
|
1571
|
-
|
|
1572
|
-
|
|
1409
|
+
pass: result.total_score > threshold,
|
|
1410
|
+
score: result.total_score,
|
|
1411
|
+
namedScores: result.question_scores,
|
|
1412
|
+
reason: "Pi Scorer"
|
|
1573
1413
|
};
|
|
1574
1414
|
}
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1415
|
+
const WITHPI_API_URL = `https://api.withpi.ai/v1/scoring_system/score`;
|
|
1416
|
+
async function doRemoteScoringWithPi(payload, passThreshold = .5) {
|
|
1417
|
+
try {
|
|
1418
|
+
const apiKey = getWithPiApiKey();
|
|
1419
|
+
if (apiKey) {
|
|
1420
|
+
const body = JSON.stringify(payload);
|
|
1421
|
+
require_logger.logger.debug(`Performing remote scoring with pi: ${body}`);
|
|
1422
|
+
const { data } = await require_cache.fetchWithCache(WITHPI_API_URL, {
|
|
1423
|
+
method: "POST",
|
|
1424
|
+
headers: {
|
|
1425
|
+
"Content-Type": "application/json",
|
|
1426
|
+
"x-api-key": apiKey
|
|
1427
|
+
},
|
|
1428
|
+
body
|
|
1429
|
+
}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
1430
|
+
return convertPiResultToGradingResult(data, passThreshold);
|
|
1431
|
+
} else throw new Error(`Env var WITHPI_API_KEY must be set. Visit https://docs.withpi.ai for more information.`);
|
|
1432
|
+
} catch (error) {
|
|
1433
|
+
throw new Error(`Could not perform remote grading: ${error}`);
|
|
1581
1434
|
}
|
|
1582
1435
|
}
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1436
|
+
//#endregion
|
|
1437
|
+
//#region src/matchers/llmGrading.ts
|
|
1438
|
+
const FACTUALITY_CATEGORY_DESCRIPTIONS = {
|
|
1439
|
+
A: "The submitted answer is a subset of the expert answer and is fully consistent with it.",
|
|
1440
|
+
B: "The submitted answer is a superset of the expert answer and is fully consistent with it.",
|
|
1441
|
+
C: "The submitted answer contains all the same details as the expert answer.",
|
|
1442
|
+
D: "There is a disagreement between the submitted answer and the expert answer.",
|
|
1443
|
+
E: "The answers differ, but these differences don't matter from the perspective of factuality."
|
|
1444
|
+
};
|
|
1445
|
+
function getFactualityScoreLookup(grading) {
|
|
1591
1446
|
return {
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1447
|
+
A: grading.factuality?.subset ?? 1,
|
|
1448
|
+
B: grading.factuality?.superset ?? 1,
|
|
1449
|
+
C: grading.factuality?.agree ?? 1,
|
|
1450
|
+
D: grading.factuality?.disagree ?? 0,
|
|
1451
|
+
E: grading.factuality?.differButFactual ?? 1
|
|
1596
1452
|
};
|
|
1597
1453
|
}
|
|
1598
|
-
function
|
|
1599
|
-
const
|
|
1600
|
-
const
|
|
1601
|
-
const
|
|
1454
|
+
function buildFactualityResult(option, reason, grading, resp) {
|
|
1455
|
+
const scoreLookup = getFactualityScoreLookup(grading);
|
|
1456
|
+
const passing = Object.keys(scoreLookup).filter((key) => scoreLookup[key] > 0);
|
|
1457
|
+
const failing = Object.keys(scoreLookup).filter((key) => scoreLookup[key] === 0);
|
|
1458
|
+
const pass = passing.includes(option) && !failing.includes(option);
|
|
1602
1459
|
return {
|
|
1603
1460
|
pass,
|
|
1604
|
-
score:
|
|
1605
|
-
reason
|
|
1606
|
-
tokensUsed
|
|
1461
|
+
score: scoreLookup[option] ?? (pass ? 1 : 0),
|
|
1462
|
+
reason,
|
|
1463
|
+
tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage)
|
|
1607
1464
|
};
|
|
1608
1465
|
}
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
});
|
|
1616
|
-
if (remoteResult) return remoteResult;
|
|
1617
|
-
const defaults = await getDefaultProviders();
|
|
1618
|
-
const computation = await computeNativeSimilarity(await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "similarity check"), expected, output, metric);
|
|
1619
|
-
return "failure" in computation ? computation.failure : buildSimilarityResult(computation.similarity, threshold, inverse, metric, computation.tokensUsed);
|
|
1620
|
-
}
|
|
1621
|
-
/**
|
|
1622
|
-
*
|
|
1623
|
-
* @param expected Expected classification. If undefined, matches any classification.
|
|
1624
|
-
* @param output Text to classify.
|
|
1625
|
-
* @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
|
|
1626
|
-
* @param grading
|
|
1627
|
-
* @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
|
|
1628
|
-
*/
|
|
1629
|
-
async function matchesClassification(expected, output, threshold, grading) {
|
|
1630
|
-
const resp = await (await getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
|
|
1631
|
-
if (!resp.classification) return fail(resp.error || "Unknown error fetching classification");
|
|
1632
|
-
let score;
|
|
1633
|
-
if (expected === void 0) score = Math.max(...Object.values(resp.classification));
|
|
1634
|
-
else score = resp.classification[expected] || 0;
|
|
1635
|
-
if (score >= threshold - Number.EPSILON) {
|
|
1636
|
-
const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
|
|
1466
|
+
function parseFactualityJsonResponse(responseText) {
|
|
1467
|
+
try {
|
|
1468
|
+
const jsonData = require_logger.extractFirstJsonObject(responseText);
|
|
1469
|
+
if (!jsonData?.category || typeof jsonData.category !== "string") return;
|
|
1470
|
+
const option = jsonData.category.trim().toUpperCase();
|
|
1471
|
+
if (!/^[A-E]$/.test(option)) throw new Error(`Invalid category value: ${option}`);
|
|
1637
1472
|
return {
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
reason
|
|
1473
|
+
option,
|
|
1474
|
+
reason: jsonData.reason?.trim() || `Category ${option}: ${FACTUALITY_CATEGORY_DESCRIPTIONS[option]}`
|
|
1641
1475
|
};
|
|
1642
|
-
}
|
|
1643
|
-
return {
|
|
1644
|
-
pass: false,
|
|
1645
|
-
score,
|
|
1646
|
-
reason: `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
|
|
1647
|
-
};
|
|
1648
|
-
}
|
|
1649
|
-
async function loadRubricPrompt(rubricPrompt, defaultPrompt) {
|
|
1650
|
-
if (!rubricPrompt || typeof rubricPrompt === "object" && Object.keys(rubricPrompt).length === 0) return defaultPrompt;
|
|
1651
|
-
if (typeof rubricPrompt === "string" && rubricPrompt.startsWith("file://")) {
|
|
1652
|
-
const basePath = require_logger.state.basePath || "";
|
|
1653
|
-
const { filePath, functionName } = require_util.parseFileUrl(require_util.getNunjucksEngineForFilePath().renderString(rubricPrompt, {}));
|
|
1654
|
-
const resolvedPath = path.default.resolve(basePath, filePath);
|
|
1655
|
-
if (require_fileExtensions.isJavascriptFile(filePath)) rubricPrompt = await loadFromJavaScriptFile(resolvedPath, functionName, []);
|
|
1656
|
-
else {
|
|
1657
|
-
if (!fs.existsSync(resolvedPath)) throw new Error(`File does not exist: ${resolvedPath}`);
|
|
1658
|
-
rubricPrompt = fs.readFileSync(resolvedPath, "utf8");
|
|
1659
|
-
}
|
|
1660
|
-
} else rubricPrompt = require_util.maybeLoadFromExternalFile(rubricPrompt);
|
|
1661
|
-
if (typeof rubricPrompt === "object") rubricPrompt = JSON.stringify(rubricPrompt);
|
|
1662
|
-
require_invariant.invariant(typeof rubricPrompt === "string", "rubricPrompt must be a string");
|
|
1663
|
-
return rubricPrompt;
|
|
1664
|
-
}
|
|
1665
|
-
function tryParse(content) {
|
|
1666
|
-
try {
|
|
1667
|
-
return JSON.parse(content);
|
|
1668
|
-
} catch {}
|
|
1669
|
-
return content;
|
|
1670
|
-
}
|
|
1671
|
-
function splitIntoSentences(text) {
|
|
1672
|
-
return text.split("\n").filter((sentence) => sentence.trim() !== "");
|
|
1673
|
-
}
|
|
1674
|
-
function processContextForTemplating(context, enableObjectAccess) {
|
|
1675
|
-
if (enableObjectAccess) return context;
|
|
1676
|
-
return Object.fromEntries(Object.entries(context).map(([key, value]) => {
|
|
1677
|
-
if (value && typeof value === "object") {
|
|
1678
|
-
if (Array.isArray(value)) return [key, value.map((item) => item && typeof item === "object" ? JSON.stringify(item) : item)];
|
|
1679
|
-
return [key, JSON.stringify(value)];
|
|
1680
|
-
}
|
|
1681
|
-
return [key, value];
|
|
1682
|
-
}));
|
|
1683
|
-
}
|
|
1684
|
-
async function renderLlmRubricPrompt(rubricPrompt, context) {
|
|
1685
|
-
const processedContext = processContextForTemplating(context, require_logger.getEnvBool("PROMPTFOO_DISABLE_OBJECT_STRINGIFY", false));
|
|
1686
|
-
try {
|
|
1687
|
-
const parsed = JSON.parse(rubricPrompt, (_k, v) => typeof v === "string" ? nunjucks.renderString(v, processedContext) : v);
|
|
1688
|
-
return JSON.stringify(parsed);
|
|
1689
|
-
} catch {}
|
|
1690
|
-
return nunjucks.renderString(rubricPrompt, processedContext);
|
|
1691
|
-
}
|
|
1692
|
-
function parseJsonGradingResponse(label, resp) {
|
|
1693
|
-
let jsonObjects = [];
|
|
1694
|
-
if (typeof resp.output === "string") try {
|
|
1695
|
-
jsonObjects = require_logger.extractJsonObjects(resp.output);
|
|
1696
|
-
if (jsonObjects.length === 0) return { failure: fail(`Could not extract JSON from ${label} response`, resp.tokenUsage) };
|
|
1697
1476
|
} catch (err) {
|
|
1698
|
-
|
|
1477
|
+
const error = err;
|
|
1478
|
+
if (error.message.startsWith("Invalid category value:")) throw error;
|
|
1479
|
+
require_logger.logger.debug(`JSON parsing failed: ${error.message}`);
|
|
1480
|
+
return;
|
|
1699
1481
|
}
|
|
1700
|
-
else if (typeof resp.output === "object" && resp.output !== null && !Array.isArray(resp.output)) jsonObjects = [resp.output];
|
|
1701
|
-
else return { failure: fail(`${label} produced malformed response - output must be string or object. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
|
|
1702
|
-
const parsed = jsonObjects[0];
|
|
1703
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return { failure: fail(`${label} produced malformed response. We were not able to parse the response as JSON. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
|
|
1704
|
-
return { parsed };
|
|
1705
1482
|
}
|
|
1706
|
-
|
|
1707
|
-
const
|
|
1708
|
-
|
|
1709
|
-
const
|
|
1710
|
-
const
|
|
1711
|
-
if (resp.error || !resp.output) {
|
|
1712
|
-
if (throwOnError) throw new Error(resp.error || "No output");
|
|
1713
|
-
return fail(resp.error || "No output", resp.tokenUsage);
|
|
1714
|
-
}
|
|
1715
|
-
const { parsed, failure } = parseJsonGradingResponse(label, resp);
|
|
1716
|
-
if (!parsed) return failure;
|
|
1717
|
-
let pass = parsed.pass ?? true;
|
|
1718
|
-
if (typeof pass !== "boolean") pass = /^(true|yes|pass|y)$/i.test(String(pass));
|
|
1719
|
-
let score = parsed.score;
|
|
1720
|
-
if (typeof score !== "number") score = Number.isFinite(Number(score)) ? Number(score) : Number(pass);
|
|
1721
|
-
const threshold = typeof assertion?.threshold === "string" ? Number(assertion.threshold) : assertion?.threshold;
|
|
1722
|
-
if (typeof threshold === "number" && Number.isFinite(threshold)) pass = pass && score >= threshold;
|
|
1723
|
-
const reason = parsed.reason || (pass ? "Grading passed" : `Score ${score} below threshold ${threshold}`);
|
|
1724
|
-
let responseMetadata = {};
|
|
1725
|
-
if (resp.metadata && typeof resp.metadata === "object" && !Array.isArray(resp.metadata)) {
|
|
1726
|
-
const serializedMetadata = require_logger.safeJsonStringify(resp.metadata);
|
|
1727
|
-
responseMetadata = serializedMetadata ? JSON.parse(serializedMetadata) : {};
|
|
1728
|
-
}
|
|
1483
|
+
function parseLegacyFactualityResponse(responseText) {
|
|
1484
|
+
const answerMatch = responseText.match(/\s*\(?([a-eA-E])\)/);
|
|
1485
|
+
if (!answerMatch) throw new Error(`Factuality checker output did not match expected format: ${responseText}`);
|
|
1486
|
+
const option = answerMatch[1].toUpperCase();
|
|
1487
|
+
const reasonMatch = responseText.match(/\)\s*(.*)/s);
|
|
1729
1488
|
return {
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
score,
|
|
1733
|
-
reason,
|
|
1734
|
-
tokensUsed: {
|
|
1735
|
-
total: resp.tokenUsage?.total || 0,
|
|
1736
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
1737
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
1738
|
-
cached: resp.tokenUsage?.cached || 0,
|
|
1739
|
-
numRequests: resp.tokenUsage?.numRequests || 0,
|
|
1740
|
-
completionDetails: parsed.tokensUsed?.completionDetails || {
|
|
1741
|
-
reasoning: 0,
|
|
1742
|
-
acceptedPrediction: 0,
|
|
1743
|
-
rejectedPrediction: 0
|
|
1744
|
-
}
|
|
1745
|
-
},
|
|
1746
|
-
metadata: {
|
|
1747
|
-
...responseMetadata,
|
|
1748
|
-
renderedGradingPrompt: prompt
|
|
1749
|
-
}
|
|
1489
|
+
option,
|
|
1490
|
+
reason: reasonMatch?.[1] ? reasonMatch[1].trim() : responseText
|
|
1750
1491
|
};
|
|
1751
1492
|
}
|
|
1752
1493
|
async function matchesLlmRubric(rubric, llmOutput, grading, vars, assertion, options, providerCallContext) {
|
|
1753
1494
|
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1495
|
+
const shouldPreferRemote = options?.preferRemote || grading.__promptfooPreferRemote || !grading.provider;
|
|
1496
|
+
if (!grading.rubricPrompt && shouldPreferRemote && !require_logger.state.config?.redteam?.provider && require_logger.state.config?.redteam && require_server.shouldGenerateRemote({ canUseCodexDefaultProvider: true })) try {
|
|
1497
|
+
return {
|
|
1498
|
+
...await doRemoteGrading({
|
|
1499
|
+
task: "llm-rubric",
|
|
1500
|
+
rubric,
|
|
1501
|
+
output: llmOutput,
|
|
1502
|
+
vars: vars || {}
|
|
1503
|
+
}),
|
|
1504
|
+
assertion
|
|
1505
|
+
};
|
|
1506
|
+
} catch (error) {
|
|
1507
|
+
return {
|
|
1508
|
+
...fail(`Could not perform remote grading: ${error}`),
|
|
1509
|
+
assertion
|
|
1510
|
+
};
|
|
1511
|
+
}
|
|
1763
1512
|
try {
|
|
1764
1513
|
return await runJsonGradingPrompt({
|
|
1765
1514
|
assertion,
|
|
@@ -1807,89 +1556,42 @@ async function matchesPiScore(renderedValue, llmInput, llmOutput, assertion) {
|
|
|
1807
1556
|
assertion
|
|
1808
1557
|
};
|
|
1809
1558
|
}
|
|
1810
|
-
function
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
function getFactualityScoreLookup(grading) {
|
|
1814
|
-
return {
|
|
1815
|
-
A: grading.factuality?.subset ?? 1,
|
|
1816
|
-
B: grading.factuality?.superset ?? 1,
|
|
1817
|
-
C: grading.factuality?.agree ?? 1,
|
|
1818
|
-
D: grading.factuality?.disagree ?? 0,
|
|
1819
|
-
E: grading.factuality?.differButFactual ?? 1
|
|
1820
|
-
};
|
|
1821
|
-
}
|
|
1822
|
-
function buildFactualityCategoryResult(category, reason, grading, tokensUsed) {
|
|
1823
|
-
const option = category.trim().toUpperCase();
|
|
1824
|
-
if (!isFactualityCategory(option)) return fail(`Invalid category value: ${option}`, tokensUsed);
|
|
1825
|
-
const score = getFactualityScoreLookup(grading)[option];
|
|
1826
|
-
return {
|
|
1827
|
-
pass: score > 0,
|
|
1828
|
-
score,
|
|
1829
|
-
reason: reason?.trim() || `Category ${option}: ${FACTUALITY_CATEGORY_DESCRIPTIONS[option]}`,
|
|
1830
|
-
tokensUsed: normalizeTokenUsage(tokensUsed)
|
|
1831
|
-
};
|
|
1832
|
-
}
|
|
1833
|
-
function parseJsonFactualityOutput(output) {
|
|
1834
|
-
try {
|
|
1835
|
-
const jsonData = require_logger.extractFirstJsonObject(output);
|
|
1836
|
-
return typeof jsonData?.category === "string" ? {
|
|
1837
|
-
category: jsonData.category,
|
|
1838
|
-
reason: jsonData.reason
|
|
1839
|
-
} : null;
|
|
1840
|
-
} catch (err) {
|
|
1841
|
-
require_logger.logger.debug(`JSON parsing failed: ${err.message}`);
|
|
1842
|
-
return null;
|
|
1843
|
-
}
|
|
1844
|
-
}
|
|
1845
|
-
function parseLegacyFactualityOutput(output) {
|
|
1846
|
-
const answerMatch = output.match(/\s*\(?([a-eA-E])\)/);
|
|
1847
|
-
if (!answerMatch) return { failure: `Factuality checker output did not match expected format: ${output}` };
|
|
1848
|
-
const reasonMatch = output.match(/\)\s*(.*)/s);
|
|
1849
|
-
return {
|
|
1850
|
-
category: answerMatch[1],
|
|
1851
|
-
reason: reasonMatch?.[1]?.trim() || output
|
|
1852
|
-
};
|
|
1853
|
-
}
|
|
1854
|
-
function gradeFactualityOutput(output, grading, tokensUsed) {
|
|
1855
|
-
const jsonResult = parseJsonFactualityOutput(output);
|
|
1856
|
-
if (jsonResult) return buildFactualityCategoryResult(jsonResult.category, jsonResult.reason, grading, tokensUsed);
|
|
1857
|
-
require_logger.logger.info("Falling back to legacy pattern matching for factuality check");
|
|
1858
|
-
const legacyResult = parseLegacyFactualityOutput(output);
|
|
1859
|
-
return "failure" in legacyResult ? fail(legacyResult.failure, tokensUsed) : buildFactualityCategoryResult(legacyResult.category, legacyResult.reason, grading, tokensUsed);
|
|
1860
|
-
}
|
|
1861
|
-
async function matchesFactuality(input, expected, output, grading, vars, providerCallContext) {
|
|
1862
|
-
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
1863
|
-
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, PROMPTFOO_FACTUALITY_PROMPT), {
|
|
1864
|
-
input,
|
|
1865
|
-
ideal: expected,
|
|
1866
|
-
completion: tryParse(output),
|
|
1867
|
-
...vars || {}
|
|
1868
|
-
});
|
|
1869
|
-
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "factuality check"), prompt, "factuality", {
|
|
1559
|
+
async function matchesFactuality(input, expected, output, grading, vars, providerCallContext) {
|
|
1560
|
+
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
1561
|
+
const templateVars = {
|
|
1870
1562
|
input,
|
|
1871
1563
|
ideal: expected,
|
|
1872
1564
|
completion: tryParse(output),
|
|
1873
1565
|
...vars || {}
|
|
1874
|
-
}
|
|
1566
|
+
};
|
|
1567
|
+
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, PROMPTFOO_FACTUALITY_PROMPT), templateVars);
|
|
1568
|
+
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "factuality check"), prompt, "factuality", templateVars, providerCallContext);
|
|
1875
1569
|
if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
|
|
1876
1570
|
require_invariant.invariant(typeof resp.output === "string", "factuality produced malformed response");
|
|
1877
|
-
|
|
1571
|
+
try {
|
|
1572
|
+
const parsedJson = parseFactualityJsonResponse(resp.output);
|
|
1573
|
+
if (parsedJson) return buildFactualityResult(parsedJson.option, parsedJson.reason, grading, resp);
|
|
1574
|
+
} catch (err) {
|
|
1575
|
+
return fail(err.message, resp.tokenUsage);
|
|
1576
|
+
}
|
|
1577
|
+
require_logger.logger.info("Falling back to legacy pattern matching for factuality check");
|
|
1578
|
+
try {
|
|
1579
|
+
const parsedLegacy = parseLegacyFactualityResponse(resp.output);
|
|
1580
|
+
return buildFactualityResult(parsedLegacy.option, parsedLegacy.reason, grading, resp);
|
|
1581
|
+
} catch (err) {
|
|
1582
|
+
return fail(err.message, resp.tokenUsage);
|
|
1583
|
+
}
|
|
1878
1584
|
}
|
|
1879
1585
|
async function matchesClosedQa(input, expected, output, grading, vars, providerCallContext) {
|
|
1880
1586
|
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
1881
|
-
const
|
|
1882
|
-
input,
|
|
1883
|
-
criteria: expected,
|
|
1884
|
-
completion: tryParse(output),
|
|
1885
|
-
...vars || {}
|
|
1886
|
-
});
|
|
1887
|
-
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "model-graded-closedqa check"), prompt, "model-graded-closedqa", {
|
|
1587
|
+
const templateVars = {
|
|
1888
1588
|
input,
|
|
1889
1589
|
criteria: expected,
|
|
1890
1590
|
completion: tryParse(output),
|
|
1891
1591
|
...vars || {}
|
|
1892
|
-
}
|
|
1592
|
+
};
|
|
1593
|
+
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, OPENAI_CLOSED_QA_PROMPT), templateVars);
|
|
1594
|
+
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "model-graded-closedqa check"), prompt, "model-graded-closedqa", templateVars, providerCallContext);
|
|
1893
1595
|
if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
|
|
1894
1596
|
require_invariant.invariant(typeof resp.output === "string", "model-graded-closedqa produced malformed response");
|
|
1895
1597
|
try {
|
|
@@ -1902,18 +1604,7 @@ async function matchesClosedQa(input, expected, output, grading, vars, providerC
|
|
|
1902
1604
|
pass,
|
|
1903
1605
|
score: pass ? 1 : 0,
|
|
1904
1606
|
reason,
|
|
1905
|
-
tokensUsed:
|
|
1906
|
-
total: resp.tokenUsage?.total || 0,
|
|
1907
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
1908
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
1909
|
-
cached: resp.tokenUsage?.cached || 0,
|
|
1910
|
-
numRequests: resp.tokenUsage?.numRequests || 0,
|
|
1911
|
-
completionDetails: resp.tokenUsage?.completionDetails || {
|
|
1912
|
-
reasoning: 0,
|
|
1913
|
-
acceptedPrediction: 0,
|
|
1914
|
-
rejectedPrediction: 0
|
|
1915
|
-
}
|
|
1916
|
-
}
|
|
1607
|
+
tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage)
|
|
1917
1608
|
};
|
|
1918
1609
|
} catch (err) {
|
|
1919
1610
|
return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
@@ -1923,490 +1614,51 @@ async function matchesGEval(criteria, input, output, threshold, grading, provide
|
|
|
1923
1614
|
if (!input) throw Error("No source text to estimate reply");
|
|
1924
1615
|
const maxScore = 10;
|
|
1925
1616
|
const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "reply geval check");
|
|
1926
|
-
const tokensUsed =
|
|
1927
|
-
total: 0,
|
|
1928
|
-
prompt: 0,
|
|
1929
|
-
completion: 0,
|
|
1930
|
-
cached: 0,
|
|
1931
|
-
numRequests: 0,
|
|
1932
|
-
completionDetails: {
|
|
1933
|
-
reasoning: 0,
|
|
1934
|
-
acceptedPrediction: 0,
|
|
1935
|
-
rejectedPrediction: 0
|
|
1936
|
-
}
|
|
1937
|
-
};
|
|
1617
|
+
const tokensUsed = normalizeMatcherTokenUsage(void 0);
|
|
1938
1618
|
const respSteps = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(typeof grading?.rubricPrompt === "object" && !Array.isArray(grading?.rubricPrompt) ? grading?.rubricPrompt?.["steps"] : void 0, GEVAL_PROMPT_STEPS), { criteria }), "g-eval-steps", { criteria }, providerCallContext);
|
|
1939
|
-
|
|
1619
|
+
require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, respSteps.tokenUsage);
|
|
1620
|
+
if (respSteps.error) return fail(respSteps.error, tokensUsed);
|
|
1621
|
+
if (!respSteps.output) return fail("No output", tokensUsed);
|
|
1622
|
+
if (typeof respSteps.output !== "string") return fail("LLM-proposed evaluation steps response is not a string", tokensUsed);
|
|
1940
1623
|
let steps;
|
|
1941
1624
|
try {
|
|
1942
|
-
|
|
1625
|
+
const stepsMatch = respSteps.output.match(/\{"steps".+\}/g);
|
|
1626
|
+
if (!stepsMatch) return fail(`LLM-proposed evaluation steps are not in JSON format: ${respSteps.output}`, tokensUsed);
|
|
1627
|
+
steps = JSON.parse(stepsMatch[0]).steps;
|
|
1943
1628
|
if (!steps.length) return fail("LLM does not propose any evaluation step", tokensUsed);
|
|
1944
|
-
} catch {
|
|
1945
|
-
return fail(`LLM-proposed evaluation steps are not in JSON format: ${respSteps.output}`, tokensUsed);
|
|
1629
|
+
} catch (err) {
|
|
1630
|
+
return fail(`LLM-proposed evaluation steps are not in JSON format: ${err.message}\n\n${respSteps.output}`, tokensUsed);
|
|
1946
1631
|
}
|
|
1947
|
-
const
|
|
1948
|
-
|
|
1949
|
-
steps: steps.join("\n- "),
|
|
1950
|
-
maxScore: maxScore.toString(),
|
|
1951
|
-
input: tryParse(input),
|
|
1952
|
-
output: tryParse(output)
|
|
1953
|
-
}), "g-eval", {
|
|
1632
|
+
const evalPrompt = await loadRubricPrompt(typeof grading?.rubricPrompt === "object" && !Array.isArray(grading?.rubricPrompt) ? grading?.rubricPrompt?.["evaluate"] : void 0, GEVAL_PROMPT_EVALUATE);
|
|
1633
|
+
const evalVars = {
|
|
1954
1634
|
criteria,
|
|
1955
1635
|
steps: steps.join("\n- "),
|
|
1956
1636
|
maxScore: maxScore.toString(),
|
|
1957
1637
|
input: tryParse(input),
|
|
1958
1638
|
output: tryParse(output)
|
|
1959
|
-
}
|
|
1960
|
-
|
|
1639
|
+
};
|
|
1640
|
+
const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(evalPrompt, evalVars), "g-eval", evalVars, providerCallContext);
|
|
1641
|
+
require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, resp.tokenUsage);
|
|
1642
|
+
if (resp.error) return fail(resp.error, tokensUsed);
|
|
1643
|
+
if (!resp.output) return fail("No output", tokensUsed);
|
|
1644
|
+
if (typeof resp.output !== "string") return fail("LLM-proposed evaluation result response is not a string", tokensUsed);
|
|
1961
1645
|
let result;
|
|
1962
1646
|
try {
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1647
|
+
const resultMatch = resp.output.match(/\{.+\}/g);
|
|
1648
|
+
if (!resultMatch) return fail(`LLM-proposed evaluation result is not in JSON format: ${resp.output}`, tokensUsed);
|
|
1649
|
+
result = JSON.parse(resultMatch[0]);
|
|
1650
|
+
} catch (err) {
|
|
1651
|
+
return fail(`LLM-proposed evaluation result is not in JSON format: ${err.message}\n\n${resp.output}`, tokensUsed);
|
|
1966
1652
|
}
|
|
1653
|
+
const rawScore = typeof result.score === "number" ? result.score : Number(result.score);
|
|
1654
|
+
if (!Number.isFinite(rawScore)) return fail(`G-Eval result has invalid or missing score: ${JSON.stringify(result.score)}`, tokensUsed);
|
|
1967
1655
|
return {
|
|
1968
|
-
pass:
|
|
1969
|
-
score:
|
|
1656
|
+
pass: rawScore / maxScore >= threshold,
|
|
1657
|
+
score: rawScore / maxScore,
|
|
1970
1658
|
reason: result.reason,
|
|
1971
1659
|
tokensUsed
|
|
1972
1660
|
};
|
|
1973
1661
|
}
|
|
1974
|
-
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
|
|
1975
|
-
const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, (await getDefaultProviders()).embeddingProvider, "answer relevancy check");
|
|
1976
|
-
const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "answer relevancy check");
|
|
1977
|
-
const tokensUsed = {
|
|
1978
|
-
total: 0,
|
|
1979
|
-
prompt: 0,
|
|
1980
|
-
completion: 0,
|
|
1981
|
-
cached: 0,
|
|
1982
|
-
numRequests: 0,
|
|
1983
|
-
completionDetails: {
|
|
1984
|
-
reasoning: 0,
|
|
1985
|
-
acceptedPrediction: 0,
|
|
1986
|
-
rejectedPrediction: 0
|
|
1987
|
-
}
|
|
1988
|
-
};
|
|
1989
|
-
const candidateQuestions = [];
|
|
1990
|
-
for (let i = 0; i < 3; i++) {
|
|
1991
|
-
const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE), { answer: tryParse(output) }), "answer-relevance", { answer: tryParse(output) }, providerCallContext);
|
|
1992
|
-
accumulateTokens(tokensUsed, resp.tokenUsage);
|
|
1993
|
-
if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
|
|
1994
|
-
require_invariant.invariant(typeof resp.output === "string", "answer relevancy check produced malformed response");
|
|
1995
|
-
candidateQuestions.push(resp.output);
|
|
1996
|
-
}
|
|
1997
|
-
require_invariant.invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id} must implement callEmbeddingApi for similarity check`);
|
|
1998
|
-
const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
|
|
1999
|
-
accumulateTokens(tokensUsed, inputEmbeddingResp.tokenUsage);
|
|
2000
|
-
if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
|
|
2001
|
-
const inputEmbedding = inputEmbeddingResp.embedding;
|
|
2002
|
-
const similarities = [];
|
|
2003
|
-
const questionsWithScores = [];
|
|
2004
|
-
for (const question of candidateQuestions) {
|
|
2005
|
-
const resp = await embeddingProvider.callEmbeddingApi(question);
|
|
2006
|
-
accumulateTokens(tokensUsed, resp.tokenUsage);
|
|
2007
|
-
if (resp.error || !resp.embedding) return fail(resp.error || "No embedding", tokensUsed);
|
|
2008
|
-
const questionSimilarity = cosineSimilarity(inputEmbedding, resp.embedding);
|
|
2009
|
-
similarities.push(questionSimilarity);
|
|
2010
|
-
questionsWithScores.push({
|
|
2011
|
-
question,
|
|
2012
|
-
similarity: questionSimilarity
|
|
2013
|
-
});
|
|
2014
|
-
}
|
|
2015
|
-
const similarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
|
|
2016
|
-
const pass = similarity >= threshold - Number.EPSILON;
|
|
2017
|
-
const greaterThanReason = `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`;
|
|
2018
|
-
const lessThanReason = `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
|
|
2019
|
-
const metadata = {
|
|
2020
|
-
generatedQuestions: questionsWithScores,
|
|
2021
|
-
averageSimilarity: similarity,
|
|
2022
|
-
threshold
|
|
2023
|
-
};
|
|
2024
|
-
if (pass) return {
|
|
2025
|
-
pass: true,
|
|
2026
|
-
score: similarity,
|
|
2027
|
-
reason: greaterThanReason,
|
|
2028
|
-
tokensUsed,
|
|
2029
|
-
metadata
|
|
2030
|
-
};
|
|
2031
|
-
return {
|
|
2032
|
-
pass: false,
|
|
2033
|
-
score: similarity,
|
|
2034
|
-
reason: lessThanReason,
|
|
2035
|
-
tokensUsed,
|
|
2036
|
-
metadata
|
|
2037
|
-
};
|
|
2038
|
-
}
|
|
2039
|
-
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
|
|
2040
|
-
const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "context recall check");
|
|
2041
|
-
const contextString = serializeContext(context);
|
|
2042
|
-
const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL), {
|
|
2043
|
-
context: contextString,
|
|
2044
|
-
groundTruth,
|
|
2045
|
-
...vars || {}
|
|
2046
|
-
}), "context-recall", {
|
|
2047
|
-
context: contextString,
|
|
2048
|
-
groundTruth,
|
|
2049
|
-
...vars || {}
|
|
2050
|
-
}, providerCallContext);
|
|
2051
|
-
if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
|
|
2052
|
-
require_invariant.invariant(typeof resp.output === "string", "context-recall produced malformed response");
|
|
2053
|
-
const attributedTokenLower = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
|
|
2054
|
-
const notAttributedTokenLower = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
|
|
2055
|
-
const sentences = splitIntoSentences(resp.output).filter((line) => {
|
|
2056
|
-
const lowerLine = line.toLowerCase();
|
|
2057
|
-
return lowerLine.includes(attributedTokenLower) || lowerLine.includes(notAttributedTokenLower);
|
|
2058
|
-
});
|
|
2059
|
-
const sentenceAttributions = [];
|
|
2060
|
-
let numerator = 0;
|
|
2061
|
-
for (const sentence of sentences) {
|
|
2062
|
-
const isAttributed = sentence.toLowerCase().includes(attributedTokenLower);
|
|
2063
|
-
if (isAttributed) numerator++;
|
|
2064
|
-
const sentenceMatch = sentence.match(/^\d+\.\s*([^\.]+\.)/);
|
|
2065
|
-
const cleanSentence = sentenceMatch ? sentenceMatch[1].trim() : sentence.split(".")[0].trim();
|
|
2066
|
-
sentenceAttributions.push({
|
|
2067
|
-
sentence: cleanSentence,
|
|
2068
|
-
attributed: isAttributed
|
|
2069
|
-
});
|
|
2070
|
-
}
|
|
2071
|
-
const score = sentences.length > 0 ? numerator / sentences.length : 0;
|
|
2072
|
-
const pass = score >= threshold - Number.EPSILON;
|
|
2073
|
-
const metadata = {
|
|
2074
|
-
sentenceAttributions,
|
|
2075
|
-
totalSentences: sentences.length,
|
|
2076
|
-
attributedSentences: numerator,
|
|
2077
|
-
score
|
|
2078
|
-
};
|
|
2079
|
-
return {
|
|
2080
|
-
pass,
|
|
2081
|
-
score,
|
|
2082
|
-
reason: pass ? `Recall ${score.toFixed(2)} is >= ${threshold}` : `Recall ${score.toFixed(2)} is < ${threshold}`,
|
|
2083
|
-
tokensUsed: {
|
|
2084
|
-
total: resp.tokenUsage?.total || 0,
|
|
2085
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
2086
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
2087
|
-
cached: resp.tokenUsage?.cached || 0,
|
|
2088
|
-
numRequests: resp.tokenUsage?.numRequests || 0,
|
|
2089
|
-
completionDetails: resp.tokenUsage?.completionDetails || {
|
|
2090
|
-
reasoning: 0,
|
|
2091
|
-
acceptedPrediction: 0,
|
|
2092
|
-
rejectedPrediction: 0
|
|
2093
|
-
}
|
|
2094
|
-
},
|
|
2095
|
-
metadata
|
|
2096
|
-
};
|
|
2097
|
-
}
|
|
2098
|
-
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
|
|
2099
|
-
const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "context relevance check");
|
|
2100
|
-
const contextString = serializeContext(context);
|
|
2101
|
-
const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE), {
|
|
2102
|
-
context: contextString,
|
|
2103
|
-
query: question
|
|
2104
|
-
}), "context-relevance", {
|
|
2105
|
-
context: contextString,
|
|
2106
|
-
query: question
|
|
2107
|
-
}, providerCallContext);
|
|
2108
|
-
if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
|
|
2109
|
-
require_invariant.invariant(typeof resp.output === "string", "context-relevance produced malformed response");
|
|
2110
|
-
const contextUnits = Array.isArray(context) ? context.filter((chunk) => chunk.trim().length > 0) : splitIntoSentences(context);
|
|
2111
|
-
const totalContextUnits = contextUnits.length;
|
|
2112
|
-
const extractedSentences = splitIntoSentences(resp.output);
|
|
2113
|
-
const relevantSentences = [];
|
|
2114
|
-
const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
|
|
2115
|
-
let numerator = 0;
|
|
2116
|
-
if (insufficientInformation) numerator = 0;
|
|
2117
|
-
else {
|
|
2118
|
-
numerator = extractedSentences.length;
|
|
2119
|
-
relevantSentences.push(...extractedSentences);
|
|
2120
|
-
}
|
|
2121
|
-
const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
|
|
2122
|
-
const pass = score >= threshold - Number.EPSILON;
|
|
2123
|
-
const metadata = {
|
|
2124
|
-
extractedSentences: relevantSentences,
|
|
2125
|
-
totalContextUnits,
|
|
2126
|
-
totalContextSentences: totalContextUnits,
|
|
2127
|
-
contextUnits,
|
|
2128
|
-
relevantSentenceCount: numerator,
|
|
2129
|
-
insufficientInformation,
|
|
2130
|
-
score
|
|
2131
|
-
};
|
|
2132
|
-
return {
|
|
2133
|
-
pass,
|
|
2134
|
-
score,
|
|
2135
|
-
reason: pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`,
|
|
2136
|
-
tokensUsed: {
|
|
2137
|
-
total: resp.tokenUsage?.total || 0,
|
|
2138
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
2139
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
2140
|
-
cached: resp.tokenUsage?.cached || 0,
|
|
2141
|
-
numRequests: resp.tokenUsage?.numRequests || 0,
|
|
2142
|
-
completionDetails: resp.tokenUsage?.completionDetails || {
|
|
2143
|
-
reasoning: 0,
|
|
2144
|
-
acceptedPrediction: 0,
|
|
2145
|
-
rejectedPrediction: 0
|
|
2146
|
-
}
|
|
2147
|
-
},
|
|
2148
|
-
metadata
|
|
2149
|
-
};
|
|
2150
|
-
}
|
|
2151
|
-
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
|
|
2152
|
-
const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "faithfulness check");
|
|
2153
|
-
const tokensUsed = {
|
|
2154
|
-
total: 0,
|
|
2155
|
-
prompt: 0,
|
|
2156
|
-
completion: 0,
|
|
2157
|
-
cached: 0,
|
|
2158
|
-
numRequests: 0,
|
|
2159
|
-
completionDetails: {
|
|
2160
|
-
reasoning: 0,
|
|
2161
|
-
acceptedPrediction: 0,
|
|
2162
|
-
rejectedPrediction: 0
|
|
2163
|
-
}
|
|
2164
|
-
};
|
|
2165
|
-
if (grading?.rubricPrompt) require_invariant.invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
|
|
2166
|
-
const rawLongformPrompt = typeof grading?.rubricPrompt?.[0] === "string" ? grading?.rubricPrompt?.[0] : grading?.rubricPrompt?.[0]?.content;
|
|
2167
|
-
const rawNliPrompt = typeof grading?.rubricPrompt?.[1] === "string" ? grading?.rubricPrompt?.[1] : grading?.rubricPrompt?.[1]?.content;
|
|
2168
|
-
const longformPrompt = await loadRubricPrompt(rawLongformPrompt, CONTEXT_FAITHFULNESS_LONGFORM);
|
|
2169
|
-
const nliPrompt = await loadRubricPrompt(rawNliPrompt, CONTEXT_FAITHFULNESS_NLI_STATEMENTS);
|
|
2170
|
-
let promptText = await renderLlmRubricPrompt(longformPrompt, {
|
|
2171
|
-
question: query,
|
|
2172
|
-
answer: tryParse(output),
|
|
2173
|
-
...vars || {}
|
|
2174
|
-
});
|
|
2175
|
-
let resp = await callProviderWithContext(textProvider, promptText, "context-faithfulness-longform", {
|
|
2176
|
-
question: query,
|
|
2177
|
-
answer: tryParse(output),
|
|
2178
|
-
...vars || {}
|
|
2179
|
-
}, providerCallContext);
|
|
2180
|
-
accumulateTokens(tokensUsed, resp.tokenUsage);
|
|
2181
|
-
if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
|
|
2182
|
-
require_invariant.invariant(typeof resp.output === "string", "context-faithfulness produced malformed response");
|
|
2183
|
-
const contextString = serializeContext(context);
|
|
2184
|
-
const statements = splitIntoSentences(resp.output);
|
|
2185
|
-
promptText = await renderLlmRubricPrompt(nliPrompt, {
|
|
2186
|
-
context: contextString,
|
|
2187
|
-
statements,
|
|
2188
|
-
...vars || {}
|
|
2189
|
-
});
|
|
2190
|
-
resp = await callProviderWithContext(textProvider, promptText, "context-faithfulness-nli", {
|
|
2191
|
-
context: contextString,
|
|
2192
|
-
statements,
|
|
2193
|
-
...vars || {}
|
|
2194
|
-
}, providerCallContext);
|
|
2195
|
-
accumulateTokens(tokensUsed, resp.tokenUsage);
|
|
2196
|
-
if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
|
|
2197
|
-
require_invariant.invariant(typeof resp.output === "string", "context-faithfulness produced malformed response");
|
|
2198
|
-
let finalAnswer = "Final verdict for each statement in order:";
|
|
2199
|
-
finalAnswer = finalAnswer.toLowerCase();
|
|
2200
|
-
let verdicts = resp.output.toLowerCase().trim();
|
|
2201
|
-
let score = 0;
|
|
2202
|
-
if (statements.length > 0) if (verdicts.includes(finalAnswer)) {
|
|
2203
|
-
verdicts = verdicts.slice(verdicts.indexOf(finalAnswer) + finalAnswer.length);
|
|
2204
|
-
const parsedVerdicts = verdicts.split(".").filter((answer) => answer.trim() !== "");
|
|
2205
|
-
if (parsedVerdicts.length > 0) score = 1 - parsedVerdicts.filter((answer) => !answer.includes("yes")).length / statements.length;
|
|
2206
|
-
} else {
|
|
2207
|
-
const noVerdictCount = verdicts.split("verdict: no").length - 1;
|
|
2208
|
-
if (noVerdictCount + (verdicts.split("verdict: yes").length - 1) > 0) score = 1 - noVerdictCount / statements.length;
|
|
2209
|
-
}
|
|
2210
|
-
score = Math.min(1, Math.max(0, score));
|
|
2211
|
-
const pass = score >= threshold - Number.EPSILON;
|
|
2212
|
-
return {
|
|
2213
|
-
pass,
|
|
2214
|
-
score,
|
|
2215
|
-
reason: pass ? `Faithfulness ${score.toFixed(2)} is >= ${threshold}` : `Faithfulness ${score.toFixed(2)} is < ${threshold}`,
|
|
2216
|
-
tokensUsed
|
|
2217
|
-
};
|
|
2218
|
-
}
|
|
2219
|
-
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
|
|
2220
|
-
require_invariant.invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");
|
|
2221
|
-
const resp = await callProviderWithContext(await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "select-best check"), await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT), {
|
|
2222
|
-
criteria,
|
|
2223
|
-
outputs: outputs.map((o) => tryParse(o)),
|
|
2224
|
-
...vars || {}
|
|
2225
|
-
}), "select-best", {
|
|
2226
|
-
criteria,
|
|
2227
|
-
outputs: outputs.map((o) => tryParse(o)),
|
|
2228
|
-
...vars || {}
|
|
2229
|
-
}, providerCallContext);
|
|
2230
|
-
if (resp.error || !resp.output) return new Array(outputs.length).fill(fail(resp.error || "No output", resp.tokenUsage));
|
|
2231
|
-
require_invariant.invariant(typeof resp.output === "string", "select-best produced malformed response");
|
|
2232
|
-
const firstDigitMatch = resp.output.trim().match(/\d/);
|
|
2233
|
-
const verdict = firstDigitMatch ? Number.parseInt(firstDigitMatch[0], 10) : NaN;
|
|
2234
|
-
if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) return new Array(outputs.length).fill(fail(`Invalid select-best verdict: ${verdict}`));
|
|
2235
|
-
const tokensUsed = {
|
|
2236
|
-
total: resp.tokenUsage?.total || 0,
|
|
2237
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
2238
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
2239
|
-
cached: resp.tokenUsage?.cached || 0,
|
|
2240
|
-
numRequests: resp.tokenUsage?.numRequests || 0,
|
|
2241
|
-
completionDetails: resp.tokenUsage?.completionDetails || {
|
|
2242
|
-
reasoning: 0,
|
|
2243
|
-
acceptedPrediction: 0,
|
|
2244
|
-
rejectedPrediction: 0
|
|
2245
|
-
}
|
|
2246
|
-
};
|
|
2247
|
-
return outputs.map((_output, index) => {
|
|
2248
|
-
if (index === verdict) return {
|
|
2249
|
-
pass: true,
|
|
2250
|
-
score: 1,
|
|
2251
|
-
reason: `Output selected as the best: ${criteria}`,
|
|
2252
|
-
tokensUsed
|
|
2253
|
-
};
|
|
2254
|
-
else return {
|
|
2255
|
-
pass: false,
|
|
2256
|
-
score: 0,
|
|
2257
|
-
reason: `Output not selected: ${criteria}`,
|
|
2258
|
-
tokensUsed
|
|
2259
|
-
};
|
|
2260
|
-
});
|
|
2261
|
-
}
|
|
2262
|
-
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
|
|
2263
|
-
require_invariant.invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
|
|
2264
|
-
const value = assertion.value || {};
|
|
2265
|
-
const options = {
|
|
2266
|
-
method: typeof value === "object" && "method" in value ? value.method : "average",
|
|
2267
|
-
weights: typeof value === "object" && "weights" in value ? value.weights : {},
|
|
2268
|
-
threshold: typeof value === "object" && "threshold" in value ? value.threshold : void 0
|
|
2269
|
-
};
|
|
2270
|
-
const scores = resultsWithGradingResults.map((result, index) => {
|
|
2271
|
-
const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
|
|
2272
|
-
if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
|
|
2273
|
-
let totalWeightedScore = 0;
|
|
2274
|
-
let totalWeight = 0;
|
|
2275
|
-
relevantResults.forEach((componentResult) => {
|
|
2276
|
-
const assertionType = componentResult.assertion?.type || "unknown";
|
|
2277
|
-
const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
|
|
2278
|
-
const score = componentResult.score || 0;
|
|
2279
|
-
totalWeightedScore += score * weight;
|
|
2280
|
-
totalWeight += weight;
|
|
2281
|
-
});
|
|
2282
|
-
let aggregateScore;
|
|
2283
|
-
if (options.method === "sum") aggregateScore = totalWeightedScore;
|
|
2284
|
-
else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
|
|
2285
|
-
return {
|
|
2286
|
-
index,
|
|
2287
|
-
score: aggregateScore,
|
|
2288
|
-
componentCount: relevantResults.length,
|
|
2289
|
-
totalWeight
|
|
2290
|
-
};
|
|
2291
|
-
});
|
|
2292
|
-
let maxScore = -Infinity;
|
|
2293
|
-
let winnerIndex = 0;
|
|
2294
|
-
for (let i = 0; i < scores.length; i++) if (scores[i].score > maxScore) {
|
|
2295
|
-
maxScore = scores[i].score;
|
|
2296
|
-
winnerIndex = i;
|
|
2297
|
-
}
|
|
2298
|
-
const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
|
|
2299
|
-
return scores.map(({ index, score, componentCount, totalWeight }) => {
|
|
2300
|
-
const isWinner = index === winnerIndex && meetsThreshold;
|
|
2301
|
-
return {
|
|
2302
|
-
pass: isWinner,
|
|
2303
|
-
score: isWinner ? 1 : 0,
|
|
2304
|
-
reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
|
|
2305
|
-
namedScores: {
|
|
2306
|
-
maxScore: score,
|
|
2307
|
-
assertionCount: componentCount,
|
|
2308
|
-
totalWeight
|
|
2309
|
-
}
|
|
2310
|
-
};
|
|
2311
|
-
});
|
|
2312
|
-
}
|
|
2313
|
-
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
|
|
2314
|
-
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
2315
|
-
const defaultProviders = await getDefaultProviders();
|
|
2316
|
-
const defaultSearchProviders = [
|
|
2317
|
-
defaultProviders.webSearchProvider,
|
|
2318
|
-
defaultProviders.llmRubricProvider,
|
|
2319
|
-
defaultProviders.gradingProvider
|
|
2320
|
-
];
|
|
2321
|
-
let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
|
|
2322
|
-
if (!hasWebSearchCapability(searchProvider)) {
|
|
2323
|
-
const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
|
|
2324
|
-
if (webSearchDefault) searchProvider = webSearchDefault;
|
|
2325
|
-
}
|
|
2326
|
-
if (!hasWebSearchCapability(searchProvider)) {
|
|
2327
|
-
const webSearchProvider = await loadWebSearchProvider(true);
|
|
2328
|
-
if (webSearchProvider) searchProvider = webSearchProvider;
|
|
2329
|
-
}
|
|
2330
|
-
if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error("search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:claude-sonnet-4, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider");
|
|
2331
|
-
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
|
|
2332
|
-
output: tryParse(llmOutput),
|
|
2333
|
-
rubric,
|
|
2334
|
-
...vars || {}
|
|
2335
|
-
});
|
|
2336
|
-
const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
|
|
2337
|
-
output: tryParse(llmOutput),
|
|
2338
|
-
rubric,
|
|
2339
|
-
...vars || {}
|
|
2340
|
-
}, providerCallContext);
|
|
2341
|
-
if (resp.error || !resp.output) return {
|
|
2342
|
-
pass: false,
|
|
2343
|
-
score: 0,
|
|
2344
|
-
reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
|
|
2345
|
-
tokensUsed: resp.tokenUsage,
|
|
2346
|
-
assertion
|
|
2347
|
-
};
|
|
2348
|
-
try {
|
|
2349
|
-
const result = require_logger.extractFirstJsonObject(String(resp.output));
|
|
2350
|
-
let pass = result.pass ?? false;
|
|
2351
|
-
const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
|
|
2352
|
-
if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
|
|
2353
|
-
return {
|
|
2354
|
-
pass,
|
|
2355
|
-
score,
|
|
2356
|
-
reason: result.reason || "No reason provided",
|
|
2357
|
-
tokensUsed: resp.tokenUsage,
|
|
2358
|
-
assertion,
|
|
2359
|
-
metadata: {
|
|
2360
|
-
searchResults: result.searchResults || [],
|
|
2361
|
-
searchProvider: searchProvider.id()
|
|
2362
|
-
}
|
|
2363
|
-
};
|
|
2364
|
-
} catch {
|
|
2365
|
-
const outputLower = String(resp.output).toLowerCase();
|
|
2366
|
-
const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
|
|
2367
|
-
return {
|
|
2368
|
-
pass,
|
|
2369
|
-
score: pass ? 1 : 0,
|
|
2370
|
-
reason: resp.output,
|
|
2371
|
-
tokensUsed: resp.tokenUsage,
|
|
2372
|
-
assertion
|
|
2373
|
-
};
|
|
2374
|
-
}
|
|
2375
|
-
}
|
|
2376
|
-
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
|
|
2377
|
-
if (!assistantResponse) return {
|
|
2378
|
-
pass: true,
|
|
2379
|
-
score: 1,
|
|
2380
|
-
reason: "No output to moderate"
|
|
2381
|
-
};
|
|
2382
|
-
const defaultProviders = await getDefaultProviders();
|
|
2383
|
-
const defaultModerationProvider = !require_logger.getEnvString("OPENAI_API_KEY") && (require_logger.getEnvString("REPLICATE_API_KEY") || require_logger.getEnvString("REPLICATE_API_TOKEN")) ? await require_providers.loadApiProvider(require_types.LLAMA_GUARD_REPLICATE_PROVIDER) : defaultProviders.moderationProvider;
|
|
2384
|
-
const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
|
|
2385
|
-
require_invariant.invariant(moderationProvider, "Moderation provider must be defined");
|
|
2386
|
-
const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
|
|
2387
|
-
if (resp.error) return {
|
|
2388
|
-
pass: false,
|
|
2389
|
-
score: 0,
|
|
2390
|
-
reason: `Moderation API error: ${resp.error}`
|
|
2391
|
-
};
|
|
2392
|
-
const { flags } = resp;
|
|
2393
|
-
if (!flags || flags.length === 0) return {
|
|
2394
|
-
pass: true,
|
|
2395
|
-
score: 1,
|
|
2396
|
-
reason: "No moderation flags detected"
|
|
2397
|
-
};
|
|
2398
|
-
const filteredFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
|
|
2399
|
-
if (filteredFlags.length > 0) return {
|
|
2400
|
-
pass: false,
|
|
2401
|
-
score: 0,
|
|
2402
|
-
reason: `Moderation flags detected: ${filteredFlags.map((flag) => flag.description).join(", ")}`
|
|
2403
|
-
};
|
|
2404
|
-
return {
|
|
2405
|
-
pass: true,
|
|
2406
|
-
score: 1,
|
|
2407
|
-
reason: "No relevant moderation flags detected"
|
|
2408
|
-
};
|
|
2409
|
-
}
|
|
2410
1662
|
//#endregion
|
|
2411
1663
|
//#region src/integrations/huggingfaceDatasets.ts
|
|
2412
1664
|
/**
|
|
@@ -2992,7 +2244,7 @@ var RedteamPluginBase = class RedteamPluginBase {
|
|
|
2992
2244
|
const rejectedPromptLengths = [];
|
|
2993
2245
|
let rejectedPromptLimit;
|
|
2994
2246
|
for (const prompt of parsedPrompts) {
|
|
2995
|
-
const violation = require_providers.getGeneratedPromptOverLimit("__prompt" in prompt ?
|
|
2247
|
+
const violation = require_providers.getGeneratedPromptOverLimit("__prompt" in prompt ? prompt.__prompt : JSON.stringify(prompt), this.config.maxCharsPerMessage);
|
|
2996
2248
|
if (violation) {
|
|
2997
2249
|
rejectedPromptLengths.push(violation.length);
|
|
2998
2250
|
rejectedPromptLimit = violation.limit;
|
|
@@ -3156,10 +2408,17 @@ var RedteamGraderBase = class {
|
|
|
3156
2408
|
},
|
|
3157
2409
|
rubric: finalRubric
|
|
3158
2410
|
};
|
|
3159
|
-
const
|
|
2411
|
+
const defaultTest = typeof require_logger.state.config?.defaultTest === "object" ? require_logger.state.config.defaultTest : void 0;
|
|
2412
|
+
const hasConfiguredGradingProvider = Boolean(require_logger.state.config?.redteam?.provider || defaultTest?.options?.provider);
|
|
2413
|
+
const grading = {
|
|
3160
2414
|
...test.options,
|
|
3161
2415
|
provider: await require_providers.redteamProviderManager.getGradingProvider({ jsonOnly: true })
|
|
3162
|
-
}
|
|
2416
|
+
};
|
|
2417
|
+
if (!hasConfiguredGradingProvider) {
|
|
2418
|
+
Object.defineProperty(grading, "__promptfooPreferRemote", { value: true });
|
|
2419
|
+
require_logger.logger.debug("[Redteam] No configured grading provider detected, preferring remote grading");
|
|
2420
|
+
}
|
|
2421
|
+
const grade = await matchesLlmRubric(finalRubric, llmOutput, grading);
|
|
3163
2422
|
require_logger.logger.debug(`Redteam grading result for ${this.id}: - ${JSON.stringify(grade)}`);
|
|
3164
2423
|
let suggestions;
|
|
3165
2424
|
if (!grade.pass) suggestions = this.getSuggestions({
|
|
@@ -15978,6 +15237,12 @@ function getGraderById(id) {
|
|
|
15978
15237
|
return grader;
|
|
15979
15238
|
}
|
|
15980
15239
|
//#endregion
|
|
15240
|
+
Object.defineProperty(exports, "ANSWER_RELEVANCY_GENERATE", {
|
|
15241
|
+
enumerable: true,
|
|
15242
|
+
get: function() {
|
|
15243
|
+
return ANSWER_RELEVANCY_GENERATE;
|
|
15244
|
+
}
|
|
15245
|
+
});
|
|
15981
15246
|
Object.defineProperty(exports, "AegisPlugin", {
|
|
15982
15247
|
enumerable: true,
|
|
15983
15248
|
get: function() {
|
|
@@ -15990,6 +15255,48 @@ Object.defineProperty(exports, "BeavertailsPlugin", {
|
|
|
15990
15255
|
return BeavertailsPlugin;
|
|
15991
15256
|
}
|
|
15992
15257
|
});
|
|
15258
|
+
Object.defineProperty(exports, "CONTEXT_FAITHFULNESS_LONGFORM", {
|
|
15259
|
+
enumerable: true,
|
|
15260
|
+
get: function() {
|
|
15261
|
+
return CONTEXT_FAITHFULNESS_LONGFORM;
|
|
15262
|
+
}
|
|
15263
|
+
});
|
|
15264
|
+
Object.defineProperty(exports, "CONTEXT_FAITHFULNESS_NLI_STATEMENTS", {
|
|
15265
|
+
enumerable: true,
|
|
15266
|
+
get: function() {
|
|
15267
|
+
return CONTEXT_FAITHFULNESS_NLI_STATEMENTS;
|
|
15268
|
+
}
|
|
15269
|
+
});
|
|
15270
|
+
Object.defineProperty(exports, "CONTEXT_RECALL", {
|
|
15271
|
+
enumerable: true,
|
|
15272
|
+
get: function() {
|
|
15273
|
+
return CONTEXT_RECALL;
|
|
15274
|
+
}
|
|
15275
|
+
});
|
|
15276
|
+
Object.defineProperty(exports, "CONTEXT_RECALL_ATTRIBUTED_TOKEN", {
|
|
15277
|
+
enumerable: true,
|
|
15278
|
+
get: function() {
|
|
15279
|
+
return CONTEXT_RECALL_ATTRIBUTED_TOKEN;
|
|
15280
|
+
}
|
|
15281
|
+
});
|
|
15282
|
+
Object.defineProperty(exports, "CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN", {
|
|
15283
|
+
enumerable: true,
|
|
15284
|
+
get: function() {
|
|
15285
|
+
return CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN;
|
|
15286
|
+
}
|
|
15287
|
+
});
|
|
15288
|
+
Object.defineProperty(exports, "CONTEXT_RELEVANCE", {
|
|
15289
|
+
enumerable: true,
|
|
15290
|
+
get: function() {
|
|
15291
|
+
return CONTEXT_RELEVANCE;
|
|
15292
|
+
}
|
|
15293
|
+
});
|
|
15294
|
+
Object.defineProperty(exports, "CONTEXT_RELEVANCE_BAD", {
|
|
15295
|
+
enumerable: true,
|
|
15296
|
+
get: function() {
|
|
15297
|
+
return CONTEXT_RELEVANCE_BAD;
|
|
15298
|
+
}
|
|
15299
|
+
});
|
|
15993
15300
|
Object.defineProperty(exports, "ContractPlugin", {
|
|
15994
15301
|
enumerable: true,
|
|
15995
15302
|
get: function() {
|
|
@@ -16002,6 +15309,18 @@ Object.defineProperty(exports, "CrossSessionLeakPlugin", {
|
|
|
16002
15309
|
return CrossSessionLeakPlugin;
|
|
16003
15310
|
}
|
|
16004
15311
|
});
|
|
15312
|
+
Object.defineProperty(exports, "DEFAULT_ANTHROPIC_MODEL", {
|
|
15313
|
+
enumerable: true,
|
|
15314
|
+
get: function() {
|
|
15315
|
+
return DEFAULT_ANTHROPIC_MODEL;
|
|
15316
|
+
}
|
|
15317
|
+
});
|
|
15318
|
+
Object.defineProperty(exports, "DEFAULT_WEB_SEARCH_PROMPT", {
|
|
15319
|
+
enumerable: true,
|
|
15320
|
+
get: function() {
|
|
15321
|
+
return DEFAULT_WEB_SEARCH_PROMPT;
|
|
15322
|
+
}
|
|
15323
|
+
});
|
|
16005
15324
|
Object.defineProperty(exports, "DebugAccessPlugin", {
|
|
16006
15325
|
enumerable: true,
|
|
16007
15326
|
get: function() {
|
|
@@ -16098,6 +15417,12 @@ Object.defineProperty(exports, "RedteamPluginBase", {
|
|
|
16098
15417
|
return RedteamPluginBase;
|
|
16099
15418
|
}
|
|
16100
15419
|
});
|
|
15420
|
+
Object.defineProperty(exports, "SELECT_BEST_PROMPT", {
|
|
15421
|
+
enumerable: true,
|
|
15422
|
+
get: function() {
|
|
15423
|
+
return SELECT_BEST_PROMPT;
|
|
15424
|
+
}
|
|
15425
|
+
});
|
|
16101
15426
|
Object.defineProperty(exports, "SUGGEST_PROMPTS_SYSTEM_MESSAGE", {
|
|
16102
15427
|
enumerable: true,
|
|
16103
15428
|
get: function() {
|
|
@@ -16188,12 +15513,36 @@ Object.defineProperty(exports, "coerceString", {
|
|
|
16188
15513
|
return coerceString;
|
|
16189
15514
|
}
|
|
16190
15515
|
});
|
|
15516
|
+
Object.defineProperty(exports, "cosineSimilarity", {
|
|
15517
|
+
enumerable: true,
|
|
15518
|
+
get: function() {
|
|
15519
|
+
return cosineSimilarity;
|
|
15520
|
+
}
|
|
15521
|
+
});
|
|
16191
15522
|
Object.defineProperty(exports, "determinePolicyTypeFromId", {
|
|
16192
15523
|
enumerable: true,
|
|
16193
15524
|
get: function() {
|
|
16194
15525
|
return determinePolicyTypeFromId;
|
|
16195
15526
|
}
|
|
16196
15527
|
});
|
|
15528
|
+
Object.defineProperty(exports, "doRemoteGrading", {
|
|
15529
|
+
enumerable: true,
|
|
15530
|
+
get: function() {
|
|
15531
|
+
return doRemoteGrading;
|
|
15532
|
+
}
|
|
15533
|
+
});
|
|
15534
|
+
Object.defineProperty(exports, "dotProduct", {
|
|
15535
|
+
enumerable: true,
|
|
15536
|
+
get: function() {
|
|
15537
|
+
return dotProduct;
|
|
15538
|
+
}
|
|
15539
|
+
});
|
|
15540
|
+
Object.defineProperty(exports, "euclideanDistance", {
|
|
15541
|
+
enumerable: true,
|
|
15542
|
+
get: function() {
|
|
15543
|
+
return euclideanDistance;
|
|
15544
|
+
}
|
|
15545
|
+
});
|
|
16197
15546
|
Object.defineProperty(exports, "fail", {
|
|
16198
15547
|
enumerable: true,
|
|
16199
15548
|
get: function() {
|
|
@@ -16236,6 +15585,12 @@ Object.defineProperty(exports, "getGraderById", {
|
|
|
16236
15585
|
return getGraderById;
|
|
16237
15586
|
}
|
|
16238
15587
|
});
|
|
15588
|
+
Object.defineProperty(exports, "getGradingProvider", {
|
|
15589
|
+
enumerable: true,
|
|
15590
|
+
get: function() {
|
|
15591
|
+
return getGradingProvider;
|
|
15592
|
+
}
|
|
15593
|
+
});
|
|
16239
15594
|
Object.defineProperty(exports, "getPiiLeakTestsForCategory", {
|
|
16240
15595
|
enumerable: true,
|
|
16241
15596
|
get: function() {
|
|
@@ -16266,42 +15621,12 @@ Object.defineProperty(exports, "makeInlinePolicyIdSync", {
|
|
|
16266
15621
|
return makeInlinePolicyIdSync;
|
|
16267
15622
|
}
|
|
16268
15623
|
});
|
|
16269
|
-
Object.defineProperty(exports, "matchesAnswerRelevance", {
|
|
16270
|
-
enumerable: true,
|
|
16271
|
-
get: function() {
|
|
16272
|
-
return matchesAnswerRelevance;
|
|
16273
|
-
}
|
|
16274
|
-
});
|
|
16275
|
-
Object.defineProperty(exports, "matchesClassification", {
|
|
16276
|
-
enumerable: true,
|
|
16277
|
-
get: function() {
|
|
16278
|
-
return matchesClassification;
|
|
16279
|
-
}
|
|
16280
|
-
});
|
|
16281
15624
|
Object.defineProperty(exports, "matchesClosedQa", {
|
|
16282
15625
|
enumerable: true,
|
|
16283
15626
|
get: function() {
|
|
16284
15627
|
return matchesClosedQa;
|
|
16285
15628
|
}
|
|
16286
15629
|
});
|
|
16287
|
-
Object.defineProperty(exports, "matchesContextFaithfulness", {
|
|
16288
|
-
enumerable: true,
|
|
16289
|
-
get: function() {
|
|
16290
|
-
return matchesContextFaithfulness;
|
|
16291
|
-
}
|
|
16292
|
-
});
|
|
16293
|
-
Object.defineProperty(exports, "matchesContextRecall", {
|
|
16294
|
-
enumerable: true,
|
|
16295
|
-
get: function() {
|
|
16296
|
-
return matchesContextRecall;
|
|
16297
|
-
}
|
|
16298
|
-
});
|
|
16299
|
-
Object.defineProperty(exports, "matchesContextRelevance", {
|
|
16300
|
-
enumerable: true,
|
|
16301
|
-
get: function() {
|
|
16302
|
-
return matchesContextRelevance;
|
|
16303
|
-
}
|
|
16304
|
-
});
|
|
16305
15630
|
Object.defineProperty(exports, "matchesFactuality", {
|
|
16306
15631
|
enumerable: true,
|
|
16307
15632
|
get: function() {
|
|
@@ -16320,40 +15645,22 @@ Object.defineProperty(exports, "matchesLlmRubric", {
|
|
|
16320
15645
|
return matchesLlmRubric;
|
|
16321
15646
|
}
|
|
16322
15647
|
});
|
|
16323
|
-
Object.defineProperty(exports, "matchesModeration", {
|
|
16324
|
-
enumerable: true,
|
|
16325
|
-
get: function() {
|
|
16326
|
-
return matchesModeration;
|
|
16327
|
-
}
|
|
16328
|
-
});
|
|
16329
15648
|
Object.defineProperty(exports, "matchesPiScore", {
|
|
16330
15649
|
enumerable: true,
|
|
16331
15650
|
get: function() {
|
|
16332
15651
|
return matchesPiScore;
|
|
16333
15652
|
}
|
|
16334
15653
|
});
|
|
16335
|
-
Object.defineProperty(exports, "
|
|
16336
|
-
enumerable: true,
|
|
16337
|
-
get: function() {
|
|
16338
|
-
return matchesSearchRubric;
|
|
16339
|
-
}
|
|
16340
|
-
});
|
|
16341
|
-
Object.defineProperty(exports, "matchesSelectBest", {
|
|
16342
|
-
enumerable: true,
|
|
16343
|
-
get: function() {
|
|
16344
|
-
return matchesSelectBest;
|
|
16345
|
-
}
|
|
16346
|
-
});
|
|
16347
|
-
Object.defineProperty(exports, "matchesSimilarity", {
|
|
15654
|
+
Object.defineProperty(exports, "matchesTrajectoryGoalSuccess", {
|
|
16348
15655
|
enumerable: true,
|
|
16349
15656
|
get: function() {
|
|
16350
|
-
return
|
|
15657
|
+
return matchesTrajectoryGoalSuccess;
|
|
16351
15658
|
}
|
|
16352
15659
|
});
|
|
16353
|
-
Object.defineProperty(exports, "
|
|
15660
|
+
Object.defineProperty(exports, "normalizeMatcherTokenUsage", {
|
|
16354
15661
|
enumerable: true,
|
|
16355
15662
|
get: function() {
|
|
16356
|
-
return
|
|
15663
|
+
return normalizeMatcherTokenUsage;
|
|
16357
15664
|
}
|
|
16358
15665
|
});
|
|
16359
15666
|
Object.defineProperty(exports, "processFileReference", {
|
|
@@ -16380,10 +15687,10 @@ Object.defineProperty(exports, "readProviderPromptMap", {
|
|
|
16380
15687
|
return readProviderPromptMap;
|
|
16381
15688
|
}
|
|
16382
15689
|
});
|
|
16383
|
-
Object.defineProperty(exports, "
|
|
15690
|
+
Object.defineProperty(exports, "renderLlmRubricPrompt", {
|
|
16384
15691
|
enumerable: true,
|
|
16385
15692
|
get: function() {
|
|
16386
|
-
return
|
|
15693
|
+
return renderLlmRubricPrompt;
|
|
16387
15694
|
}
|
|
16388
15695
|
});
|
|
16389
15696
|
Object.defineProperty(exports, "retryWithDeduplication", {
|
|
@@ -16398,10 +15705,16 @@ Object.defineProperty(exports, "sampleArray", {
|
|
|
16398
15705
|
return sampleArray;
|
|
16399
15706
|
}
|
|
16400
15707
|
});
|
|
16401
|
-
Object.defineProperty(exports, "
|
|
15708
|
+
Object.defineProperty(exports, "splitIntoSentences", {
|
|
15709
|
+
enumerable: true,
|
|
15710
|
+
get: function() {
|
|
15711
|
+
return splitIntoSentences;
|
|
15712
|
+
}
|
|
15713
|
+
});
|
|
15714
|
+
Object.defineProperty(exports, "tryParse", {
|
|
16402
15715
|
enumerable: true,
|
|
16403
15716
|
get: function() {
|
|
16404
|
-
return
|
|
15717
|
+
return tryParse;
|
|
16405
15718
|
}
|
|
16406
15719
|
});
|
|
16407
15720
|
Object.defineProperty(exports, "withProviderCallExecutionContext", {
|
|
@@ -16411,4 +15724,4 @@ Object.defineProperty(exports, "withProviderCallExecutionContext", {
|
|
|
16411
15724
|
}
|
|
16412
15725
|
});
|
|
16413
15726
|
|
|
16414
|
-
//# sourceMappingURL=graders
|
|
15727
|
+
//# sourceMappingURL=graders-Bw1wk_21.cjs.map
|