promptfoo 0.121.4 → 0.121.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346)
  1. package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
  2. package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
  3. package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
  4. package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
  5. package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
  6. package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
  7. package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
  8. package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
  9. package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
  10. package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
  11. package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
  12. package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
  13. package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
  14. package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
  15. package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
  16. package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
  17. package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
  18. package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
  19. package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
  20. package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
  21. package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
  22. package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
  23. package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
  24. package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
  25. package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
  26. package/dist/src/app/index.html +27 -2
  27. package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
  28. package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
  29. package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
  30. package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
  31. package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
  32. package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
  33. package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
  34. package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
  35. package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
  36. package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
  37. package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
  38. package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
  39. package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
  40. package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
  41. package/dist/src/cache-BlC6aeJ0.js +3 -0
  42. package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
  43. package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
  44. package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
  45. package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
  46. package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
  47. package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
  48. package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
  49. package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
  50. package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
  51. package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
  52. package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
  53. package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
  54. package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
  55. package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
  56. package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
  57. package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
  58. package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
  59. package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
  60. package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
  61. package/dist/src/cloud-CoD5OacT.js +3 -0
  62. package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
  63. package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
  64. package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
  65. package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
  66. package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
  67. package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
  68. package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
  69. package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
  70. package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
  71. package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
  72. package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
  73. package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
  74. package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
  75. package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
  76. package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
  77. package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
  78. package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
  79. package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
  80. package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
  81. package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
  82. package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
  83. package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
  84. package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
  85. package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
  86. package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
  87. package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
  88. package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
  89. package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
  90. package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
  91. package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
  92. package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
  93. package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
  94. package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
  95. package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
  96. package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
  97. package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
  98. package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
  99. package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
  100. package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
  101. package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
  102. package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
  103. package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
  104. package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
  105. package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
  106. package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
  107. package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
  108. package/dist/src/evalResult-BBK58h2B.js +3 -0
  109. package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
  110. package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
  111. package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
  112. package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
  113. package/dist/src/evaluator-DgLKaZk8.js +3 -0
  114. package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
  115. package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
  116. package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
  117. package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
  118. package/dist/src/fetch-8viavNv8.js +3 -0
  119. package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
  120. package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
  121. package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
  122. package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
  123. package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
  124. package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
  125. package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
  126. package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
  127. package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
  128. package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
  129. package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
  130. package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
  131. package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
  132. package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
  133. package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
  134. package/dist/src/graders-C84JI-m5.js +2 -0
  135. package/dist/src/graders-CBbd0K0Q.cjs +2 -0
  136. package/dist/src/graders-CbQqpHSN.js +3 -0
  137. package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
  138. package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
  139. package/dist/src/graders-DS42d3ZG.js +2 -0
  140. package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
  141. package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
  142. package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
  143. package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
  144. package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
  145. package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
  146. package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
  147. package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
  148. package/dist/src/index.cjs +1052 -296
  149. package/dist/src/index.d.cts +124 -13
  150. package/dist/src/index.d.ts +125 -14
  151. package/dist/src/index.js +1018 -262
  152. package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
  153. package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
  154. package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
  155. package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
  156. package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
  157. package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
  158. package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
  159. package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
  160. package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
  161. package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
  162. package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
  163. package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
  164. package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
  165. package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
  166. package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
  167. package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
  168. package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
  169. package/dist/src/main.js +242 -198
  170. package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
  171. package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
  172. package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
  173. package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
  174. package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
  175. package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
  176. package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
  177. package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
  178. package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
  179. package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
  180. package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
  181. package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
  182. package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
  183. package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
  184. package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
  185. package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
  186. package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
  187. package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
  188. package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
  189. package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
  190. package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
  191. package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
  192. package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
  193. package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
  194. package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
  195. package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
  196. package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
  197. package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
  198. package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
  199. package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
  200. package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
  201. package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
  202. package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
  203. package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
  204. package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
  205. package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
  206. package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
  207. package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
  208. package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
  209. package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
  210. package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
  211. package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
  212. package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
  213. package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
  214. package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
  215. package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
  216. package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
  217. package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
  218. package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
  219. package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
  220. package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
  221. package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
  222. package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
  223. package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
  224. package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
  225. package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
  226. package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
  227. package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
  228. package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
  229. package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
  230. package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
  231. package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
  232. package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
  233. package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
  234. package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
  235. package/dist/src/rubyUtils-4hjGxvju.js +3 -0
  236. package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
  237. package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
  238. package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
  239. package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
  240. package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
  241. package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
  242. package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
  243. package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
  244. package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
  245. package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
  246. package/dist/src/server/index.js +1067 -265
  247. package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
  248. package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
  249. package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
  250. package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
  251. package/dist/src/server-C_15p79-.js +3 -0
  252. package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
  253. package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
  254. package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
  255. package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
  256. package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
  257. package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
  258. package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
  259. package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
  260. package/dist/src/store-BELqNwvz.js +3 -0
  261. package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
  262. package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
  263. package/dist/src/store-uQZ4AjPe.cjs +2 -0
  264. package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
  265. package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
  266. package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
  267. package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
  268. package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
  269. package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
  270. package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
  271. package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
  272. package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
  273. package/dist/src/telemetry-ZdPZc0fm.js +3 -0
  274. package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
  275. package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
  276. package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
  277. package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
  278. package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
  279. package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
  280. package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
  281. package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
  282. package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
  283. package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
  284. package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
  285. package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
  286. package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
  287. package/dist/src/transform-BQt0BeAW.js +3 -0
  288. package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
  289. package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
  290. package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
  291. package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
  292. package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
  293. package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
  294. package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
  295. package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
  296. package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
  297. package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
  298. package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
  299. package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
  300. package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
  301. package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
  302. package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
  303. package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
  304. package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
  305. package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
  306. package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
  307. package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
  308. package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
  309. package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
  310. package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
  311. package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
  312. package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
  313. package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
  314. package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
  315. package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
  316. package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
  317. package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
  318. package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
  319. package/dist/src/version-0frU0UTr.js +16 -0
  320. package/dist/src/version-CbpiUINz.js +17 -0
  321. package/dist/src/version-CbuBKu2U.js +16 -0
  322. package/dist/src/version-D9zu9FWB.cjs +27 -0
  323. package/dist/tsconfig.tsbuildinfo +1 -1
  324. package/package.json +22 -20
  325. package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
  326. package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
  327. package/dist/src/cache-Cr9oLMUa.js +0 -3
  328. package/dist/src/cloud-Hphvo8kr.js +0 -3
  329. package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
  330. package/dist/src/evalResult-D8MT9p0s.js +0 -3
  331. package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
  332. package/dist/src/evaluator-CVessDWe.js +0 -3
  333. package/dist/src/fetch-C7bGKDlQ.js +0 -3
  334. package/dist/src/graders-BOAzQEUe.cjs +0 -2
  335. package/dist/src/graders-D4BTsZdG2.js +0 -3
  336. package/dist/src/graders-DOJK1XpV.js +0 -2
  337. package/dist/src/graders-NAv9LcBn.js +0 -2
  338. package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
  339. package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
  340. package/dist/src/server-DCtHUqlp.js +0 -3
  341. package/dist/src/store-CWOSz6D_.cjs +0 -2
  342. package/dist/src/store-DCDBhv7B.js +0 -3
  343. package/dist/src/telemetry-C1IqxcdW.js +0 -3
  344. package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
  345. package/dist/src/transform-M6ITAESf.js +0 -3
  346. package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
@@ -1,351 +1,595 @@
1
1
  import { O as isCI, S as getEnvBool, T as getEnvString, a as logger, h as extractJsonObjects, k as state, m as extractFirstJsonObject, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
2
2
  import { t as invariant } from "./invariant-Ddh24eXh.js";
3
- import { r as importModule } from "./esm-C7PnfdF8.js";
4
- import { r as runPython } from "./pythonUtils-C2UQ30Rz.js";
5
- import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
6
- import { n as transform } from "./transform-Bbg6A8Jk.js";
7
- import { B as isValidReusablePolicyId, Dt as CODING_AGENT_PLUGIN_DISPLAY_NAMES, Et as CODING_AGENT_PLUGIN_DESCRIPTIONS, G as MULTI_TURN_STRATEGIES, Ot as PromptSchema, R as PolicyObjectSchema, Tt as CODING_AGENT_PLUGINS, dt as LLAMA_GUARD_REPLICATE_PROVIDER } from "./types-BGQDAP8i.js";
8
- import { a as getNunjucksEngineForFilePath, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, f as parsePathOrGlob, g as parseFileUrl, s as maybeLoadConfigFromExternalFile } from "./util-oGMLA7vc.js";
9
- import { a as getNunjucksEngine, r as extractVariablesFromTemplate } from "./render-CH-62LbA.js";
10
- import { d as sleep, h as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy } from "./fetch-Di00EQrc.js";
11
- import { a as isCacheEnabled, i as getCache, r as fetchWithCache } from "./cache-D5NZmMiT.js";
12
- import { F as getGeneratedPromptOverLimit, I as getMaxCharsPerMessageModifierValue, J as MistralChatCompletionProvider, M as isRateLimitWrapped, O as redteamProviderManager, P as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, R as REDTEAM_MEMORY_POISONING_PLUGIN_ID, S as removePrefix, W as getPoliciesFromCloud, X as DefaultEmbeddingProvider$2, Y as MistralEmbeddingProvider, Z as DefaultGradingProvider$3, _ as extractVariablesFromJson, at as AzureModerationProvider, b as isBasicRefusal, ct as getFileHashes, et as DefaultGradingJsonProvider$2, f as checkExfilTracking, g as extractPromptFromTags, h as extractInputVarsFromPrompt, it as DefaultSynthesizeProvider$1, j as createProviderRateLimitOptions, lt as parseScriptParts, m as extractGoalFromPrompt, n as loadApiProvider, nt as DefaultLlmRubricProvider, ot as AzureEmbeddingProvider, p as extractAllPromptsFromTags, q as OpenAiModerationProvider, rt as DefaultSuggestionsProvider$2, st as AzureChatCompletionProvider, tt as DefaultGradingProvider$2, x as isEmptyResponse, y as getShortPluginId } from "./providers-DruaQfwu.js";
13
- import { a as PROMPT_DELIMITER, n as maybeFilePath, r as normalizeInput } from "./utils-hXtCYanr.js";
14
- import { n as sha256 } from "./createHash-4gFQpDDv.js";
15
- import { t as OpenAiChatCompletionProvider } from "./chat-I9izLm49.js";
16
- import { r as accumulateTokenUsage } from "./tokenUsageUtils-C-bmyHoE.js";
17
- import { x as hasGoogleDefaultCredentials } from "./transform-CG0ehZNG.js";
18
- import { t as AnthropicMessagesProvider } from "./messages-BTQz42fn.js";
19
- import { t as OpenAiResponsesProvider } from "./responses--OsX2aYW.js";
20
- import { d as hasCodexDefaultCredentials, i as getRemoteGenerationUrl, l as shouldGenerateRemote, u as getCodexDefaultProviders } from "./server-D6Il2Sob.js";
21
- import { t as OpenAiEmbeddingProvider } from "./embedding-nFbumxcv.js";
22
- import { i as getUserEmail } from "./accounts-DhMYUUbu.js";
3
+ import { o as sleep, r as fetchWithTimeout, t as fetchWithProxy, u as REQUEST_TIMEOUT_MS } from "./fetch-D9xxyC1p.js";
4
+ import { Dt as CODING_AGENT_PLUGIN_DESCRIPTIONS, Et as CODING_AGENT_PLUGINS, K as MULTI_TURN_STRATEGIES, N as PromptSchema, Ot as CODING_AGENT_PLUGIN_DISPLAY_NAMES, V as isValidReusablePolicyId, z as PolicyObjectSchema } from "./types-DNRZVOue.js";
5
+ import { i as getUserEmail } from "./accounts-CaLNYnf7.js";
6
+ import { r as importModule } from "./esm-BRkfNsYs.js";
7
+ import { a as getNunjucksEngine, r as extractVariablesFromTemplate } from "./render-DlscvAUJ.js";
8
+ import { d as hasCodexDefaultCredentials, i as getRemoteGenerationUrl, l as shouldGenerateRemote, u as getCodexDefaultProviders } from "./server-gyd6d4Hc.js";
9
+ import { $ as getMaxCharsPerMessageModifierValue, A as extractAllPromptsFromTags, C as AzureEmbeddingProvider, G as redteamProviderManager, I as getShortPluginId, J as createProviderRateLimitOptions, L as isBasicRefusal, M as extractInputVarsFromPrompt, N as extractPromptFromTags, P as extractVariablesFromJson, Q as getGeneratedPromptOverLimit, R as isEmptyResponse, S as AzureModerationProvider, Y as isRateLimitWrapped, Z as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, _ as DefaultGradingJsonProvider$2, b as DefaultSuggestionsProvider$2, c as OpenAiModerationProvider, f as DefaultEmbeddingProvider$2, j as extractGoalFromPrompt, k as checkExfilTracking, l as MistralChatCompletionProvider, n as loadApiProvider, o as getFileHashes, p as DefaultGradingProvider$3, s as parseScriptParts, st as getPoliciesFromCloud, tt as REDTEAM_MEMORY_POISONING_PLUGIN_ID, u as MistralEmbeddingProvider, v as DefaultGradingProvider$2, w as AzureChatCompletionProvider, x as DefaultSynthesizeProvider$1, y as DefaultLlmRubricProvider, z as removePrefix } from "./providers-CJh7iriU.js";
10
+ import { r as runPython } from "./pythonUtils-DMO68Jg7.js";
11
+ import { i as isJavascriptFile } from "./fileExtensions-8CjoL7vB.js";
12
+ import { N as parseFileUrl, O as maybeLoadToolsFromExternalFile, S as getNunjucksEngineForFilePath, T as maybeLoadFromExternalFile, k as parsePathOrGlob, w as maybeLoadConfigFromExternalFile } from "./util-Dub0f_ej.js";
13
+ import { r as accumulateTokenUsage } from "./tokenUsageUtils-4c780gFd.js";
14
+ import { a as isCacheEnabled, i as getCache, r as fetchWithCache } from "./cache-Bzttsk0X.js";
15
+ import { t as OpenAiChatCompletionProvider } from "./chat-Cx_LkwvZ.js";
16
+ import { x as hasGoogleDefaultCredentials } from "./transform-_DpNB4qp.js";
17
+ import { t as OpenAiEmbeddingProvider } from "./embedding-ChS1ivFS.js";
18
+ import { t as AnthropicMessagesProvider } from "./messages-CI69Lasb.js";
19
+ import { t as OpenAiResponsesProvider } from "./responses-BiaBguAu.js";
20
+ import { n as sha256 } from "./createHash-Un4Q_huE.js";
21
+ import { a as PROMPT_DELIMITER, n as maybeFilePath, r as normalizeInput } from "./utils-o8S5huU2.js";
23
22
  import * as fs$2 from "fs";
24
23
  import fs from "fs";
25
24
  import path from "path";
26
25
  import yaml from "js-yaml";
27
26
  import { AsyncLocalStorage } from "node:async_hooks";
27
+ import dedent from "dedent";
28
+ import z$1 from "zod";
29
+ import { readFile, stat } from "fs/promises";
28
30
  import path$1 from "node:path";
29
31
  import fsPromises from "node:fs/promises";
32
+ import { parse as parse$1 } from "csv-parse/sync";
33
+ import { globSync } from "glob";
30
34
  import { execFile } from "child_process";
31
35
  import { PythonShell } from "python-shell";
32
36
  import Clone from "rfdc";
33
- import dedent from "dedent";
34
- import { readFile, stat } from "fs/promises";
35
- import { globSync } from "glob";
36
- import z$1 from "zod";
37
- import { parse as parse$1 } from "csv-parse/sync";
38
37
  import cliProgress from "cli-progress";
39
- //#region src/assertions/contextUtils.ts
38
+ //#region src/scheduler/providerCallExecutionContext.ts
39
+ const providerCallExecutionContext = new AsyncLocalStorage();
40
+ function getProviderCallExecutionContext() {
41
+ return providerCallExecutionContext.getStore();
42
+ }
43
+ function withProviderCallExecutionContext(context, fn) {
44
+ return providerCallExecutionContext.run(context, fn);
45
+ }
46
+ //#endregion
47
+ //#region src/matchers/providers.ts
40
48
  /**
41
- * Resolves the context value for context-based assertions.
42
- * Supports extracting context from test variables or transforming from output.
43
- * Can return either a single context string or an array of context chunks.
49
+ * Helper to call provider with consistent context propagation pattern.
50
+ * Spreads the optional context and merges with prompt label and vars.
51
+ * Also reuses evaluator scheduler context for cancellation, rate limits,
52
+ * and grouped grading provider calls when present.
44
53
  *
45
- * @param assertion - The assertion configuration
46
- * @param test - The test case
47
- * @param output - The provider output (after provider transform, before test transform)
48
- * @param prompt - The prompt text
49
- * @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
50
- * @param providerResponse - Optional full provider response for contextTransform
51
- * @returns The resolved context string or array of strings
52
- * @throws Error if context cannot be resolved or transform fails
54
+ * IMPORTANT: Spread order matters - context is spread first, then prompt/vars
55
+ * override. This ensures originalProvider from context is preserved while
56
+ * allowing this call to specify its own prompt metadata.
53
57
  */
54
- async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
55
- let contextValue;
56
- if (test.vars?.context) {
57
- if (typeof test.vars.context === "string") contextValue = test.vars.context;
58
- else if (Array.isArray(test.vars.context)) {
59
- const invalidEntry = [...test.vars.context.entries()].find(([, v]) => typeof v !== "string");
60
- if (invalidEntry) {
61
- const [idx, val] = invalidEntry;
62
- invariant(false, `Invalid context: expected an array of strings, but found ${typeof val} at index ${idx}`);
58
+ function callProviderWithContext(provider, prompt, label, vars, context) {
59
+ const callApiContext = {
60
+ ...context,
61
+ prompt: {
62
+ raw: prompt,
63
+ label
64
+ },
65
+ vars
66
+ };
67
+ const executionContext = getProviderCallExecutionContext();
68
+ const callApiOptions = executionContext?.abortSignal ? { abortSignal: executionContext.abortSignal } : void 0;
69
+ const callApi = () => callApiOptions ? provider.callApi(prompt, callApiContext, callApiOptions) : provider.callApi(prompt, callApiContext);
70
+ const executeCall = () => {
71
+ if (executionContext?.rateLimitRegistry && !isRateLimitWrapped(provider)) return executionContext.rateLimitRegistry.execute(provider, callApi, createProviderRateLimitOptions());
72
+ return callApi();
73
+ };
74
+ if (executionContext?.providerCallQueue) return executionContext.providerCallQueue.enqueue(provider.id(), executeCall);
75
+ return executeCall();
76
+ }
77
+ async function loadFromProviderOptions(provider) {
78
+ invariant(typeof provider === "object", `Provider must be an object, but received a ${typeof provider}: ${provider}`);
79
+ invariant(!Array.isArray(provider), `Provider must be an object, but received an array: ${JSON.stringify(provider)}`);
80
+ invariant(provider.id, "Provider supplied to assertion must have an id");
81
+ return loadApiProvider(provider.id, {
82
+ options: provider,
83
+ basePath: state.basePath
84
+ });
85
+ }
86
+ function isSimulatedUserProviderConfig(provider) {
87
+ if (typeof provider === "string") return provider === "promptfoo:simulated-user";
88
+ if (!provider || typeof provider !== "object" || Array.isArray(provider)) return false;
89
+ if (typeof provider.id === "function") return provider.id() === "promptfoo:simulated-user";
90
+ const providerId = provider.id;
91
+ if (typeof providerId === "string") return providerId === "promptfoo:simulated-user";
92
+ return Object.values(provider).some((providerTypeConfig) => isSimulatedUserProviderConfig(providerTypeConfig));
93
+ }
94
+ async function getGradingProvider(type, provider, defaultProvider) {
95
+ let finalProvider;
96
+ if (typeof provider === "string") finalProvider = await loadApiProvider(provider, { basePath: state.basePath });
97
+ else if (provider != null && typeof provider === "object" && typeof provider.id === "function") finalProvider = provider;
98
+ else if (provider != null && typeof provider === "object") {
99
+ const typeValue = provider[type];
100
+ if (typeValue) finalProvider = await getGradingProvider(type, typeValue, defaultProvider);
101
+ else if (provider.id) finalProvider = await loadFromProviderOptions(provider);
102
+ else if (Array.isArray(provider)) throw new Error(`Provider must be an object or string, but received an array.\n\nCheck that the provider ${JSON.stringify(provider[0], null, 2)} is not nested in an array.`);
103
+ else throw new Error(`Invalid provider definition for output type '${type}': ${JSON.stringify(provider, null, 2)}`);
104
+ } else {
105
+ const defaultTest = state.config?.defaultTest;
106
+ const defaultTestObj = typeof defaultTest === "object" ? defaultTest : null;
107
+ const cfg = [
108
+ defaultTestObj?.provider || void 0,
109
+ defaultTestObj?.options?.provider?.text || void 0,
110
+ defaultTestObj?.options?.provider || void 0
111
+ ].find((candidateProvider) => {
112
+ if (!candidateProvider) return false;
113
+ if (isSimulatedUserProviderConfig(candidateProvider)) {
114
+ logger.debug("[Grading] Skipping promptfoo:simulated-user as an implicit grader fallback");
115
+ return false;
63
116
  }
64
- contextValue = test.vars.context;
65
- }
66
- } else if (fallbackContext) contextValue = fallbackContext;
67
- if (assertion.contextTransform) try {
68
- const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
69
- const transformed = await transform(assertion.contextTransform, outputForTransform, {
70
- vars: test.vars,
71
- prompt: { label: prompt },
72
- ...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
117
+ return true;
73
118
  });
74
- invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${assertion.contextTransform}`);
75
- contextValue = transformed;
76
- } catch (error) {
77
- throw new Error(`Failed to transform context using expression '${assertion.contextTransform}': ${error instanceof Error ? error.message : String(error)}`);
119
+ if (cfg) {
120
+ finalProvider = await getGradingProvider(type, cfg, defaultProvider);
121
+ if (finalProvider) logger.debug("[Grading] Using provider from defaultTest fallback", { providerId: finalProvider.id() });
122
+ } else finalProvider = defaultProvider;
78
123
  }
79
- invariant(typeof contextValue === "string" && contextValue.length > 0 || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0), "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
80
- return contextValue;
124
+ return finalProvider;
81
125
  }
82
- /**
83
- * Serializes context (string or string[]) to a single string for prompts.
84
- * Joins chunks with double newlines to preserve separation.
85
- */
86
- function serializeContext(context) {
87
- return Array.isArray(context) ? context.join("\n\n") : context;
126
+ async function getAndCheckProvider(type, provider, defaultProvider, checkName) {
127
+ const matchedProvider = await getGradingProvider(type, provider, defaultProvider);
128
+ if (!matchedProvider) if (defaultProvider) {
129
+ logger.warn("[Grading] Falling back to default provider", {
130
+ checkName,
131
+ type
132
+ });
133
+ return defaultProvider;
134
+ } else throw new Error(`No provider of type ${type} found for '${checkName}'`);
135
+ let isValidProviderType = true;
136
+ if (type === "embedding") isValidProviderType = "callEmbeddingApi" in matchedProvider || "callSimilarityApi" in matchedProvider;
137
+ else if (type === "classification") isValidProviderType = "callClassificationApi" in matchedProvider;
138
+ else if (type === "moderation") isValidProviderType = "callModerationApi" in matchedProvider;
139
+ if (!isValidProviderType) {
140
+ if (provider) throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
141
+ if (defaultProvider) {
142
+ logger.warn("[Grading] Falling back to default provider after type check failed", {
143
+ checkName,
144
+ providerId: matchedProvider.id(),
145
+ type
146
+ });
147
+ return defaultProvider;
148
+ }
149
+ throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
150
+ }
151
+ return matchedProvider;
88
152
  }
89
153
  //#endregion
90
- //#region src/assertions/utils.ts
91
- const clone = Clone();
92
- function getFinalTest(test, assertion) {
93
- const ret = clone({
94
- ...test,
95
- ...test.options && test.options.provider && { options: {
96
- ...test.options,
97
- provider: void 0
98
- } },
99
- ...test.provider && { provider: void 0 }
100
- });
101
- ret.options = ret.options || {};
102
- if (test.provider) ret.provider = test.provider;
103
- ret.options.provider = assertion.provider || test?.options?.provider;
104
- ret.options.rubricPrompt = assertion.rubricPrompt || ret.options.rubricPrompt;
105
- return Object.freeze(ret);
106
- }
107
- async function loadFromJavaScriptFile(filePath, functionName, args) {
108
- const requiredModule = await importModule(filePath, functionName);
109
- if (functionName && typeof requiredModule[functionName] === "function") return requiredModule[functionName](...args);
110
- else if (typeof requiredModule === "function") return requiredModule(...args);
111
- else if (requiredModule.default && typeof requiredModule.default === "function") return requiredModule.default(...args);
112
- else throw new Error(`Assertion malformed: ${filePath} must export a function or have a default export as a function`);
113
- }
114
- function processFileReference(fileRef) {
115
- const basePath = state.basePath || "";
116
- const filePath = path.resolve(basePath, fileRef.slice(7));
117
- const fileContent = fs.readFileSync(filePath, "utf8");
118
- const extension = path.extname(filePath);
119
- if ([
120
- ".json",
121
- ".yaml",
122
- ".yml"
123
- ].includes(extension)) return yaml.load(fileContent);
124
- else if (extension === ".txt") return fileContent.trim();
125
- else throw new Error(`Unsupported file type: ${filePath}`);
154
+ //#region src/providers/anthropic/defaults.ts
155
+ const DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-6";
156
+ /**
157
+ * Helper function to create a lazy-loaded provider. This allows the .env file to be
158
+ * loaded first before the provider is initialized.
159
+ * @param factory Factory function that creates provider instance with optional env
160
+ * @returns Object with getter that lazily initializes the provider with the latest env
161
+ */
162
+ function createLazyProvider(factory) {
163
+ const instances = /* @__PURE__ */ new Map();
164
+ return { getInstance(env) {
165
+ const cacheKey = env ? JSON.stringify(env) : "";
166
+ if (!instances.has(cacheKey)) instances.set(cacheKey, factory(env));
167
+ return instances.get(cacheKey);
168
+ } };
126
169
  }
127
- function coerceString(value) {
128
- if (typeof value === "string") return value;
129
- return JSON.stringify(value);
170
+ var AnthropicLlmRubricProvider = class extends AnthropicMessagesProvider {
171
+ constructor(modelName, options = {}) {
172
+ const { env, config = {} } = options;
173
+ super(modelName, {
174
+ env,
175
+ config: {
176
+ tool_choice: {
177
+ type: "tool",
178
+ name: "grade_output"
179
+ },
180
+ tools: [{
181
+ name: "grade_output",
182
+ description: "Grade the given output based on specific criteria",
183
+ input_schema: {
184
+ type: "object",
185
+ properties: {
186
+ pass: {
187
+ type: "boolean",
188
+ description: "Whether the output passes the criteria"
189
+ },
190
+ score: {
191
+ type: "number",
192
+ description: "The score assigned to the output"
193
+ },
194
+ reason: {
195
+ type: "string",
196
+ description: "The reason for the given grade"
197
+ }
198
+ },
199
+ required: [
200
+ "pass",
201
+ "score",
202
+ "reason"
203
+ ]
204
+ }
205
+ }],
206
+ ...config
207
+ }
208
+ });
209
+ }
210
+ async callApi(prompt) {
211
+ const result = await super.callApi(prompt);
212
+ if (typeof result.output !== "string") return { error: `Anthropic LLM rubric grader - malformed non-string output\n\n${JSON.stringify(result.output)}` };
213
+ try {
214
+ return { output: JSON.parse(result.output).input };
215
+ } catch (err) {
216
+ return { error: `Anthropic LLM rubric grader - invalid JSON: ${err}\n\n${result.output}` };
217
+ }
218
+ }
219
+ };
220
+ const gradingProviderFactory = createLazyProvider((env) => new AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
221
+ const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
222
+ const webSearchProviderFactory = createLazyProvider((env) => new AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, {
223
+ env,
224
+ config: { tools: [{
225
+ type: "web_search_20250305",
226
+ name: "web_search",
227
+ max_uses: 5
228
+ }] }
229
+ }));
230
+ /**
231
+ * Gets all default Anthropic providers with the given environment overrides
232
+ * @param env - Optional environment overrides
233
+ * @returns Anthropic provider implementations for various functions
234
+ */
235
+ function getAnthropicProviders(env) {
236
+ const gradingProvider = gradingProviderFactory.getInstance(env);
237
+ return {
238
+ gradingJsonProvider: gradingProvider,
239
+ gradingProvider,
240
+ llmRubricProvider: llmRubricProviderFactory.getInstance(env),
241
+ suggestionsProvider: gradingProvider,
242
+ synthesizeProvider: gradingProvider,
243
+ webSearchProvider: webSearchProviderFactory.getInstance(env)
244
+ };
130
245
  }
131
246
  //#endregion
132
- //#region src/external/prompts/ragas.ts
133
- const ANSWER_RELEVANCY_GENERATE = `Generate question for the given answer.
134
- Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
135
- Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
136
-
137
- Answer:{{answer}}
138
- Question:`;
139
- const CONTEXT_RECALL = `Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
140
- Think in steps and reason before coming to conclusion.
141
-
142
- context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
143
- answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
144
- classification
145
- 1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. The date of birth of Einstein is mentioned clearly in the context. So [Attributed]
146
- 2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
147
- 3. He published 4 papers in 1905. There is no mention about papers he wrote in given the context. So [Not Attributed]
148
- 4. Einstein moved to Switzerland in 1895. There is not supporting evidence for this in the given the context. So [Not Attributed]
149
-
150
- context:{{context}}
151
- answer:{{groundTruth}}
152
- classification:
153
- `;
154
- const CONTEXT_RECALL_ATTRIBUTED_TOKEN = "[Attributed]";
155
- const CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN = "[Not Attributed]";
156
- const CONTEXT_RELEVANCE = `Please extract relevant sentences from the provided context that is absolutely required answer the following query. If no relevant sentences are found, or if you believe the query cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.
157
-
158
- query: {{query}}
159
- context: {{context}}
160
- candidate sentences:
161
- `;
162
- const CONTEXT_RELEVANCE_BAD = "Insufficient Information";
163
- const CONTEXT_FAITHFULNESS_LONGFORM = `Given a question and answer, create one or more statements from each sentence in the given answer.
164
- question: Who was Albert Einstein and what is he best known for?
165
- answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
166
- statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity.
167
- question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
168
- answer: alcohol
169
- statements:\nCadmium Chloride is slightly soluble in alcohol.
170
- question: Were Shahul and Jithin of the same nationality?
171
- answer: They were from different countries.
172
- statements:\nShahul and Jithin were from different countries.
173
- question:{{question}}
174
- answer: {{answer}}
175
- statements:\n`;
176
- const CONTEXT_FAITHFULNESS_NLI_STATEMENTS = `Prompt: Natural language inference
177
- Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format.
178
-
179
- Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.
180
- statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n
181
- Answer:
182
- 1. John is majoring in Biology.
183
- Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No.
184
- 2. John is taking a course on Artificial Intelligence.
185
- Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No.
186
- 3. John is a dedicated student.
187
- Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes.
188
- 4. John has a part-time job.
189
- Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No.
190
- 5. John is interested in computer programming.
191
- Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes.
192
- Final verdict for each statement in order: No. No. Yes. No. Yes.
193
- context:\n{{context}}
194
- statements:\n{{statements|join("\\n")}}
195
- Answer:
196
- `;
247
+ //#region src/providers/github/defaults.ts
248
+ const githubConfig = {
249
+ apiBaseUrl: "https://models.github.ai/inference",
250
+ apiKeyEnvar: "GITHUB_TOKEN"
251
+ };
252
+ const DefaultGitHubGradingProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
253
+ const DefaultGitHubGradingJsonProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: {
254
+ ...githubConfig,
255
+ response_format: { type: "json_object" }
256
+ } });
257
+ const DefaultGitHubSuggestionsProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
258
+ new OpenAiChatCompletionProvider("openai/gpt-5-nano", { config: githubConfig });
259
+ new OpenAiChatCompletionProvider("openai/gpt-5-mini", { config: githubConfig });
260
+ new OpenAiChatCompletionProvider("openai/o4-mini", { config: githubConfig });
197
261
  //#endregion
198
- //#region src/prompts/grading.ts
199
- const DEFAULT_GRADING_PROMPT = JSON.stringify([{
200
- role: "system",
201
- content: dedent`You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
202
-
203
- Examples:
204
-
205
- <Output>Hello world</Output>
206
- <Rubric>Content contains a greeting</Rubric>
207
- {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
208
-
209
- <Output>Avast ye swabs, repel the invaders!</Output>
210
- <Rubric>Does not speak like a pirate</Rubric>
211
- {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}`
212
- }, {
213
- role: "user",
214
- content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
215
- }]);
216
- const PROMPTFOO_FACTUALITY_PROMPT = JSON.stringify([{
217
- role: "system",
218
- content: dedent`
219
- You are a precise factuality evaluator that compares a submitted answer to an expert answer.
220
-
221
- Your task is to analyze the factual content while ignoring differences in style, grammar, or punctuation.
222
- You must categorize the submission into one of these options:
223
-
224
- (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
225
- (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
226
- (C) The submitted answer contains all the same details as the expert answer.
227
- (D) There is a disagreement between the submitted answer and the expert answer.
228
- (E) The answers differ, but these differences don't matter from the perspective of factuality.
229
-
230
- Respond ONLY with a JSON object in this format:
231
- {
232
- "category": "[LETTER]",
233
- "reason": "[DETAILED EXPLANATION]"
234
- }
235
-
236
- - The "category" must be a single letter A, B, C, D, or E.
237
- - Provide a clear, detailed explanation in the "reason" field.
238
- - Your response must be valid JSON with no additional text.`
239
- }, {
240
- role: "user",
241
- content: dedent`
242
- I need you to compare these answers:
243
-
244
- <question>
245
- {{input}}
246
- </question>
247
-
248
- <expert_answer>
249
- {{ideal}}
250
- </expert_answer>
251
-
252
- <submitted_answer>
253
- {{completion}}
254
- </submitted_answer>
255
-
256
- Please analyze the factual relationship between these answers according to the categories you've been given.`
257
- }]);
258
- const OPENAI_CLOSED_QA_PROMPT = JSON.stringify([{
259
- role: "system",
260
- content: `You are assessing a submitted answer on a given task based on a criterion. Here is the data:
261
- [BEGIN DATA]
262
- ***
263
- [Task]: {{input}}
264
- ***
265
- [Submission]: {{completion}}
266
- ***
267
- [Criterion]: {{criteria}}
268
- ***
269
- [END DATA]
270
- Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
271
-
272
- Reasoning:`
273
- }]);
274
- const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
275
- role: "system",
276
- content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
277
-
278
- Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
279
-
280
- Your output is going to be copied directly into the program. It should contain the prompt ONLY`
262
+ //#region src/providers/mistral/defaults.ts
263
+ const DefaultEmbeddingProvider$1 = new MistralEmbeddingProvider();
264
+ const DefaultGradingProvider$1 = new MistralChatCompletionProvider("mistral-large-latest");
265
+ const DefaultGradingJsonProvider$1 = new MistralChatCompletionProvider("mistral-large-latest", { config: { response_format: { type: "json_object" } } });
266
+ const DefaultSuggestionsProvider$1 = new MistralChatCompletionProvider("mistral-large-latest");
267
+ const DefaultSynthesizeProvider = new MistralChatCompletionProvider("mistral-large-latest");
268
+ //#endregion
269
+ //#region src/providers/openai/defaults.ts
270
+ const DEFAULT_OPENAI_GRADING_MODEL = "gpt-5.4-2026-03-05";
271
+ const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider("text-embedding-3-large");
272
+ const DefaultGradingProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
273
+ const DefaultGradingJsonProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL, { config: { response_format: { type: "json_object" } } });
274
+ const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
275
+ const DefaultModerationProvider = new OpenAiModerationProvider("omni-moderation-latest");
276
+ const DefaultWebSearchProvider = new OpenAiResponsesProvider("gpt-5.4-2026-03-05", { config: { tools: [{ type: "web_search_preview" }] } });
277
+ async function getDefaultProviderPreferences(env) {
278
+ const hasAnthropicCredentials = Boolean(getEnvString("ANTHROPIC_API_KEY") || env?.ANTHROPIC_API_KEY);
279
+ const hasOpenAiCredentials = Boolean(getEnvString("OPENAI_API_KEY") || env?.OPENAI_API_KEY);
280
+ const hasGitHubCredentials = Boolean(getEnvString("GITHUB_TOKEN") || env?.GITHUB_TOKEN);
281
+ const hasGoogleAiStudioCredentials = Boolean(getEnvString("GEMINI_API_KEY") || env?.GEMINI_API_KEY || getEnvString("GOOGLE_API_KEY") || env?.GOOGLE_API_KEY || getEnvString("PALM_API_KEY") || env?.PALM_API_KEY);
282
+ const hasAzureApiKey = getEnvString("AZURE_OPENAI_API_KEY") || env?.AZURE_OPENAI_API_KEY || getEnvString("AZURE_API_KEY") || env?.AZURE_API_KEY;
283
+ const hasAzureClientCreds = (getEnvString("AZURE_CLIENT_ID") || env?.AZURE_CLIENT_ID) && (getEnvString("AZURE_CLIENT_SECRET") || env?.AZURE_CLIENT_SECRET) && (getEnvString("AZURE_TENANT_ID") || env?.AZURE_TENANT_ID);
284
+ const hasMistralCredentials = Boolean(getEnvString("MISTRAL_API_KEY") || env?.MISTRAL_API_KEY);
285
+ const preferAzure = Boolean(!hasOpenAiCredentials && (hasAzureApiKey || hasAzureClientCreds) && (getEnvString("AZURE_DEPLOYMENT_NAME") || env?.AZURE_DEPLOYMENT_NAME) && (getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME));
286
+ const preferAnthropic = !hasOpenAiCredentials && hasAnthropicCredentials;
287
+ const shouldUseFallbackDefaults = !preferAzure && !hasOpenAiCredentials && !hasAnthropicCredentials && !hasGoogleAiStudioCredentials;
288
+ const useGoogleVertexDefaults = shouldUseFallbackDefaults ? await hasGoogleDefaultCredentials() : false;
289
+ const useNonGoogleFallbackDefaults = shouldUseFallbackDefaults && !useGoogleVertexDefaults;
290
+ const hasCodexCredentials = useNonGoogleFallbackDefaults && !hasMistralCredentials && hasCodexDefaultCredentials(env);
291
+ return {
292
+ preferAnthropic,
293
+ preferAzure,
294
+ useCodexDefaults: hasCodexCredentials,
295
+ useGitHubDefaults: useNonGoogleFallbackDefaults && !hasMistralCredentials && !hasCodexCredentials && hasGitHubCredentials,
296
+ useGoogleAiStudioDefaults: !hasOpenAiCredentials && !hasAnthropicCredentials && hasGoogleAiStudioCredentials,
297
+ useGoogleVertexDefaults,
298
+ useMistralDefaults: useNonGoogleFallbackDefaults && hasMistralCredentials
299
+ };
300
+ }
301
+ async function getDefaultProviders(env) {
302
+ const { preferAnthropic, preferAzure, useCodexDefaults, useGitHubDefaults, useGoogleAiStudioDefaults, useGoogleVertexDefaults, useMistralDefaults } = await getDefaultProviderPreferences(env);
303
+ let providers;
304
+ if (preferAzure) {
305
+ logger.debug("Using Azure OpenAI default providers");
306
+ const deploymentName = getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME;
307
+ if (!deploymentName) throw new Error("AZURE_OPENAI_DEPLOYMENT_NAME must be set when using Azure OpenAI");
308
+ const embeddingDeploymentName = getEnvString("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME || deploymentName;
309
+ const azureProvider = new AzureChatCompletionProvider(deploymentName, { env });
310
+ providers = {
311
+ embeddingProvider: new AzureEmbeddingProvider(embeddingDeploymentName, { env }),
312
+ gradingJsonProvider: azureProvider,
313
+ gradingProvider: azureProvider,
314
+ moderationProvider: DefaultModerationProvider,
315
+ suggestionsProvider: azureProvider,
316
+ synthesizeProvider: azureProvider
317
+ };
318
+ } else if (preferAnthropic) {
319
+ logger.debug("Using Anthropic default providers");
320
+ const anthropicProviders = getAnthropicProviders(env);
321
+ providers = {
322
+ embeddingProvider: DefaultEmbeddingProvider,
323
+ gradingJsonProvider: anthropicProviders.gradingJsonProvider,
324
+ gradingProvider: anthropicProviders.gradingProvider,
325
+ llmRubricProvider: anthropicProviders.llmRubricProvider,
326
+ moderationProvider: DefaultModerationProvider,
327
+ suggestionsProvider: anthropicProviders.suggestionsProvider,
328
+ synthesizeProvider: anthropicProviders.synthesizeProvider,
329
+ webSearchProvider: anthropicProviders.webSearchProvider
330
+ };
331
+ } else if (useGoogleAiStudioDefaults) {
332
+ logger.debug("Using Google AI Studio default providers");
333
+ providers = {
334
+ embeddingProvider: DefaultEmbeddingProvider$2,
335
+ gradingJsonProvider: DefaultGradingJsonProvider$2,
336
+ gradingProvider: DefaultGradingProvider$2,
337
+ llmRubricProvider: DefaultLlmRubricProvider,
338
+ moderationProvider: DefaultModerationProvider,
339
+ suggestionsProvider: DefaultSuggestionsProvider$2,
340
+ synthesizeProvider: DefaultSynthesizeProvider$1
341
+ };
342
+ } else if (useGoogleVertexDefaults) {
343
+ logger.debug("Using Google Vertex default providers");
344
+ providers = {
345
+ embeddingProvider: DefaultEmbeddingProvider$2,
346
+ gradingJsonProvider: DefaultGradingProvider$3,
347
+ gradingProvider: DefaultGradingProvider$3,
348
+ moderationProvider: DefaultModerationProvider,
349
+ suggestionsProvider: DefaultGradingProvider$3,
350
+ synthesizeProvider: DefaultGradingProvider$3
351
+ };
352
+ } else if (useMistralDefaults) {
353
+ logger.debug("Using Mistral default providers");
354
+ providers = {
355
+ embeddingProvider: DefaultEmbeddingProvider$1,
356
+ gradingJsonProvider: DefaultGradingJsonProvider$1,
357
+ gradingProvider: DefaultGradingProvider$1,
358
+ moderationProvider: DefaultModerationProvider,
359
+ suggestionsProvider: DefaultSuggestionsProvider$1,
360
+ synthesizeProvider: DefaultSynthesizeProvider
361
+ };
362
+ } else if (useCodexDefaults) {
363
+ logger.debug("Using Codex SDK default providers from ChatGPT/Codex credentials");
364
+ providers = {
365
+ embeddingProvider: DefaultEmbeddingProvider,
366
+ moderationProvider: DefaultModerationProvider,
367
+ ...getCodexDefaultProviders(env)
368
+ };
369
+ } else if (useGitHubDefaults) {
370
+ logger.debug("Using GitHub Models default providers");
371
+ providers = {
372
+ embeddingProvider: DefaultEmbeddingProvider,
373
+ gradingJsonProvider: DefaultGitHubGradingJsonProvider,
374
+ gradingProvider: DefaultGitHubGradingProvider,
375
+ moderationProvider: DefaultModerationProvider,
376
+ suggestionsProvider: DefaultGitHubSuggestionsProvider,
377
+ synthesizeProvider: DefaultGitHubGradingJsonProvider
378
+ };
379
+ } else {
380
+ logger.debug("Using OpenAI default providers");
381
+ providers = {
382
+ embeddingProvider: DefaultEmbeddingProvider,
383
+ gradingJsonProvider: DefaultGradingJsonProvider,
384
+ gradingProvider: DefaultGradingProvider,
385
+ moderationProvider: DefaultModerationProvider,
386
+ suggestionsProvider: DefaultSuggestionsProvider,
387
+ synthesizeProvider: DefaultGradingJsonProvider,
388
+ webSearchProvider: DefaultWebSearchProvider
389
+ };
390
+ }
391
+ if (getEnvString("AZURE_CONTENT_SAFETY_ENDPOINT") || env?.AZURE_CONTENT_SAFETY_ENDPOINT) providers.moderationProvider = new AzureModerationProvider("text-content-safety", { env });
392
+ return providers;
393
+ }
394
+ //#endregion
395
+ //#region src/assertions/utils.ts
396
+ const clone = Clone();
397
+ function getFinalTest(test, assertion) {
398
+ const ret = clone({
399
+ ...test,
400
+ ...test.options && test.options.provider && { options: {
401
+ ...test.options,
402
+ provider: void 0
403
+ } },
404
+ ...test.provider && { provider: void 0 }
405
+ });
406
+ ret.options = ret.options || {};
407
+ if (test.provider) ret.provider = test.provider;
408
+ ret.options.provider = assertion.provider || test?.options?.provider;
409
+ ret.options.rubricPrompt = assertion.rubricPrompt || ret.options.rubricPrompt;
410
+ return Object.freeze(ret);
411
+ }
412
+ async function loadFromJavaScriptFile(filePath, functionName, args) {
413
+ const requiredModule = await importModule(filePath, functionName);
414
+ if (functionName && typeof requiredModule[functionName] === "function") return requiredModule[functionName](...args);
415
+ else if (typeof requiredModule === "function") return requiredModule(...args);
416
+ else if (requiredModule.default && typeof requiredModule.default === "function") return requiredModule.default(...args);
417
+ else throw new Error(`Assertion malformed: ${filePath} must export a function or have a default export as a function`);
418
+ }
419
+ function processFileReference(fileRef) {
420
+ const basePath = state.basePath || "";
421
+ const filePath = path.resolve(basePath, fileRef.slice(7));
422
+ const fileContent = fs.readFileSync(filePath, "utf8");
423
+ const extension = path.extname(filePath);
424
+ if ([
425
+ ".json",
426
+ ".yaml",
427
+ ".yml"
428
+ ].includes(extension)) return yaml.load(fileContent);
429
+ else if (extension === ".txt") return fileContent.trim();
430
+ else throw new Error(`Unsupported file type: ${filePath}`);
431
+ }
432
+ function coerceString(value) {
433
+ if (typeof value === "string") return value;
434
+ return JSON.stringify(value);
435
+ }
436
+ //#endregion
437
+ //#region src/matchers/shared.ts
438
+ /**
439
+ * Normalize token usage for matcher results. Unlike the evaluator-level
440
+ * normalizeTokenUsage, this excludes the `assertions` field and preserves
441
+ * the existing completionDetails shape (passing through whatever the
442
+ * provider returned, or undefined if not present).
443
+ */
444
+ function normalizeMatcherTokenUsage(tokenUsage) {
445
+ return {
446
+ total: tokenUsage?.total || 0,
447
+ prompt: tokenUsage?.prompt || 0,
448
+ completion: tokenUsage?.completion || 0,
449
+ cached: tokenUsage?.cached || 0,
450
+ numRequests: tokenUsage?.numRequests || 0,
451
+ completionDetails: tokenUsage?.completionDetails || {
452
+ reasoning: 0,
453
+ acceptedPrediction: 0,
454
+ rejectedPrediction: 0
455
+ }
456
+ };
457
+ }
458
+ function fail(reason, tokensUsed) {
459
+ return {
460
+ pass: false,
461
+ reason,
462
+ score: 0,
463
+ tokensUsed: normalizeMatcherTokenUsage(tokensUsed)
464
+ };
465
+ }
466
+ function cosineSimilarity(vecA, vecB) {
467
+ if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
468
+ const dotProduct = vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
469
+ const vecAMagnitude = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
470
+ const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
471
+ if (vecAMagnitude === 0 || vecBMagnitude === 0) return 0;
472
+ return dotProduct / (vecAMagnitude * vecBMagnitude);
473
+ }
474
+ function dotProduct(vecA, vecB) {
475
+ if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
476
+ return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
477
+ }
478
+ function euclideanDistance(vecA, vecB) {
479
+ if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
480
+ const sumSquaredDiff = vecA.reduce((acc, val, idx) => {
481
+ const diff = val - vecB[idx];
482
+ return acc + diff * diff;
483
+ }, 0);
484
+ return Math.sqrt(sumSquaredDiff);
485
+ }
486
+ function tryParse(content) {
487
+ try {
488
+ return JSON.parse(content);
489
+ } catch {}
490
+ return content;
491
+ }
492
+ function splitIntoSentences(text) {
493
+ return text.split("\n").filter((sentence) => sentence.trim() !== "");
494
+ }
495
+ //#endregion
496
+ //#region src/matchers/rubric.ts
497
+ const nunjucks = getNunjucksEngine(void 0, false, true);
498
+ var LlmRubricProviderError = class extends Error {
499
+ constructor(message) {
500
+ super(message);
501
+ this.name = "LlmRubricProviderError";
502
+ }
281
503
  };
282
- const SELECT_BEST_PROMPT = JSON.stringify([{
283
- role: "system",
284
- content: `You are comparing multiple pieces of text to see which best fits the following criteria: {{criteria}}
285
-
286
- Here are the pieces of text:
287
-
288
- {% for output in outputs %}
289
- <Text index="{{ loop.index0 }}">
290
- {{ output }}
291
- </Text>
292
- {% endfor %}
293
-
294
- Output the index of the text that best fits the criteria. You must output a single integer.`
295
- }]);
296
- const DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([{
297
- role: "system",
298
- content: dedent`You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
299
-
300
- You MUST search the web when:
301
- - The rubric asks about current information (prices, weather, news, etc.)
302
- - Facts need to be verified against recent data
303
- - The rubric references time-sensitive information
304
-
305
- Examples:
306
-
307
- <Output>The current CEO of Microsoft is Satya Nadella</Output>
308
- <Rubric>Contains accurate information about Microsoft's leadership</Rubric>
309
- {"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
310
-
311
- <Output>Bitcoin is trading at $45,000</Output>
312
- <Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
313
- {"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`
314
- }, {
315
- role: "user",
316
- content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
317
- }]);
318
- const TRAJECTORY_GOAL_SUCCESS_PROMPT = JSON.stringify([{
319
- role: "system",
320
- content: dedent`You are grading whether an AI agent successfully completed a goal based on its final output and a summarized execution trajectory. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
321
-
322
- Judge end-to-end success, not stylistic perfection.
323
- Use the trajectory as evidence for what the agent actually did.
324
- Give partial credit when the agent made progress but did not fully achieve the goal.
325
-
326
- Examples:
327
-
328
- <Goal>Find the order status and tell the user whether it has shipped</Goal>
329
- <Trajectory>{"stepCount":2,"steps":[{"index":1,"type":"tool","name":"search_orders"},{"index":2,"type":"message","name":"agent response"}]}</Trajectory>
330
- <Output>Your order shipped yesterday and should arrive on Tuesday.</Output>
331
- {"reason":"The agent used the order lookup tool and gave the user the shipping status, so the goal was achieved.","pass":true,"score":1.0}
332
-
333
- <Goal>Find the order status and tell the user whether it has shipped</Goal>
334
- <Trajectory>{"stepCount":1,"steps":[{"index":1,"type":"message","name":"agent response"}]}</Trajectory>
335
- <Output>I cannot check your order right now.</Output>
336
- {"reason":"The agent did not show evidence of checking the order and did not provide the requested status.","pass":false,"score":0.0}`
337
- }, {
338
- role: "user",
339
- content: dedent`<Goal>
340
- {{ goal }}
341
- </Goal>
342
- <Trajectory>
343
- {{ trajectory }}
344
- </Trajectory>
345
- <Output>
346
- {{ output }}
347
- </Output>`
348
- }]);
504
+ async function loadRubricPrompt(rubricPrompt, defaultPrompt) {
505
+ if (!rubricPrompt) return defaultPrompt;
506
+ if (typeof rubricPrompt === "object" && Object.keys(rubricPrompt).length === 0) return defaultPrompt;
507
+ if (typeof rubricPrompt === "string" && rubricPrompt.startsWith("file://")) {
508
+ const basePath = state.basePath || "";
509
+ const { filePath, functionName } = parseFileUrl(getNunjucksEngineForFilePath().renderString(rubricPrompt, {}));
510
+ const resolvedPath = path.resolve(basePath, filePath);
511
+ if (isJavascriptFile(filePath)) rubricPrompt = await loadFromJavaScriptFile(resolvedPath, functionName, []);
512
+ else {
513
+ if (!fs$2.existsSync(resolvedPath)) throw new Error(`File does not exist: ${resolvedPath}`);
514
+ rubricPrompt = fs$2.readFileSync(resolvedPath, "utf8");
515
+ }
516
+ } else rubricPrompt = maybeLoadFromExternalFile(rubricPrompt);
517
+ if (typeof rubricPrompt === "object") rubricPrompt = JSON.stringify(rubricPrompt);
518
+ invariant(typeof rubricPrompt === "string", "rubricPrompt must be a string");
519
+ return rubricPrompt;
520
+ }
521
+ function processContextForTemplating(context, enableObjectAccess) {
522
+ if (enableObjectAccess) return context;
523
+ return Object.fromEntries(Object.entries(context).map(([key, value]) => {
524
+ if (value && typeof value === "object") {
525
+ if (Array.isArray(value)) return [key, value.map((item) => item && typeof item === "object" ? JSON.stringify(item) : item)];
526
+ return [key, JSON.stringify(value)];
527
+ }
528
+ return [key, value];
529
+ }));
530
+ }
531
+ async function renderLlmRubricPrompt(rubricPrompt, context) {
532
+ const processedContext = processContextForTemplating(context, getEnvBool("PROMPTFOO_DISABLE_OBJECT_STRINGIFY", false));
533
+ try {
534
+ const parsed = JSON.parse(rubricPrompt, (_k, v) => typeof v === "string" ? nunjucks.renderString(v, processedContext) : v);
535
+ return JSON.stringify(parsed);
536
+ } catch (err) {
537
+ logger.debug(`[Rubric] Rubric prompt is not valid JSON, using Nunjucks rendering: ${err.message}`);
538
+ }
539
+ return nunjucks.renderString(rubricPrompt, processedContext);
540
+ }
541
+ function parseJsonGradingResponse(label, resp) {
542
+ let jsonObjects = [];
543
+ if (typeof resp.output === "string") try {
544
+ jsonObjects = extractJsonObjects(resp.output);
545
+ if (jsonObjects.length === 0) return { failure: fail(`Could not extract JSON from ${label} response`, resp.tokenUsage) };
546
+ } catch (err) {
547
+ return { failure: fail(`${label} produced malformed response: ${err}\n\n${resp.output}`, resp.tokenUsage) };
548
+ }
549
+ else if (typeof resp.output === "object" && resp.output !== null && !Array.isArray(resp.output)) jsonObjects = [resp.output];
550
+ else return { failure: fail(`${label} produced malformed response - output must be string or object. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
551
+ const parsed = jsonObjects[0];
552
+ if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return { failure: fail(`${label} produced malformed response. We were not able to parse the response as JSON. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
553
+ return { parsed };
554
+ }
555
+ async function runJsonGradingPrompt({ assertion, checkName, defaultPrompt, grading, label, providerCallContext, throwOnError, vars }) {
556
+ const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading.rubricPrompt, defaultPrompt), vars);
557
+ const defaultProviders = await getDefaultProviders();
558
+ const defaultProvider = defaultProviders.llmRubricProvider || defaultProviders.gradingJsonProvider;
559
+ const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, defaultProvider, checkName), prompt, label, vars, providerCallContext);
560
+ if (resp.error || !resp.output) {
561
+ if (throwOnError) throw new Error(resp.error || "No output");
562
+ return fail(resp.error || "No output", resp.tokenUsage);
563
+ }
564
+ const { parsed, failure } = parseJsonGradingResponse(label, resp);
565
+ if (!parsed) return failure;
566
+ let pass = parsed.pass ?? true;
567
+ if (typeof pass !== "boolean") pass = /^(true|yes|pass|y)$/i.test(String(pass));
568
+ let score = parsed.score;
569
+ if (typeof score !== "number") score = Number.isFinite(Number(score)) ? Number(score) : Number(pass);
570
+ const threshold = typeof assertion?.threshold === "string" ? Number(assertion.threshold) : assertion?.threshold;
571
+ if (typeof threshold === "number" && Number.isFinite(threshold)) pass = pass && score >= threshold;
572
+ const reason = parsed.reason || (pass ? "Grading passed" : `Score ${score} below threshold ${threshold}`);
573
+ let responseMetadata = {};
574
+ if (resp.metadata && typeof resp.metadata === "object" && !Array.isArray(resp.metadata)) {
575
+ const serializedMetadata = safeJsonStringify(resp.metadata);
576
+ responseMetadata = serializedMetadata ? JSON.parse(serializedMetadata) : {};
577
+ }
578
+ return {
579
+ assertion,
580
+ pass,
581
+ score,
582
+ reason,
583
+ tokensUsed: normalizeMatcherTokenUsage({
584
+ ...resp.tokenUsage,
585
+ completionDetails: resp.tokenUsage?.completionDetails || parsed.tokensUsed?.completionDetails
586
+ }),
587
+ metadata: {
588
+ ...responseMetadata,
589
+ renderedGradingPrompt: prompt
590
+ }
591
+ };
592
+ }
349
593
  //#endregion
350
594
  //#region src/prompts/processors/csv.ts
351
595
  /**
@@ -459,810 +703,667 @@ const executablePromptFunction = async (scriptPath, context) => {
459
703
  if (fileHashes.length > 0 && isCacheEnabled()) await getCache().set(cacheKey, standardOutput);
460
704
  resolve(standardOutput);
461
705
  });
462
- });
463
- };
464
- /**
465
- * Processes an executable file to generate prompts.
466
- * The executable can be any script or binary that outputs prompt text to stdout.
467
- * It receives the context as JSON in its first argument.
468
- *
469
- * @param filePath - Path to the executable file (can include arguments).
470
- * @param prompt - The raw prompt data.
471
- * @param functionName - Not used for executables, but kept for interface consistency.
472
- * @returns Array of prompts generated from the executable.
473
- */
474
- async function processExecutableFile(filePath, prompt, _functionName) {
475
- let rawContent = filePath;
476
- const firstPart = parseScriptParts(filePath)[0];
477
- if (firstPart) try {
478
- const stats = await stat(firstPart);
479
- if (stats.isFile() && stats.size < 1024 * 100) {
480
- const content = await readFile(firstPart, "utf-8");
481
- if (!/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(content.substring(0, 1e3))) rawContent = content;
482
- }
483
- } catch (_e) {}
484
- const label = prompt.label ?? filePath;
485
- return [{
486
- raw: rawContent,
487
- label,
488
- function: (context) => executablePromptFunction(filePath, {
489
- ...context,
490
- config: prompt.config
491
- }),
492
- config: prompt.config
493
- }];
494
- }
495
- //#endregion
496
- //#region src/prompts/processors/javascript.ts
497
- const transformContext = (context) => {
498
- invariant(context.provider, "Provider is required");
499
- return {
500
- vars: context.vars,
501
- provider: {
502
- id: context.provider.id(),
503
- label: context.provider.label
504
- },
505
- config: context.config ?? {}
506
- };
507
- };
508
- /**
509
- * Processes a JavaScript file to import and execute a module function as a prompt.
510
- * @param filePath - Path to the JavaScript file.
511
- * @param functionName - Optional function name to execute.
512
- * @returns Promise resolving to an array of prompts.
513
- */
514
- async function processJsFile(filePath, prompt, functionName) {
515
- const promptFunction = await importModule(filePath, functionName);
516
- return [{
517
- raw: String(promptFunction),
518
- label: prompt.label ? prompt.label : functionName ? `${filePath}:${functionName}` : filePath,
519
- function: (context) => promptFunction(transformContext({
520
- ...context,
521
- config: prompt.config ?? {}
522
- })),
523
- config: prompt.config ?? {}
524
- }];
525
- }
526
- //#endregion
527
- //#region src/prompts/processors/jinja.ts
528
- /**
529
- * Processes a Jinja2 template file to extract prompts.
530
- * Similar to markdown files, each Jinja2 file is treated as a single prompt.
531
- *
532
- * @param filePath - Path to the Jinja2 template file.
533
- * @param prompt - The raw prompt data.
534
- * @returns Array of one `Prompt` object.
535
- */
536
- function processJinjaFile(filePath, prompt) {
537
- const content = fs$2.readFileSync(filePath, "utf8");
538
- return [{
539
- raw: content,
540
- label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`,
541
- config: prompt.config
542
- }];
543
- }
544
- //#endregion
545
- //#region src/prompts/processors/json.ts
546
- /**
547
- * Processes a JSON file to extract prompts.
548
- * This function reads a JSON file and converts it to a `Prompt` object.
549
- * Any file:// references within the JSON content are recursively resolved.
550
- *
551
- * @param filePath - The path to the JSON file.
552
- * @param prompt - The raw prompt data, used for labeling.
553
- * @returns An array of one `Prompt` object.
554
- * @throws Will throw an error if the file cannot be read.
555
- */
556
- function processJsonFile(filePath, prompt) {
557
- const fileContents = fs$2.readFileSync(filePath, "utf8");
558
- let processedContents = fileContents;
559
- try {
560
- const resolved = maybeLoadConfigFromExternalFile(JSON.parse(fileContents));
561
- processedContents = JSON.stringify(resolved);
562
- } catch {}
563
- return [{
564
- raw: processedContents,
565
- label: prompt.label || `${filePath}: ${processedContents}`,
566
- config: prompt.config
567
- }];
568
- }
569
- //#endregion
570
- //#region src/prompts/processors/jsonl.ts
571
- /**
572
- * Processes a JSONL file to extract prompts.
573
- * @param filePath - Path to the JSONL file.
574
- * @param prompt - The raw prompt data.
575
- * @returns Array of prompts extracted from the file.
576
- */
577
- function processJsonlFile(filePath, prompt) {
578
- const jsonLines = fs$2.readFileSync(filePath, "utf-8").split(/\r?\n/).filter((line) => line.length > 0);
579
- const containsMultiple = jsonLines.length > 1;
580
- return jsonLines.map((json) => ({
581
- raw: json,
582
- label: containsMultiple ? prompt.label ? `${prompt.label}: ${json}` : `${filePath}: ${json}` : prompt.label || `${filePath}`,
583
- config: prompt.config
584
- }));
585
- }
586
- //#endregion
587
- //#region src/prompts/processors/markdown.ts
588
- function processMarkdownFile(filePath, prompt) {
589
- const content = fs.readFileSync(filePath, "utf8");
590
- return [{
591
- raw: content,
592
- label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`
593
- }];
594
- }
595
- //#endregion
596
- //#region src/prompts/processors/python.ts
597
- /**
598
- * Python prompt function. Runs a specific function from the python file.
599
- * @param promptPath - Path to the Python file.
600
- * @param functionName - Function name to execute.
601
- * @param context - Context for the prompt.
602
- * @returns The prompts
603
- */
604
- const pythonPromptFunction = async (filePath, functionName, context) => {
605
- invariant(context.provider?.id, "provider.id is required");
606
- return runPython(filePath, functionName, [{
607
- vars: context.vars,
608
- provider: {
609
- id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
610
- label: context.provider?.label
611
- },
612
- config: context.config ?? {}
613
- }]);
614
- };
615
- /**
616
- * Legacy Python prompt function. Runs the whole python file.
617
- * @param filePath - Path to the Python file.
618
- * @param context - Context for the prompt.
619
- * @returns The prompts
620
- */
621
- const pythonPromptFunctionLegacy = async (filePath, context) => {
622
- invariant(context?.provider?.id, "provider.id is required");
623
- const transformedContext = {
624
- vars: context.vars,
625
- provider: {
626
- id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
627
- label: context.provider?.label
628
- },
629
- config: context.config ?? {}
630
- };
631
- const options = {
632
- mode: "text",
633
- pythonPath: getEnvString("PROMPTFOO_PYTHON", "python"),
634
- args: [safeJsonStringify(transformedContext)]
635
- };
636
- logger.debug(`Executing python prompt script ${filePath}`);
637
- const results = (await PythonShell.run(filePath, options)).join("\n");
638
- logger.debug(`Python prompt script ${filePath} returned: ${results}`);
639
- return results;
640
- };
641
- /**
642
- * Processes a Python file to extract or execute a function as a prompt.
643
- * @param filePath - Path to the Python file.
644
- * @param prompt - The raw prompt data.
645
- * @param functionName - Optional function name to execute.
646
- * @returns Array of prompts extracted or executed from the file.
647
- */
648
- function processPythonFile(filePath, prompt, functionName) {
649
- const fileContent = fs$2.readFileSync(filePath, "utf-8");
650
- return [{
651
- raw: fileContent,
652
- label: prompt.label ?? (functionName ? `${filePath}:${functionName}` : `${filePath}: ${fileContent}`),
653
- function: functionName ? (context) => pythonPromptFunction(filePath, functionName, {
654
- ...context,
655
- config: prompt.config
656
- }) : (context) => pythonPromptFunctionLegacy(filePath, {
657
- ...context,
658
- config: prompt.config
659
- }),
660
- config: prompt.config
661
- }];
662
- }
663
- //#endregion
664
- //#region src/prompts/processors/string.ts
665
- /**
666
- * Processes a string as a literal prompt.
667
- * @param prompt - The raw prompt data.
668
- * @returns Array of prompts created from the string.
669
- */
670
- function processString(prompt) {
671
- invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
672
- return [{
673
- id: prompt.id,
674
- raw: prompt.raw,
675
- label: prompt.label ?? `${prompt.raw}`,
676
- config: prompt.config
677
- }];
678
- }
679
- //#endregion
680
- //#region src/prompts/processors/text.ts
681
- /**
682
- * Processes a text file to extract prompts, splitting by a delimiter.
683
- * @param filePath - Path to the text file.
684
- * @param prompt - The raw prompt data.
685
- * @returns Array of prompts extracted from the file.
686
- */
687
- function processTxtFile(filePath, { label }) {
688
- const lines = fs$2.readFileSync(filePath, "utf-8").split(/\r?\n/);
689
- const prompts = [];
690
- let buffer = [];
691
- const flush = () => {
692
- const raw = buffer.join("\n").trim();
693
- if (raw.length > 0) prompts.push({
694
- raw,
695
- label: label ? `${label}: ${filePath}: ${raw}` : `${filePath}: ${raw}`
696
- });
697
- buffer = [];
698
- };
699
- for (const line of lines) if (line.trim() === PROMPT_DELIMITER) flush();
700
- else buffer.push(line);
701
- flush();
702
- return prompts;
703
- }
704
- //#endregion
705
- //#region src/prompts/processors/yaml.ts
706
- /**
707
- * Processes a YAML file to extract prompts.
708
- * This function reads a YAML file, parses it, and maps each entry to a `Prompt` object.
709
- * Each prompt is labeled with the file path and the YAML content.
710
- * Any file:// references within the YAML content are recursively resolved.
706
+ });
707
+ };
708
+ /**
709
+ * Processes an executable file to generate prompts.
710
+ * The executable can be any script or binary that outputs prompt text to stdout.
711
+ * It receives the context as JSON in its first argument.
711
712
  *
712
- * @param filePath - The path to the YAML file.
713
- * @param prompt - The raw prompt data, used for labeling.
714
- * @returns An array of `Prompt` objects extracted from the YAML file.
715
- * @throws Will throw an error if the file cannot be read or parsed.
713
+ * @param filePath - Path to the executable file (can include arguments).
714
+ * @param prompt - The raw prompt data.
715
+ * @param functionName - Not used for executables, but kept for interface consistency.
716
+ * @returns Array of prompts generated from the executable.
716
717
  */
717
- function processYamlFile(filePath, prompt) {
718
- const fileContents = fs$2.readFileSync(filePath, "utf8");
719
- let maybeParsed = fileContents;
720
- try {
721
- const resolved = maybeLoadConfigFromExternalFile(yaml.load(fileContents));
722
- maybeParsed = JSON.stringify(resolved);
723
- } catch (e) {
724
- logger.debug(`Error parsing YAML file ${filePath}: ${e}`);
725
- }
718
+ async function processExecutableFile(filePath, prompt, _functionName) {
719
+ let rawContent = filePath;
720
+ const firstPart = parseScriptParts(filePath)[0];
721
+ if (firstPart) try {
722
+ const stats = await stat(firstPart);
723
+ if (stats.isFile() && stats.size < 1024 * 100) {
724
+ const content = await readFile(firstPart, "utf-8");
725
+ if (!/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(content.substring(0, 1e3))) rawContent = content;
726
+ }
727
+ } catch (_e) {}
728
+ const label = prompt.label ?? filePath;
726
729
  return [{
727
- raw: maybeParsed,
728
- label: prompt.label || `${filePath}: ${maybeParsed?.slice(0, 80)}`,
730
+ raw: rawContent,
731
+ label,
732
+ function: (context) => executablePromptFunction(filePath, {
733
+ ...context,
734
+ config: prompt.config
735
+ }),
729
736
  config: prompt.config
730
737
  }];
731
738
  }
732
739
  //#endregion
733
- //#region src/prompts/index.ts
740
+ //#region src/prompts/processors/javascript.ts
741
+ const transformContext = (context) => {
742
+ invariant(context.provider, "Provider is required");
743
+ return {
744
+ vars: context.vars,
745
+ provider: {
746
+ id: context.provider.id(),
747
+ label: context.provider.label
748
+ },
749
+ config: context.config ?? {}
750
+ };
751
+ };
734
752
  /**
735
- * Reads and maps provider prompts based on the configuration and parsed prompts.
736
- * @param config - The configuration object.
737
- * @param parsedPrompts - Array of parsed prompts.
738
- * @returns A map of provider IDs to their respective prompts.
753
+ * Processes a JavaScript file to import and execute a module function as a prompt.
754
+ * @param filePath - Path to the JavaScript file.
755
+ * @param functionName - Optional function name to execute.
756
+ * @returns Promise resolving to an array of prompts.
739
757
  */
740
- function readProviderPromptMap(config, parsedPrompts) {
741
- const ret = {};
742
- if (!config.providers) return ret;
743
- const allPrompts = [];
744
- for (const prompt of parsedPrompts) allPrompts.push(prompt.label);
745
- if (typeof config.providers === "string") return { [config.providers]: allPrompts };
746
- if (typeof config.providers === "function") return { "Custom function": allPrompts };
747
- for (const provider of config.providers) if (typeof provider === "object") if (provider.id) {
748
- const rawProvider = provider;
749
- invariant(rawProvider.id, "You must specify an `id` on the Provider when you override options.prompts");
750
- ret[rawProvider.id] = rawProvider.prompts || allPrompts;
751
- if (rawProvider.label) ret[rawProvider.label] = rawProvider.prompts || allPrompts;
752
- } else {
753
- const rawProvider = provider;
754
- const originalId = Object.keys(rawProvider)[0];
755
- const id = rawProvider[originalId].id || originalId;
756
- ret[id] = rawProvider[originalId].prompts || allPrompts;
757
- }
758
- return ret;
758
+ async function processJsFile(filePath, prompt, functionName) {
759
+ const promptFunction = await importModule(filePath, functionName);
760
+ return [{
761
+ raw: String(promptFunction),
762
+ label: prompt.label ? prompt.label : functionName ? `${filePath}:${functionName}` : filePath,
763
+ function: (context) => promptFunction(transformContext({
764
+ ...context,
765
+ config: prompt.config ?? {}
766
+ })),
767
+ config: prompt.config ?? {}
768
+ }];
759
769
  }
770
+ //#endregion
771
+ //#region src/prompts/processors/jinja.ts
760
772
  /**
761
- * Processes a raw prompt based on its content type and path.
773
+ * Processes a Jinja2 template file to extract prompts.
774
+ * Similar to markdown files, each Jinja2 file is treated as a single prompt.
775
+ *
776
+ * @param filePath - Path to the Jinja2 template file.
762
777
  * @param prompt - The raw prompt data.
763
- * @param basePath - Base path for file resolution.
764
- * @param maxRecursionDepth - Maximum recursion depth for globbing.
765
- * @returns Promise resolving to an array of processed prompts.
778
+ * @returns Array of one `Prompt` object.
766
779
  */
767
- async function processPrompt(prompt, basePath = "", maxRecursionDepth = 1) {
768
- invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
769
- if (prompt.function) return [prompt];
770
- if (prompt.raw.startsWith("exec:")) {
771
- const { filePath, functionName } = parsePathOrGlob(basePath, prompt.raw.substring(5));
772
- return await processExecutableFile(filePath, prompt, functionName);
773
- }
774
- if (!maybeFilePath(prompt.raw)) return processString(prompt);
775
- const { extension, functionName, isPathPattern, filePath } = parsePathOrGlob(basePath, prompt.raw);
776
- if (isPathPattern && maxRecursionDepth > 0) {
777
- const globbedPath = globSync(filePath.replace(/\\/g, "/"), { windowsPathsNoEscape: true });
778
- logger.debug(`Expanded prompt ${prompt.raw} to ${filePath} and then to ${JSON.stringify(globbedPath)}`);
779
- const prompts = [];
780
- for (const globbedFilePath of globbedPath) {
781
- const processedPrompts = await processPrompt({ raw: functionName ? `${globbedFilePath}:${functionName}` : globbedFilePath }, basePath, maxRecursionDepth - 1);
782
- prompts.push(...processedPrompts);
783
- }
784
- if (prompts.length === 0) {
785
- logger.debug(`Attempted to load file at "${prompt.raw}", but no file found. Using raw string.`);
786
- prompts.push(...processString(prompt));
787
- }
788
- return prompts;
789
- }
790
- if (extension === ".csv") return processCsvPrompts(filePath, prompt);
791
- if (extension === ".j2") return processJinjaFile(filePath, prompt);
792
- if (extension === ".json") return processJsonFile(filePath, prompt);
793
- if (extension === ".jsonl") return processJsonlFile(filePath, prompt);
794
- if (extension && isJavascriptFile(extension)) return processJsFile(filePath, prompt, functionName);
795
- if (extension === ".md") return processMarkdownFile(filePath, prompt);
796
- if (extension === ".py") return processPythonFile(filePath, prompt, functionName);
797
- if (extension === ".txt") return processTxtFile(filePath, prompt);
798
- if (extension && [".yml", ".yaml"].includes(extension)) return processYamlFile(filePath, prompt);
799
- if (extension && [
800
- ".sh",
801
- ".bash",
802
- ".exe",
803
- ".bat",
804
- ".cmd",
805
- ".ps1",
806
- ".rb",
807
- ".pl"
808
- ].includes(extension)) return await processExecutableFile(filePath, prompt, functionName);
809
- try {
810
- const stats = await stat(filePath);
811
- if (stats.isFile() && (stats.mode & 73) !== 0) return await processExecutableFile(filePath, prompt, functionName);
812
- } catch (_e) {}
813
- return [];
780
+ function processJinjaFile(filePath, prompt) {
781
+ const content = fs$2.readFileSync(filePath, "utf8");
782
+ return [{
783
+ raw: content,
784
+ label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`,
785
+ config: prompt.config
786
+ }];
814
787
  }
788
+ //#endregion
789
+ //#region src/prompts/processors/json.ts
815
790
  /**
816
- * Reads and processes prompts from a specified path or glob pattern.
817
- * @param promptPathOrGlobs - The path or glob pattern.
818
- * @param basePath - Base path for file resolution.
819
- * @returns Promise resolving to an array of processed prompts.
820
- */
821
- async function readPrompts(promptPathOrGlobs, basePath = "") {
822
- logger.debug(`Reading prompts from ${JSON.stringify(promptPathOrGlobs)}`);
823
- const promptPartials = normalizeInput(promptPathOrGlobs);
824
- const prompts = [];
825
- for (const prompt of promptPartials) {
826
- const promptBatch = await processPrompt(prompt, basePath);
827
- if (promptBatch.length === 0) throw new Error(`There are no prompts in ${JSON.stringify(prompt.raw)}`);
828
- prompts.push(...promptBatch);
829
- }
830
- return prompts;
831
- }
832
- async function processPrompts(prompts) {
833
- return (await Promise.all(prompts.map(async (promptInput) => {
834
- if (typeof promptInput === "function") return {
835
- raw: promptInput.toString(),
836
- label: promptInput?.name ?? promptInput.toString(),
837
- function: promptInput
838
- };
839
- else if (typeof promptInput === "string") return readPrompts(promptInput);
840
- try {
841
- return PromptSchema.parse(promptInput);
842
- } catch (error) {
843
- logger.warn(`Prompt input is not a valid prompt schema: ${error}\nFalling back to serialized JSON as raw prompt.`);
844
- return {
845
- raw: JSON.stringify(promptInput),
846
- label: JSON.stringify(promptInput)
847
- };
848
- }
849
- }))).flat();
850
- }
851
- const GEVAL_PROMPT_STEPS = `
852
- Given evaluation criteria that outline how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to check whether the criteria are satisfied by the text.
853
-
854
- **EVALUATION CRITERIA**
855
- {{criteria}}
856
-
857
- **OUTPUT FORMAT**
858
- IMPORTANT:
859
- - Return output ONLY as a minified JSON object (no code fences).
860
- - The JSON object must contain a single key, "steps", whose value is a list of strings.
861
- - Each string must represent one evaluation step.
862
- - Do NOT include any explanations, commentary, extra text, or additional formatting.
863
-
864
- Format:
865
- {"steps": <list_of_strings>}
866
-
867
- Example:
868
- {"steps":["<Evaluation Step 1>","<Evaluation Step 2>","<Evaluation Step 3>","<Evaluation Step 4>"]}
869
-
870
- Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
871
- JSON:
872
- `;
873
- const GEVAL_PROMPT_EVALUATE = `
874
- You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
875
- Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
876
-
877
- **Evaluation Criteria**
878
- {{criteria}}
879
-
880
- **Evaluation Steps**
881
- - {{steps}}
882
- Given the evaluation steps, return a JSON with two keys:
883
- 1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the condition described by the Evaluation Criteria is fully and clearly observed in the Reply according to the Evaluation Steps, and 0 indicates that it is not observed at all;
884
- 2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
885
-
886
- **Prompt**
887
- {{input}}
888
-
889
- **Reply**
890
- {{output}}
891
-
892
- **OUTPUT FORMAT**
893
- IMPORTANT:
894
- - Return output ONLY as a minified JSON object (no code fences).
895
- - The JSON object must contain exactly two keys: "score" and "reason".
896
- - No additional words, explanations, or formatting are needed.
897
- - Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
898
-
899
- Example JSON:
900
- {"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
901
-
902
- Here is the final evaluation in the required minified JSON format:
903
- JSON:
904
- `;
791
+ * Processes a JSON file to extract prompts.
792
+ * This function reads a JSON file and converts it to a `Prompt` object.
793
+ * Any file:// references within the JSON content are recursively resolved.
794
+ *
795
+ * @param filePath - The path to the JSON file.
796
+ * @param prompt - The raw prompt data, used for labeling.
797
+ * @returns An array of one `Prompt` object.
798
+ * @throws Will throw an error if the file cannot be read.
799
+ */
800
+ function processJsonFile(filePath, prompt) {
801
+ const fileContents = fs$2.readFileSync(filePath, "utf8");
802
+ let processedContents = fileContents;
803
+ try {
804
+ const resolved = maybeLoadConfigFromExternalFile(JSON.parse(fileContents));
805
+ processedContents = JSON.stringify(resolved);
806
+ } catch {}
807
+ return [{
808
+ raw: processedContents,
809
+ label: prompt.label || `${filePath}: ${processedContents}`,
810
+ config: prompt.config
811
+ }];
812
+ }
905
813
  //#endregion
906
- //#region src/providers/anthropic/defaults.ts
907
- const DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-6";
814
+ //#region src/prompts/processors/jsonl.ts
908
815
  /**
909
- * Helper function to create a lazy-loaded provider. This allows the .env file to be
910
- * loaded first before the provider is initialized.
911
- * @param factory Factory function that creates provider instance with optional env
912
- * @returns Object with getter that lazily initializes the provider with the latest env
816
+ * Processes a JSONL file to extract prompts.
817
+ * @param filePath - Path to the JSONL file.
818
+ * @param prompt - The raw prompt data.
819
+ * @returns Array of prompts extracted from the file.
913
820
  */
914
- function createLazyProvider(factory) {
915
- const instances = /* @__PURE__ */ new Map();
916
- return { getInstance(env) {
917
- const cacheKey = env ? JSON.stringify(env) : "";
918
- if (!instances.has(cacheKey)) instances.set(cacheKey, factory(env));
919
- return instances.get(cacheKey);
920
- } };
821
+ function processJsonlFile(filePath, prompt) {
822
+ const jsonLines = fs$2.readFileSync(filePath, "utf-8").split(/\r?\n/).filter((line) => line.length > 0);
823
+ const containsMultiple = jsonLines.length > 1;
824
+ return jsonLines.map((json) => ({
825
+ raw: json,
826
+ label: containsMultiple ? prompt.label ? `${prompt.label}: ${json}` : `${filePath}: ${json}` : prompt.label || `${filePath}`,
827
+ config: prompt.config
828
+ }));
921
829
  }
922
- var AnthropicLlmRubricProvider = class extends AnthropicMessagesProvider {
923
- constructor(modelName, options = {}) {
924
- const { env, config = {} } = options;
925
- super(modelName, {
926
- env,
927
- config: {
928
- tool_choice: {
929
- type: "tool",
930
- name: "grade_output"
931
- },
932
- tools: [{
933
- name: "grade_output",
934
- description: "Grade the given output based on specific criteria",
935
- input_schema: {
936
- type: "object",
937
- properties: {
938
- pass: {
939
- type: "boolean",
940
- description: "Whether the output passes the criteria"
941
- },
942
- score: {
943
- type: "number",
944
- description: "The score assigned to the output"
945
- },
946
- reason: {
947
- type: "string",
948
- description: "The reason for the given grade"
949
- }
950
- },
951
- required: [
952
- "pass",
953
- "score",
954
- "reason"
955
- ]
956
- }
957
- }],
958
- ...config
959
- }
960
- });
961
- }
962
- async callApi(prompt) {
963
- const result = await super.callApi(prompt);
964
- if (typeof result.output !== "string") return { error: `Anthropic LLM rubric grader - malformed non-string output\n\n${JSON.stringify(result.output)}` };
965
- try {
966
- return { output: JSON.parse(result.output).input };
967
- } catch (err) {
968
- return { error: `Anthropic LLM rubric grader - invalid JSON: ${err}\n\n${result.output}` };
969
- }
970
- }
830
+ //#endregion
831
+ //#region src/prompts/processors/markdown.ts
832
+ function processMarkdownFile(filePath, prompt) {
833
+ const content = fs.readFileSync(filePath, "utf8");
834
+ return [{
835
+ raw: content,
836
+ label: prompt.label || `${filePath}: ${content.slice(0, 50)}...`
837
+ }];
838
+ }
839
+ //#endregion
840
+ //#region src/prompts/processors/python.ts
841
+ /**
842
+ * Python prompt function. Runs a specific function from the python file.
843
+ * @param promptPath - Path to the Python file.
844
+ * @param functionName - Function name to execute.
845
+ * @param context - Context for the prompt.
846
+ * @returns The prompts
847
+ */
848
+ const pythonPromptFunction = async (filePath, functionName, context) => {
849
+ invariant(context.provider?.id, "provider.id is required");
850
+ return runPython(filePath, functionName, [{
851
+ vars: context.vars,
852
+ provider: {
853
+ id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
854
+ label: context.provider?.label
855
+ },
856
+ config: context.config ?? {}
857
+ }]);
971
858
  };
972
- const gradingProviderFactory = createLazyProvider((env) => new AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
973
- const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(DEFAULT_ANTHROPIC_MODEL, { env }));
974
- const webSearchProviderFactory = createLazyProvider((env) => new AnthropicMessagesProvider(DEFAULT_ANTHROPIC_MODEL, {
975
- env,
976
- config: { tools: [{
977
- type: "web_search_20250305",
978
- name: "web_search",
979
- max_uses: 5
980
- }] }
981
- }));
982
859
  /**
983
- * Gets all default Anthropic providers with the given environment overrides
984
- * @param env - Optional environment overrides
985
- * @returns Anthropic provider implementations for various functions
860
+ * Legacy Python prompt function. Runs the whole python file.
861
+ * @param filePath - Path to the Python file.
862
+ * @param context - Context for the prompt.
863
+ * @returns The prompts
986
864
  */
987
- function getAnthropicProviders(env) {
988
- const gradingProvider = gradingProviderFactory.getInstance(env);
989
- return {
990
- gradingJsonProvider: gradingProvider,
991
- gradingProvider,
992
- llmRubricProvider: llmRubricProviderFactory.getInstance(env),
993
- suggestionsProvider: gradingProvider,
994
- synthesizeProvider: gradingProvider,
995
- webSearchProvider: webSearchProviderFactory.getInstance(env)
865
+ const pythonPromptFunctionLegacy = async (filePath, context) => {
866
+ invariant(context?.provider?.id, "provider.id is required");
867
+ const transformedContext = {
868
+ vars: context.vars,
869
+ provider: {
870
+ id: typeof context.provider?.id === "function" ? context.provider?.id() : context.provider?.id,
871
+ label: context.provider?.label
872
+ },
873
+ config: context.config ?? {}
996
874
  };
997
- }
998
- //#endregion
999
- //#region src/providers/github/defaults.ts
1000
- const githubConfig = {
1001
- apiBaseUrl: "https://models.github.ai/inference",
1002
- apiKeyEnvar: "GITHUB_TOKEN"
875
+ const options = {
876
+ mode: "text",
877
+ pythonPath: getEnvString("PROMPTFOO_PYTHON", "python"),
878
+ args: [safeJsonStringify(transformedContext)]
879
+ };
880
+ logger.debug(`Executing python prompt script ${filePath}`);
881
+ const results = (await PythonShell.run(filePath, options)).join("\n");
882
+ logger.debug(`Python prompt script ${filePath} returned: ${results}`);
883
+ return results;
1003
884
  };
1004
- const DefaultGitHubGradingProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
1005
- const DefaultGitHubGradingJsonProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: {
1006
- ...githubConfig,
1007
- response_format: { type: "json_object" }
1008
- } });
1009
- const DefaultGitHubSuggestionsProvider = new OpenAiChatCompletionProvider("openai/gpt-5", { config: githubConfig });
1010
- new OpenAiChatCompletionProvider("openai/gpt-5-nano", { config: githubConfig });
1011
- new OpenAiChatCompletionProvider("openai/gpt-5-mini", { config: githubConfig });
1012
- new OpenAiChatCompletionProvider("openai/o4-mini", { config: githubConfig });
885
+ /**
886
+ * Processes a Python file to extract or execute a function as a prompt.
887
+ * @param filePath - Path to the Python file.
888
+ * @param prompt - The raw prompt data.
889
+ * @param functionName - Optional function name to execute.
890
+ * @returns Array of prompts extracted or executed from the file.
891
+ */
892
+ function processPythonFile(filePath, prompt, functionName) {
893
+ const fileContent = fs$2.readFileSync(filePath, "utf-8");
894
+ return [{
895
+ raw: fileContent,
896
+ label: prompt.label ?? (functionName ? `${filePath}:${functionName}` : `${filePath}: ${fileContent}`),
897
+ function: functionName ? (context) => pythonPromptFunction(filePath, functionName, {
898
+ ...context,
899
+ config: prompt.config
900
+ }) : (context) => pythonPromptFunctionLegacy(filePath, {
901
+ ...context,
902
+ config: prompt.config
903
+ }),
904
+ config: prompt.config
905
+ }];
906
+ }
1013
907
  //#endregion
1014
- //#region src/providers/mistral/defaults.ts
1015
- const DefaultEmbeddingProvider$1 = new MistralEmbeddingProvider();
1016
- const DefaultGradingProvider$1 = new MistralChatCompletionProvider("mistral-large-latest");
1017
- const DefaultGradingJsonProvider$1 = new MistralChatCompletionProvider("mistral-large-latest", { config: { response_format: { type: "json_object" } } });
1018
- const DefaultSuggestionsProvider$1 = new MistralChatCompletionProvider("mistral-large-latest");
1019
- const DefaultSynthesizeProvider = new MistralChatCompletionProvider("mistral-large-latest");
908
+ //#region src/prompts/processors/string.ts
909
+ /**
910
+ * Processes a string as a literal prompt.
911
+ * @param prompt - The raw prompt data.
912
+ * @returns Array of prompts created from the string.
913
+ */
914
+ function processString(prompt) {
915
+ invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
916
+ return [{
917
+ id: prompt.id,
918
+ raw: prompt.raw,
919
+ label: prompt.label ?? `${prompt.raw}`,
920
+ config: prompt.config
921
+ }];
922
+ }
1020
923
  //#endregion
1021
- //#region src/providers/openai/defaults.ts
1022
- const DEFAULT_OPENAI_GRADING_MODEL = "gpt-5.4-2026-03-05";
1023
- const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider("text-embedding-3-large");
1024
- const DefaultGradingProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
1025
- const DefaultGradingJsonProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL, { config: { response_format: { type: "json_object" } } });
1026
- const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider(DEFAULT_OPENAI_GRADING_MODEL);
1027
- const DefaultModerationProvider = new OpenAiModerationProvider("omni-moderation-latest");
1028
- const DefaultWebSearchProvider = new OpenAiResponsesProvider("gpt-5.4-2026-03-05", { config: { tools: [{ type: "web_search_preview" }] } });
1029
- async function getDefaultProviderPreferences(env) {
1030
- const hasAnthropicCredentials = Boolean(getEnvString("ANTHROPIC_API_KEY") || env?.ANTHROPIC_API_KEY);
1031
- const hasOpenAiCredentials = Boolean(getEnvString("OPENAI_API_KEY") || env?.OPENAI_API_KEY);
1032
- const hasGitHubCredentials = Boolean(getEnvString("GITHUB_TOKEN") || env?.GITHUB_TOKEN);
1033
- const hasGoogleAiStudioCredentials = Boolean(getEnvString("GEMINI_API_KEY") || env?.GEMINI_API_KEY || getEnvString("GOOGLE_API_KEY") || env?.GOOGLE_API_KEY || getEnvString("PALM_API_KEY") || env?.PALM_API_KEY);
1034
- const hasAzureApiKey = getEnvString("AZURE_OPENAI_API_KEY") || env?.AZURE_OPENAI_API_KEY || getEnvString("AZURE_API_KEY") || env?.AZURE_API_KEY;
1035
- const hasAzureClientCreds = (getEnvString("AZURE_CLIENT_ID") || env?.AZURE_CLIENT_ID) && (getEnvString("AZURE_CLIENT_SECRET") || env?.AZURE_CLIENT_SECRET) && (getEnvString("AZURE_TENANT_ID") || env?.AZURE_TENANT_ID);
1036
- const hasMistralCredentials = Boolean(getEnvString("MISTRAL_API_KEY") || env?.MISTRAL_API_KEY);
1037
- const preferAzure = Boolean(!hasOpenAiCredentials && (hasAzureApiKey || hasAzureClientCreds) && (getEnvString("AZURE_DEPLOYMENT_NAME") || env?.AZURE_DEPLOYMENT_NAME) && (getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME));
1038
- const preferAnthropic = !hasOpenAiCredentials && hasAnthropicCredentials;
1039
- const shouldUseFallbackDefaults = !preferAzure && !hasOpenAiCredentials && !hasAnthropicCredentials && !hasGoogleAiStudioCredentials;
1040
- const useGoogleVertexDefaults = shouldUseFallbackDefaults ? await hasGoogleDefaultCredentials() : false;
1041
- const useNonGoogleFallbackDefaults = shouldUseFallbackDefaults && !useGoogleVertexDefaults;
1042
- const hasCodexCredentials = useNonGoogleFallbackDefaults && !hasMistralCredentials && hasCodexDefaultCredentials(env);
1043
- return {
1044
- preferAnthropic,
1045
- preferAzure,
1046
- useCodexDefaults: hasCodexCredentials,
1047
- useGitHubDefaults: useNonGoogleFallbackDefaults && !hasMistralCredentials && !hasCodexCredentials && hasGitHubCredentials,
1048
- useGoogleAiStudioDefaults: !hasOpenAiCredentials && !hasAnthropicCredentials && hasGoogleAiStudioCredentials,
1049
- useGoogleVertexDefaults,
1050
- useMistralDefaults: useNonGoogleFallbackDefaults && hasMistralCredentials
924
+ //#region src/prompts/processors/text.ts
925
+ /**
926
+ * Processes a text file to extract prompts, splitting by a delimiter.
927
+ * @param filePath - Path to the text file.
928
+ * @param prompt - The raw prompt data.
929
+ * @returns Array of prompts extracted from the file.
930
+ */
931
+ function processTxtFile(filePath, { label }) {
932
+ const lines = fs$2.readFileSync(filePath, "utf-8").split(/\r?\n/);
933
+ const prompts = [];
934
+ let buffer = [];
935
+ const flush = () => {
936
+ const raw = buffer.join("\n").trim();
937
+ if (raw.length > 0) prompts.push({
938
+ raw,
939
+ label: label ? `${label}: ${filePath}: ${raw}` : `${filePath}: ${raw}`
940
+ });
941
+ buffer = [];
1051
942
  };
943
+ for (const line of lines) if (line.trim() === PROMPT_DELIMITER) flush();
944
+ else buffer.push(line);
945
+ flush();
946
+ return prompts;
1052
947
  }
1053
- async function getDefaultProviders(env) {
1054
- const { preferAnthropic, preferAzure, useCodexDefaults, useGitHubDefaults, useGoogleAiStudioDefaults, useGoogleVertexDefaults, useMistralDefaults } = await getDefaultProviderPreferences(env);
1055
- let providers;
1056
- if (preferAzure) {
1057
- logger.debug("Using Azure OpenAI default providers");
1058
- const deploymentName = getEnvString("AZURE_OPENAI_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_DEPLOYMENT_NAME;
1059
- if (!deploymentName) throw new Error("AZURE_OPENAI_DEPLOYMENT_NAME must be set when using Azure OpenAI");
1060
- const embeddingDeploymentName = getEnvString("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME") || env?.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME || deploymentName;
1061
- const azureProvider = new AzureChatCompletionProvider(deploymentName, { env });
1062
- providers = {
1063
- embeddingProvider: new AzureEmbeddingProvider(embeddingDeploymentName, { env }),
1064
- gradingJsonProvider: azureProvider,
1065
- gradingProvider: azureProvider,
1066
- moderationProvider: DefaultModerationProvider,
1067
- suggestionsProvider: azureProvider,
1068
- synthesizeProvider: azureProvider
1069
- };
1070
- } else if (preferAnthropic) {
1071
- logger.debug("Using Anthropic default providers");
1072
- const anthropicProviders = getAnthropicProviders(env);
1073
- providers = {
1074
- embeddingProvider: DefaultEmbeddingProvider,
1075
- gradingJsonProvider: anthropicProviders.gradingJsonProvider,
1076
- gradingProvider: anthropicProviders.gradingProvider,
1077
- llmRubricProvider: anthropicProviders.llmRubricProvider,
1078
- moderationProvider: DefaultModerationProvider,
1079
- suggestionsProvider: anthropicProviders.suggestionsProvider,
1080
- synthesizeProvider: anthropicProviders.synthesizeProvider,
1081
- webSearchProvider: anthropicProviders.webSearchProvider
1082
- };
1083
- } else if (useGoogleAiStudioDefaults) {
1084
- logger.debug("Using Google AI Studio default providers");
1085
- providers = {
1086
- embeddingProvider: DefaultEmbeddingProvider$2,
1087
- gradingJsonProvider: DefaultGradingJsonProvider$2,
1088
- gradingProvider: DefaultGradingProvider$2,
1089
- llmRubricProvider: DefaultLlmRubricProvider,
1090
- moderationProvider: DefaultModerationProvider,
1091
- suggestionsProvider: DefaultSuggestionsProvider$2,
1092
- synthesizeProvider: DefaultSynthesizeProvider$1
1093
- };
1094
- } else if (useGoogleVertexDefaults) {
1095
- logger.debug("Using Google Vertex default providers");
1096
- providers = {
1097
- embeddingProvider: DefaultEmbeddingProvider$2,
1098
- gradingJsonProvider: DefaultGradingProvider$3,
1099
- gradingProvider: DefaultGradingProvider$3,
1100
- moderationProvider: DefaultModerationProvider,
1101
- suggestionsProvider: DefaultGradingProvider$3,
1102
- synthesizeProvider: DefaultGradingProvider$3
1103
- };
1104
- } else if (useMistralDefaults) {
1105
- logger.debug("Using Mistral default providers");
1106
- providers = {
1107
- embeddingProvider: DefaultEmbeddingProvider$1,
1108
- gradingJsonProvider: DefaultGradingJsonProvider$1,
1109
- gradingProvider: DefaultGradingProvider$1,
1110
- moderationProvider: DefaultModerationProvider,
1111
- suggestionsProvider: DefaultSuggestionsProvider$1,
1112
- synthesizeProvider: DefaultSynthesizeProvider
1113
- };
1114
- } else if (useCodexDefaults) {
1115
- logger.debug("Using Codex SDK default providers from ChatGPT/Codex credentials");
1116
- providers = {
1117
- embeddingProvider: DefaultEmbeddingProvider,
1118
- moderationProvider: DefaultModerationProvider,
1119
- ...getCodexDefaultProviders(env)
1120
- };
1121
- } else if (useGitHubDefaults) {
1122
- logger.debug("Using GitHub Models default providers");
1123
- providers = {
1124
- embeddingProvider: DefaultEmbeddingProvider,
1125
- gradingJsonProvider: DefaultGitHubGradingJsonProvider,
1126
- gradingProvider: DefaultGitHubGradingProvider,
1127
- moderationProvider: DefaultModerationProvider,
1128
- suggestionsProvider: DefaultGitHubSuggestionsProvider,
1129
- synthesizeProvider: DefaultGitHubGradingJsonProvider
1130
- };
1131
- } else {
1132
- logger.debug("Using OpenAI default providers");
1133
- providers = {
1134
- embeddingProvider: DefaultEmbeddingProvider,
1135
- gradingJsonProvider: DefaultGradingJsonProvider,
1136
- gradingProvider: DefaultGradingProvider,
1137
- moderationProvider: DefaultModerationProvider,
1138
- suggestionsProvider: DefaultSuggestionsProvider,
1139
- synthesizeProvider: DefaultGradingJsonProvider,
1140
- webSearchProvider: DefaultWebSearchProvider
1141
- };
948
+ //#endregion
949
+ //#region src/prompts/processors/yaml.ts
950
+ /**
951
+ * Processes a YAML file to extract prompts.
952
+ * This function reads a YAML file, parses it, and maps each entry to a `Prompt` object.
953
+ * Each prompt is labeled with the file path and the YAML content.
954
+ * Any file:// references within the YAML content are recursively resolved.
955
+ *
956
+ * @param filePath - The path to the YAML file.
957
+ * @param prompt - The raw prompt data, used for labeling.
958
+ * @returns An array of `Prompt` objects extracted from the YAML file.
959
+ * @throws Will throw an error if the file cannot be read or parsed.
960
+ */
961
+ function processYamlFile(filePath, prompt) {
962
+ const fileContents = fs$2.readFileSync(filePath, "utf8");
963
+ let maybeParsed = fileContents;
964
+ try {
965
+ const resolved = maybeLoadConfigFromExternalFile(yaml.load(fileContents));
966
+ maybeParsed = JSON.stringify(resolved);
967
+ } catch (e) {
968
+ logger.debug(`Error parsing YAML file ${filePath}: ${e}`);
1142
969
  }
1143
- if (getEnvString("AZURE_CONTENT_SAFETY_ENDPOINT") || env?.AZURE_CONTENT_SAFETY_ENDPOINT) providers.moderationProvider = new AzureModerationProvider("text-content-safety", { env });
1144
- return providers;
970
+ return [{
971
+ raw: maybeParsed,
972
+ label: prompt.label || `${filePath}: ${maybeParsed?.slice(0, 80)}`,
973
+ config: prompt.config
974
+ }];
1145
975
  }
1146
976
  //#endregion
1147
- //#region src/providers/webSearchUtils.ts
1148
- function hasTool(provider, predicate) {
1149
- return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
1150
- }
1151
- function getProviderId(provider) {
1152
- if (typeof provider.id !== "function") return null;
1153
- try {
1154
- return provider.id();
1155
- } catch (err) {
1156
- logger.debug(`Failed to read provider id: ${err}`);
1157
- return null;
1158
- }
1159
- }
1160
- function isOpenAiResponsesProvider(provider, id) {
1161
- return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
1162
- }
977
+ //#region src/external/prompts/ragas.ts
978
+ const ANSWER_RELEVANCY_GENERATE = `Generate question for the given answer.
979
+ Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
980
+ Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
981
+
982
+ Answer:{{answer}}
983
+ Question:`;
984
+ const CONTEXT_RECALL = `Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
985
+ Think in steps and reason before coming to conclusion.
986
+
987
+ context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
988
+ answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
989
+ classification
990
+ 1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. The date of birth of Einstein is mentioned clearly in the context. So [Attributed]
991
+ 2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
992
+ 3. He published 4 papers in 1905. There is no mention about papers he wrote in given the context. So [Not Attributed]
993
+ 4. Einstein moved to Switzerland in 1895. There is not supporting evidence for this in the given the context. So [Not Attributed]
994
+
995
+ context:{{context}}
996
+ answer:{{groundTruth}}
997
+ classification:
998
+ `;
999
+ const CONTEXT_RECALL_ATTRIBUTED_TOKEN = "[Attributed]";
1000
+ const CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN = "[Not Attributed]";
1001
+ const CONTEXT_RELEVANCE = `Please extract relevant sentences from the provided context that is absolutely required answer the following query. If no relevant sentences are found, or if you believe the query cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.
1002
+
1003
+ query: {{query}}
1004
+ context: {{context}}
1005
+ candidate sentences:
1006
+ `;
1007
+ const CONTEXT_RELEVANCE_BAD = "Insufficient Information";
1008
+ const CONTEXT_FAITHFULNESS_LONGFORM = `Given a question and answer, create one or more statements from each sentence in the given answer.
1009
+ question: Who was Albert Einstein and what is he best known for?
1010
+ answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
1011
+ statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity.
1012
+ question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
1013
+ answer: alcohol
1014
+ statements:\nCadmium Chloride is slightly soluble in alcohol.
1015
+ question: Were Shahul and Jithin of the same nationality?
1016
+ answer: They were from different countries.
1017
+ statements:\nShahul and Jithin were from different countries.
1018
+ question:{{question}}
1019
+ answer: {{answer}}
1020
+ statements:\n`;
1021
+ const CONTEXT_FAITHFULNESS_NLI_STATEMENTS = `Prompt: Natural language inference
1022
+ Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format.
1023
+
1024
+ Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.
1025
+ statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n
1026
+ Answer:
1027
+ 1. John is majoring in Biology.
1028
+ Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No.
1029
+ 2. John is taking a course on Artificial Intelligence.
1030
+ Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No.
1031
+ 3. John is a dedicated student.
1032
+ Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes.
1033
+ 4. John has a part-time job.
1034
+ Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No.
1035
+ 5. John is interested in computer programming.
1036
+ Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes.
1037
+ Final verdict for each statement in order: No. No. Yes. No. Yes.
1038
+ context:\n{{context}}
1039
+ statements:\n{{statements|join("\\n")}}
1040
+ Answer:
1041
+ `;
1042
+ //#endregion
1043
+ //#region src/prompts/grading.ts
1044
+ const DEFAULT_GRADING_PROMPT = JSON.stringify([{
1045
+ role: "system",
1046
+ content: dedent`You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
1047
+
1048
+ Examples:
1049
+
1050
+ <Output>Hello world</Output>
1051
+ <Rubric>Content contains a greeting</Rubric>
1052
+ {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
1053
+
1054
+ <Output>Avast ye swabs, repel the invaders!</Output>
1055
+ <Rubric>Does not speak like a pirate</Rubric>
1056
+ {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}`
1057
+ }, {
1058
+ role: "user",
1059
+ content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
1060
+ }]);
1061
+ const PROMPTFOO_FACTUALITY_PROMPT = JSON.stringify([{
1062
+ role: "system",
1063
+ content: dedent`
1064
+ You are a precise factuality evaluator that compares a submitted answer to an expert answer.
1065
+
1066
+ Your task is to analyze the factual content while ignoring differences in style, grammar, or punctuation.
1067
+ You must categorize the submission into one of these options:
1068
+
1069
+ (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
1070
+ (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
1071
+ (C) The submitted answer contains all the same details as the expert answer.
1072
+ (D) There is a disagreement between the submitted answer and the expert answer.
1073
+ (E) The answers differ, but these differences don't matter from the perspective of factuality.
1074
+
1075
+ Respond ONLY with a JSON object in this format:
1076
+ {
1077
+ "category": "[LETTER]",
1078
+ "reason": "[DETAILED EXPLANATION]"
1079
+ }
1080
+
1081
+ - The "category" must be a single letter A, B, C, D, or E.
1082
+ - Provide a clear, detailed explanation in the "reason" field.
1083
+ - Your response must be valid JSON with no additional text.`
1084
+ }, {
1085
+ role: "user",
1086
+ content: dedent`
1087
+ I need you to compare these answers:
1088
+
1089
+ <question>
1090
+ {{input}}
1091
+ </question>
1092
+
1093
+ <expert_answer>
1094
+ {{ideal}}
1095
+ </expert_answer>
1096
+
1097
+ <submitted_answer>
1098
+ {{completion}}
1099
+ </submitted_answer>
1100
+
1101
+ Please analyze the factual relationship between these answers according to the categories you've been given.`
1102
+ }]);
1103
+ const OPENAI_CLOSED_QA_PROMPT = JSON.stringify([{
1104
+ role: "system",
1105
+ content: `You are assessing a submitted answer on a given task based on a criterion. Here is the data:
1106
+ [BEGIN DATA]
1107
+ ***
1108
+ [Task]: {{input}}
1109
+ ***
1110
+ [Submission]: {{completion}}
1111
+ ***
1112
+ [Criterion]: {{criteria}}
1113
+ ***
1114
+ [END DATA]
1115
+ Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
1116
+
1117
+ Reasoning:`
1118
+ }]);
1119
+ const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
1120
+ role: "system",
1121
+ content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
1122
+
1123
+ Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
1124
+
1125
+ Your output is going to be copied directly into the program. It should contain the prompt ONLY`
1126
+ };
1127
+ const SELECT_BEST_PROMPT = JSON.stringify([{
1128
+ role: "system",
1129
+ content: `You are comparing multiple pieces of text to see which best fits the following criteria: {{criteria}}
1130
+
1131
+ Here are the pieces of text:
1132
+
1133
+ {% for output in outputs %}
1134
+ <Text index="{{ loop.index0 }}">
1135
+ {{ output }}
1136
+ </Text>
1137
+ {% endfor %}
1138
+
1139
+ Output the index of the text that best fits the criteria. You must output a single integer.`
1140
+ }]);
1141
+ const DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([{
1142
+ role: "system",
1143
+ content: dedent`You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
1144
+
1145
+ You MUST search the web when:
1146
+ - The rubric asks about current information (prices, weather, news, etc.)
1147
+ - Facts need to be verified against recent data
1148
+ - The rubric references time-sensitive information
1149
+
1150
+ Examples:
1151
+
1152
+ <Output>The current CEO of Microsoft is Satya Nadella</Output>
1153
+ <Rubric>Contains accurate information about Microsoft's leadership</Rubric>
1154
+ {"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
1155
+
1156
+ <Output>Bitcoin is trading at $45,000</Output>
1157
+ <Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
1158
+ {"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`
1159
+ }, {
1160
+ role: "user",
1161
+ content: "<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>"
1162
+ }]);
1163
+ const TRAJECTORY_GOAL_SUCCESS_PROMPT = JSON.stringify([{
1164
+ role: "system",
1165
+ content: dedent`You are grading whether an AI agent successfully completed a goal based on its final output and a summarized execution trajectory. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
1166
+
1167
+ Judge end-to-end success, not stylistic perfection.
1168
+ Use the trajectory as evidence for what the agent actually did.
1169
+ Give partial credit when the agent made progress but did not fully achieve the goal.
1170
+
1171
+ Examples:
1172
+
1173
+ <Goal>Find the order status and tell the user whether it has shipped</Goal>
1174
+ <Trajectory>{"stepCount":2,"steps":[{"index":1,"type":"tool","name":"search_orders"},{"index":2,"type":"message","name":"agent response"}]}</Trajectory>
1175
+ <Output>Your order shipped yesterday and should arrive on Tuesday.</Output>
1176
+ {"reason":"The agent used the order lookup tool and gave the user the shipping status, so the goal was achieved.","pass":true,"score":1.0}
1177
+
1178
+ <Goal>Find the order status and tell the user whether it has shipped</Goal>
1179
+ <Trajectory>{"stepCount":1,"steps":[{"index":1,"type":"message","name":"agent response"}]}</Trajectory>
1180
+ <Output>I cannot check your order right now.</Output>
1181
+ {"reason":"The agent did not show evidence of checking the order and did not provide the requested status.","pass":false,"score":0.0}`
1182
+ }, {
1183
+ role: "user",
1184
+ content: dedent`<Goal>
1185
+ {{ goal }}
1186
+ </Goal>
1187
+ <Trajectory>
1188
+ {{ trajectory }}
1189
+ </Trajectory>
1190
+ <Output>
1191
+ {{ output }}
1192
+ </Output>`
1193
+ }]);
1194
+ //#endregion
1195
+ //#region src/prompts/index.ts
1163
1196
  /**
1164
- * Check if a provider has web search capabilities
1165
- * @param provider The provider to check
1166
- * @returns true if the provider supports web search
1197
+ * Reads and maps provider prompts based on the configuration and parsed prompts.
1198
+ * @param config - The configuration object.
1199
+ * @param parsedPrompts - Array of parsed prompts.
1200
+ * @returns A map of provider IDs to their respective prompts.
1167
1201
  */
1168
- function hasWebSearchCapability(provider) {
1169
- if (!provider) return false;
1170
- const id = getProviderId(provider);
1171
- if (!id) return false;
1172
- if (id.includes("perplexity")) return true;
1173
- if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
1174
- if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
1175
- if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
1176
- if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
1177
- if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
1178
- return false;
1202
+ function readProviderPromptMap(config, parsedPrompts) {
1203
+ const ret = {};
1204
+ if (!config.providers) return ret;
1205
+ const allPrompts = [];
1206
+ for (const prompt of parsedPrompts) allPrompts.push(prompt.label);
1207
+ if (typeof config.providers === "string") return { [config.providers]: allPrompts };
1208
+ if (typeof config.providers === "function") return { "Custom function": allPrompts };
1209
+ for (const provider of config.providers) if (typeof provider === "object") if (provider.id) {
1210
+ const rawProvider = provider;
1211
+ invariant(rawProvider.id, "You must specify an `id` on the Provider when you override options.prompts");
1212
+ ret[rawProvider.id] = rawProvider.prompts || allPrompts;
1213
+ if (rawProvider.label) ret[rawProvider.label] = rawProvider.prompts || allPrompts;
1214
+ } else {
1215
+ const rawProvider = provider;
1216
+ const originalId = Object.keys(rawProvider)[0];
1217
+ const id = rawProvider[originalId].id || originalId;
1218
+ ret[id] = rawProvider[originalId].prompts || allPrompts;
1219
+ }
1220
+ return ret;
1179
1221
  }
1180
1222
  /**
1181
- * Load a provider with web search capabilities.
1182
- * Tries multiple providers in order of preference until one succeeds.
1183
- * Uses the latest and most capable models from each provider with specific checkpoint IDs.
1184
- *
1185
- * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
1186
- * @returns A provider with web search capabilities or null
1223
+ * Processes a raw prompt based on its content type and path.
1224
+ * @param prompt - The raw prompt data.
1225
+ * @param basePath - Base path for file resolution.
1226
+ * @param maxRecursionDepth - Maximum recursion depth for globbing.
1227
+ * @returns Promise resolving to an array of processed prompts.
1187
1228
  */
1188
- async function loadWebSearchProvider(preferAnthropic = false) {
1189
- const loadAnthropicWebSearch = async () => {
1190
- try {
1191
- return await loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
1192
- type: "web_search_20250305",
1193
- name: "web_search",
1194
- max_uses: 5
1195
- }] } } });
1196
- } catch (err) {
1197
- logger.debug(`Failed to load Anthropic web search provider: ${err}`);
1198
- return null;
1199
- }
1200
- };
1201
- const loadOpenAIWebSearch = async () => {
1202
- try {
1203
- return await loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
1204
- } catch (err) {
1205
- logger.debug(`Failed to load OpenAI web search provider: ${err}`);
1206
- return null;
1207
- }
1208
- };
1209
- const loadPerplexity = async () => {
1210
- try {
1211
- return await loadApiProvider("perplexity:sonar-pro");
1212
- } catch (err) {
1213
- logger.debug(`Failed to load Perplexity provider: ${err}`);
1214
- return null;
1215
- }
1216
- };
1217
- const loadGoogleWebSearch = async () => {
1218
- try {
1219
- return await loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
1220
- } catch (err) {
1221
- logger.debug(`Failed to load Google web search provider: ${err}`);
1222
- return null;
1229
+ async function processPrompt(prompt, basePath = "", maxRecursionDepth = 1) {
1230
+ invariant(typeof prompt.raw === "string", `prompt.raw must be a string, but got ${JSON.stringify(prompt.raw)}`);
1231
+ if (prompt.function) return [prompt];
1232
+ if (prompt.raw.startsWith("exec:")) {
1233
+ const { filePath, functionName } = parsePathOrGlob(basePath, prompt.raw.substring(5));
1234
+ return await processExecutableFile(filePath, prompt, functionName);
1235
+ }
1236
+ if (!maybeFilePath(prompt.raw)) return processString(prompt);
1237
+ const { extension, functionName, isPathPattern, filePath } = parsePathOrGlob(basePath, prompt.raw);
1238
+ if (isPathPattern && maxRecursionDepth > 0) {
1239
+ const globbedPath = globSync(filePath.replace(/\\/g, "/"), { windowsPathsNoEscape: true });
1240
+ logger.debug(`Expanded prompt ${prompt.raw} to ${filePath} and then to ${JSON.stringify(globbedPath)}`);
1241
+ const prompts = [];
1242
+ for (const globbedFilePath of globbedPath) {
1243
+ const processedPrompts = await processPrompt({ raw: functionName ? `${globbedFilePath}:${functionName}` : globbedFilePath }, basePath, maxRecursionDepth - 1);
1244
+ prompts.push(...processedPrompts);
1223
1245
  }
1224
- };
1225
- const loadVertexWebSearch = async () => {
1226
- try {
1227
- return await loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
1228
- } catch (err) {
1229
- logger.debug(`Failed to load Vertex web search provider: ${err}`);
1230
- return null;
1246
+ if (prompts.length === 0) {
1247
+ logger.debug(`Attempted to load file at "${prompt.raw}", but no file found. Using raw string.`);
1248
+ prompts.push(...processString(prompt));
1231
1249
  }
1232
- };
1233
- const loadXaiWebSearch = async () => {
1250
+ return prompts;
1251
+ }
1252
+ if (extension === ".csv") return processCsvPrompts(filePath, prompt);
1253
+ if (extension === ".j2") return processJinjaFile(filePath, prompt);
1254
+ if (extension === ".json") return processJsonFile(filePath, prompt);
1255
+ if (extension === ".jsonl") return processJsonlFile(filePath, prompt);
1256
+ if (extension && isJavascriptFile(extension)) return processJsFile(filePath, prompt, functionName);
1257
+ if (extension === ".md") return processMarkdownFile(filePath, prompt);
1258
+ if (extension === ".py") return processPythonFile(filePath, prompt, functionName);
1259
+ if (extension === ".txt") return processTxtFile(filePath, prompt);
1260
+ if (extension && [".yml", ".yaml"].includes(extension)) return processYamlFile(filePath, prompt);
1261
+ if (extension && [
1262
+ ".sh",
1263
+ ".bash",
1264
+ ".exe",
1265
+ ".bat",
1266
+ ".cmd",
1267
+ ".ps1",
1268
+ ".rb",
1269
+ ".pl"
1270
+ ].includes(extension)) return await processExecutableFile(filePath, prompt, functionName);
1271
+ try {
1272
+ const stats = await stat(filePath);
1273
+ if (stats.isFile() && (stats.mode & 73) !== 0) return await processExecutableFile(filePath, prompt, functionName);
1274
+ } catch (_e) {}
1275
+ return [];
1276
+ }
1277
+ /**
1278
+ * Reads and processes prompts from a specified path or glob pattern.
1279
+ * @param promptPathOrGlobs - The path or glob pattern.
1280
+ * @param basePath - Base path for file resolution.
1281
+ * @returns Promise resolving to an array of processed prompts.
1282
+ */
1283
+ async function readPrompts(promptPathOrGlobs, basePath = "") {
1284
+ logger.debug(`Reading prompts from ${JSON.stringify(promptPathOrGlobs)}`);
1285
+ const promptPartials = normalizeInput(promptPathOrGlobs);
1286
+ const prompts = [];
1287
+ for (const prompt of promptPartials) {
1288
+ const promptBatch = await processPrompt(prompt, basePath);
1289
+ if (promptBatch.length === 0) throw new Error(`There are no prompts in ${JSON.stringify(prompt.raw)}`);
1290
+ prompts.push(...promptBatch);
1291
+ }
1292
+ return prompts;
1293
+ }
1294
+ async function processPrompts(prompts) {
1295
+ return (await Promise.all(prompts.map(async (promptInput) => {
1296
+ if (typeof promptInput === "function") return {
1297
+ raw: promptInput.toString(),
1298
+ label: promptInput?.name ?? promptInput.toString(),
1299
+ function: promptInput
1300
+ };
1301
+ else if (typeof promptInput === "string") return readPrompts(promptInput);
1234
1302
  try {
1235
- return await loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
1236
- } catch (err) {
1237
- logger.debug(`Failed to load xAI web search provider: ${err}`);
1238
- return null;
1239
- }
1240
- };
1241
- const providers = preferAnthropic ? [
1242
- loadAnthropicWebSearch,
1243
- loadOpenAIWebSearch,
1244
- loadPerplexity,
1245
- loadGoogleWebSearch,
1246
- loadVertexWebSearch,
1247
- loadXaiWebSearch
1248
- ] : [
1249
- loadOpenAIWebSearch,
1250
- loadAnthropicWebSearch,
1251
- loadPerplexity,
1252
- loadGoogleWebSearch,
1253
- loadVertexWebSearch,
1254
- loadXaiWebSearch
1255
- ];
1256
- for (const getProvider of providers) {
1257
- const provider = await getProvider();
1258
- if (provider && hasWebSearchCapability(provider)) {
1259
- logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
1260
- return provider;
1303
+ return PromptSchema.parse(promptInput);
1304
+ } catch (error) {
1305
+ logger.warn(`Prompt input is not a valid prompt schema: ${error}\nFalling back to serialized JSON as raw prompt.`);
1306
+ return {
1307
+ raw: JSON.stringify(promptInput),
1308
+ label: JSON.stringify(promptInput)
1309
+ };
1261
1310
  }
1262
- if (provider) logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
1263
- }
1264
- return null;
1311
+ }))).flat();
1265
1312
  }
1313
+ const GEVAL_PROMPT_STEPS = `
1314
+ Given evaluation criteria that outline how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to check whether the criteria are satisfied by the text.
1315
+
1316
+ **EVALUATION CRITERIA**
1317
+ {{criteria}}
1318
+
1319
+ **OUTPUT FORMAT**
1320
+ IMPORTANT:
1321
+ - Return output ONLY as a minified JSON object (no code fences).
1322
+ - The JSON object must contain a single key, "steps", whose value is a list of strings.
1323
+ - Each string must represent one evaluation step.
1324
+ - Do NOT include any explanations, commentary, extra text, or additional formatting.
1325
+
1326
+ Format:
1327
+ {"steps": <list_of_strings>}
1328
+
1329
+ Example:
1330
+ {"steps":["<Evaluation Step 1>","<Evaluation Step 2>","<Evaluation Step 3>","<Evaluation Step 4>"]}
1331
+
1332
+ Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
1333
+ JSON:
1334
+ `;
1335
+ const GEVAL_PROMPT_EVALUATE = `
1336
+ You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
1337
+ Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
1338
+
1339
+ **Evaluation Criteria**
1340
+ {{criteria}}
1341
+
1342
+ **Evaluation Steps**
1343
+ - {{steps}}
1344
+ Given the evaluation steps, return a JSON with two keys:
1345
+ 1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the condition described by the Evaluation Criteria is fully and clearly observed in the Reply according to the Evaluation Steps, and 0 indicates that it is not observed at all;
1346
+ 2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
1347
+
1348
+ **Prompt**
1349
+ {{input}}
1350
+
1351
+ **Reply**
1352
+ {{output}}
1353
+
1354
+ **OUTPUT FORMAT**
1355
+ IMPORTANT:
1356
+ - Return output ONLY as a minified JSON object (no code fences).
1357
+ - The JSON object must contain exactly two keys: "score" and "reason".
1358
+ - No additional words, explanations, or formatting are needed.
1359
+ - Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
1360
+
1361
+ Example JSON:
1362
+ {"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
1363
+
1364
+ Here is the final evaluation in the required minified JSON format:
1365
+ JSON:
1366
+ `;
1266
1367
  //#endregion
1267
1368
  //#region src/remoteGrading.ts
1268
1369
  async function doRemoteGrading(payload) {
@@ -1325,23 +1426,7 @@ async function doRemoteScoringWithPi(payload, passThreshold = .5) {
1325
1426
  }
1326
1427
  }
1327
1428
  //#endregion
1328
- //#region src/scheduler/providerCallExecutionContext.ts
1329
- const providerCallExecutionContext = new AsyncLocalStorage();
1330
- function getProviderCallExecutionContext() {
1331
- return providerCallExecutionContext.getStore();
1332
- }
1333
- function withProviderCallExecutionContext(context, fn) {
1334
- return providerCallExecutionContext.run(context, fn);
1335
- }
1336
- //#endregion
1337
- //#region src/matchers.ts
1338
- var LlmRubricProviderError = class extends Error {
1339
- constructor(message) {
1340
- super(message);
1341
- this.name = "LlmRubricProviderError";
1342
- }
1343
- };
1344
- const nunjucks = getNunjucksEngine(void 0, false, true);
1429
+ //#region src/matchers/llmGrading.ts
1345
1430
  const FACTUALITY_CATEGORY_DESCRIPTIONS = {
1346
1431
  A: "The submitted answer is a subset of the expert answer and is fully consistent with it.",
1347
1432
  B: "The submitted answer is a superset of the expert answer and is fully consistent with it.",
@@ -1349,409 +1434,73 @@ const FACTUALITY_CATEGORY_DESCRIPTIONS = {
1349
1434
  D: "There is a disagreement between the submitted answer and the expert answer.",
1350
1435
  E: "The answers differ, but these differences don't matter from the perspective of factuality."
1351
1436
  };
1352
- function cosineSimilarity(vecA, vecB) {
1353
- if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
1354
- return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0) / (Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0)) * Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0)));
1355
- }
1356
- function dotProduct(vecA, vecB) {
1357
- if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
1358
- return vecA.reduce((acc, val, idx) => acc + val * vecB[idx], 0);
1359
- }
1360
- function euclideanDistance(vecA, vecB) {
1361
- if (vecA.length !== vecB.length) throw new Error("Vectors must be of equal length");
1362
- const sumSquaredDiff = vecA.reduce((acc, val, idx) => {
1363
- const diff = val - vecB[idx];
1364
- return acc + diff * diff;
1365
- }, 0);
1366
- return Math.sqrt(sumSquaredDiff);
1367
- }
1368
- /**
1369
- * Helper to call provider with consistent context propagation pattern.
1370
- * Spreads the optional context and merges with prompt label and vars.
1371
- *
1372
- * IMPORTANT: Spread order matters - context is spread first, then prompt/vars
1373
- * override. This ensures originalProvider from context is preserved while
1374
- * allowing this call to specify its own prompt metadata.
1375
- */
1376
- function callProviderWithContext(provider, prompt, label, vars, context) {
1377
- const callApiContext = {
1378
- ...context,
1379
- prompt: {
1380
- raw: prompt,
1381
- label
1382
- },
1383
- vars
1384
- };
1385
- const executionContext = getProviderCallExecutionContext();
1386
- const callApiOptions = executionContext?.abortSignal ? { abortSignal: executionContext.abortSignal } : void 0;
1387
- const callApi = () => callApiOptions ? provider.callApi(prompt, callApiContext, callApiOptions) : provider.callApi(prompt, callApiContext);
1388
- const executeCall = () => {
1389
- if (executionContext?.rateLimitRegistry && !isRateLimitWrapped(provider)) return executionContext.rateLimitRegistry.execute(provider, callApi, createProviderRateLimitOptions());
1390
- return callApi();
1391
- };
1392
- if (executionContext?.providerCallQueue) return executionContext.providerCallQueue.enqueue(provider.id(), executeCall);
1393
- return executeCall();
1394
- }
1395
- async function loadFromProviderOptions(provider) {
1396
- invariant(typeof provider === "object", `Provider must be an object, but received a ${typeof provider}: ${provider}`);
1397
- invariant(!Array.isArray(provider), `Provider must be an object, but received an array: ${JSON.stringify(provider)}`);
1398
- invariant(provider.id, "Provider supplied to assertion must have an id");
1399
- return loadApiProvider(provider.id, {
1400
- options: provider,
1401
- basePath: state.basePath
1402
- });
1403
- }
1404
- function isSimulatedUserProviderConfig(provider) {
1405
- if (typeof provider === "string") return provider === "promptfoo:simulated-user";
1406
- if (!provider || typeof provider !== "object" || Array.isArray(provider)) return false;
1407
- if (typeof provider.id === "function") return provider.id() === "promptfoo:simulated-user";
1408
- const providerId = provider.id;
1409
- if (typeof providerId === "string") return providerId === "promptfoo:simulated-user";
1410
- return Object.values(provider).some((providerTypeConfig) => isSimulatedUserProviderConfig(providerTypeConfig));
1411
- }
1412
- async function getGradingProvider(type, provider, defaultProvider) {
1413
- let finalProvider;
1414
- if (typeof provider === "string") finalProvider = await loadApiProvider(provider, { basePath: state.basePath });
1415
- else if (typeof provider === "object" && typeof provider.id === "function") finalProvider = provider;
1416
- else if (typeof provider === "object") {
1417
- const typeValue = provider[type];
1418
- if (typeValue) finalProvider = await getGradingProvider(type, typeValue, defaultProvider);
1419
- else if (provider.id) finalProvider = await loadFromProviderOptions(provider);
1420
- else if (Array.isArray(provider)) throw new Error(`Provider must be an object or string, but received an array.\n\nCheck that the provider ${JSON.stringify(provider[0], null, 2)} is not nested in an array.`);
1421
- else throw new Error(`Invalid provider definition for output type '${type}': ${JSON.stringify(provider, null, 2)}`);
1422
- } else {
1423
- const defaultTest = state.config?.defaultTest;
1424
- const defaultTestObj = typeof defaultTest === "object" ? defaultTest : null;
1425
- const cfg = [
1426
- defaultTestObj?.provider || void 0,
1427
- defaultTestObj?.options?.provider?.text || void 0,
1428
- defaultTestObj?.options?.provider || void 0
1429
- ].find((candidateProvider) => {
1430
- if (!candidateProvider) return false;
1431
- if (isSimulatedUserProviderConfig(candidateProvider)) {
1432
- logger.debug("[Grading] Skipping promptfoo:simulated-user as an implicit grader fallback");
1433
- return false;
1434
- }
1435
- return true;
1436
- });
1437
- if (cfg) {
1438
- finalProvider = await getGradingProvider(type, cfg, defaultProvider);
1439
- if (finalProvider) logger.debug(`[Grading] Using provider from defaultTest fallback: ${finalProvider.id()}`);
1440
- } else finalProvider = defaultProvider;
1441
- }
1442
- return finalProvider;
1443
- }
1444
- async function getAndCheckProvider(type, provider, defaultProvider, checkName) {
1445
- const matchedProvider = await getGradingProvider(type, provider, defaultProvider);
1446
- if (!matchedProvider) if (defaultProvider) {
1447
- logger.warn(`No provider of type ${type} found for '${checkName}', falling back to default`);
1448
- return defaultProvider;
1449
- } else throw new Error(`No provider of type ${type} found for '${checkName}'`);
1450
- let isValidProviderType = true;
1451
- if (type === "embedding") isValidProviderType = "callEmbeddingApi" in matchedProvider || "callSimilarityApi" in matchedProvider;
1452
- else if (type === "classification") isValidProviderType = "callClassificationApi" in matchedProvider;
1453
- else if (type === "moderation") isValidProviderType = "callModerationApi" in matchedProvider;
1454
- if (!isValidProviderType) if (defaultProvider) {
1455
- logger.warn(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}', falling back to default`);
1456
- return defaultProvider;
1457
- } else throw new Error(`Provider ${matchedProvider.id()} is not a valid ${type} provider for '${checkName}'`);
1458
- return matchedProvider;
1459
- }
1460
- function fail(reason, tokensUsed) {
1461
- return {
1462
- pass: false,
1463
- reason,
1464
- score: 0,
1465
- tokensUsed: {
1466
- total: tokensUsed?.total || 0,
1467
- prompt: tokensUsed?.prompt || 0,
1468
- completion: tokensUsed?.completion || 0,
1469
- cached: tokensUsed?.cached || 0,
1470
- numRequests: tokensUsed?.numRequests || 0,
1471
- completionDetails: tokensUsed?.completionDetails
1472
- }
1473
- };
1474
- }
1475
- function normalizeTokenUsage(tokensUsed) {
1476
- return {
1477
- total: tokensUsed?.total || 0,
1478
- prompt: tokensUsed?.prompt || 0,
1479
- completion: tokensUsed?.completion || 0,
1480
- cached: tokensUsed?.cached || 0,
1481
- numRequests: tokensUsed?.numRequests || 0,
1482
- completionDetails: tokensUsed?.completionDetails || {
1483
- reasoning: 0,
1484
- acceptedPrediction: 0,
1485
- rejectedPrediction: 0
1486
- }
1487
- };
1488
- }
1489
- function createMatcherTokenUsage() {
1490
- return {
1491
- total: 0,
1492
- prompt: 0,
1493
- completion: 0,
1494
- cached: 0,
1495
- numRequests: 0,
1496
- completionDetails: {
1497
- reasoning: 0,
1498
- acceptedPrediction: 0,
1499
- rejectedPrediction: 0
1500
- }
1501
- };
1502
- }
1503
- function copySimilarityTokenUsage(tokensUsed, response) {
1504
- Object.assign(tokensUsed, normalizeTokenUsage(response.tokenUsage));
1505
- }
1506
- function combineEmbeddingTokenUsage(expectedEmbedding, outputEmbedding) {
1507
- const expectedTokens = normalizeTokenUsage(expectedEmbedding.tokenUsage);
1508
- const outputTokens = normalizeTokenUsage(outputEmbedding.tokenUsage);
1509
- return {
1510
- total: (expectedTokens.total ?? 0) + (outputTokens.total ?? 0),
1511
- prompt: (expectedTokens.prompt ?? 0) + (outputTokens.prompt ?? 0),
1512
- completion: (expectedTokens.completion ?? 0) + (outputTokens.completion ?? 0),
1513
- cached: (expectedTokens.cached ?? 0) + (outputTokens.cached ?? 0),
1514
- numRequests: (expectedTokens.numRequests ?? 0) + (outputTokens.numRequests ?? 0),
1515
- completionDetails: {
1516
- reasoning: (expectedTokens.completionDetails?.reasoning || 0) + (outputTokens.completionDetails?.reasoning || 0),
1517
- acceptedPrediction: (expectedTokens.completionDetails?.acceptedPrediction || 0) + (outputTokens.completionDetails?.acceptedPrediction || 0),
1518
- rejectedPrediction: (expectedTokens.completionDetails?.rejectedPrediction || 0) + (outputTokens.completionDetails?.rejectedPrediction || 0)
1519
- }
1520
- };
1521
- }
1522
- function accumulateTokens(target, update) {
1523
- accumulateTokenUsage(target, update);
1524
- }
1525
- async function maybeRemoteSimilarityGrading({ expected, output, threshold, inverse }) {
1526
- if (!state.config?.redteam || !shouldGenerateRemote({ requireEmbeddingProvider: true })) return;
1527
- try {
1528
- return await doRemoteGrading({
1529
- task: "similar",
1530
- expected,
1531
- output,
1532
- threshold,
1533
- inverse
1534
- });
1535
- } catch (error) {
1536
- return fail(`Could not perform remote grading: ${error}`);
1537
- }
1538
- }
1539
- async function computeNativeSimilarity(provider, expected, output, metric) {
1540
- if ("callSimilarityApi" in provider) return computeSimilarityFromNativeProvider(provider, expected, output, metric);
1541
- if ("callEmbeddingApi" in provider) return computeSimilarityFromEmbeddings(provider, expected, output, metric);
1542
- throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
1543
- }
1544
- async function computeSimilarityFromNativeProvider(provider, expected, output, metric) {
1545
- const tokensUsed = createMatcherTokenUsage();
1546
- if (metric !== "cosine") return { failure: fail(`Provider ${provider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed) };
1547
- const similarityResp = await provider.callSimilarityApi(expected, output);
1548
- copySimilarityTokenUsage(tokensUsed, similarityResp);
1549
- if (similarityResp.error) return { failure: fail(similarityResp.error, tokensUsed) };
1550
- if (similarityResp.similarity == null) return { failure: fail("Unknown error fetching similarity", tokensUsed) };
1551
- return {
1552
- similarity: similarityResp.similarity,
1553
- tokensUsed
1554
- };
1555
- }
1556
- async function computeSimilarityFromEmbeddings(provider, expected, output, metric) {
1557
- const expectedEmbedding = await provider.callEmbeddingApi(expected);
1558
- const outputEmbedding = await provider.callEmbeddingApi(output);
1559
- const tokensUsed = combineEmbeddingTokenUsage(expectedEmbedding, outputEmbedding);
1560
- if (expectedEmbedding.error || outputEmbedding.error) return { failure: fail(expectedEmbedding.error || outputEmbedding.error || "Unknown error fetching embeddings", tokensUsed) };
1561
- if (!expectedEmbedding.embedding || !outputEmbedding.embedding) return { failure: fail("Embedding not found", tokensUsed) };
1562
- return {
1563
- similarity: computeSimilarityMetric(expectedEmbedding.embedding, outputEmbedding.embedding, metric),
1564
- tokensUsed
1565
- };
1566
- }
1567
- function computeSimilarityMetric(expectedEmbedding, outputEmbedding, metric) {
1568
- switch (metric) {
1569
- case "cosine": return cosineSimilarity(expectedEmbedding, outputEmbedding);
1570
- case "dot_product": return dotProduct(expectedEmbedding, outputEmbedding);
1571
- case "euclidean": return euclideanDistance(expectedEmbedding, outputEmbedding);
1572
- default: throw new Error(`Unsupported metric: ${metric}`);
1573
- }
1574
- }
1575
- function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
1576
- return metric === "euclidean" ? buildDistanceResult(similarity, threshold, inverse, tokensUsed) : buildSimilarityScoreResult(similarity, threshold, inverse, tokensUsed);
1577
- }
1578
- function buildDistanceResult(distance, threshold, inverse, tokensUsed) {
1579
- const pass = inverse ? distance >= threshold - Number.EPSILON : distance <= threshold + Number.EPSILON;
1580
- const normalizedScore = 1 / (1 + distance);
1581
- const belowThresholdReason = `Distance ${distance.toFixed(2)} is less than or equal to threshold ${threshold}`;
1582
- const aboveThresholdReason = `Distance ${distance.toFixed(2)} is greater than threshold ${threshold}`;
1583
- return {
1584
- pass,
1585
- score: inverse ? 1 - normalizedScore : normalizedScore,
1586
- reason: pass === inverse ? aboveThresholdReason : belowThresholdReason,
1587
- tokensUsed
1588
- };
1589
- }
1590
- function buildSimilarityScoreResult(similarity, threshold, inverse, tokensUsed) {
1591
- const pass = inverse ? similarity <= threshold + Number.EPSILON : similarity >= threshold - Number.EPSILON;
1592
- const greaterThanReason = `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}`;
1593
- const lessThanReason = `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
1594
- return {
1595
- pass,
1596
- score: inverse ? 1 - similarity : similarity,
1597
- reason: pass === inverse ? lessThanReason : greaterThanReason,
1598
- tokensUsed
1599
- };
1600
- }
1601
- async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
1602
- const remoteResult = await maybeRemoteSimilarityGrading({
1603
- expected,
1604
- output,
1605
- threshold,
1606
- inverse
1607
- });
1608
- if (remoteResult) return remoteResult;
1609
- const defaults = await getDefaultProviders();
1610
- const computation = await computeNativeSimilarity(await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "similarity check"), expected, output, metric);
1611
- return "failure" in computation ? computation.failure : buildSimilarityResult(computation.similarity, threshold, inverse, metric, computation.tokensUsed);
1612
- }
1613
- /**
1614
- *
1615
- * @param expected Expected classification. If undefined, matches any classification.
1616
- * @param output Text to classify.
1617
- * @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
1618
- * @param grading
1619
- * @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
1620
- */
1621
- async function matchesClassification(expected, output, threshold, grading) {
1622
- const resp = await (await getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
1623
- if (!resp.classification) return fail(resp.error || "Unknown error fetching classification");
1624
- let score;
1625
- if (expected === void 0) score = Math.max(...Object.values(resp.classification));
1626
- else score = resp.classification[expected] || 0;
1627
- if (score >= threshold - Number.EPSILON) {
1628
- const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
1629
- return {
1630
- pass: true,
1631
- score,
1632
- reason
1633
- };
1634
- }
1437
+ function getFactualityScoreLookup(grading) {
1635
1438
  return {
1636
- pass: false,
1637
- score,
1638
- reason: `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
1639
- };
1640
- }
1641
- async function loadRubricPrompt(rubricPrompt, defaultPrompt) {
1642
- if (!rubricPrompt || typeof rubricPrompt === "object" && Object.keys(rubricPrompt).length === 0) return defaultPrompt;
1643
- if (typeof rubricPrompt === "string" && rubricPrompt.startsWith("file://")) {
1644
- const basePath = state.basePath || "";
1645
- const { filePath, functionName } = parseFileUrl(getNunjucksEngineForFilePath().renderString(rubricPrompt, {}));
1646
- const resolvedPath = path.resolve(basePath, filePath);
1647
- if (isJavascriptFile(filePath)) rubricPrompt = await loadFromJavaScriptFile(resolvedPath, functionName, []);
1648
- else {
1649
- if (!fs$2.existsSync(resolvedPath)) throw new Error(`File does not exist: ${resolvedPath}`);
1650
- rubricPrompt = fs$2.readFileSync(resolvedPath, "utf8");
1651
- }
1652
- } else rubricPrompt = maybeLoadFromExternalFile(rubricPrompt);
1653
- if (typeof rubricPrompt === "object") rubricPrompt = JSON.stringify(rubricPrompt);
1654
- invariant(typeof rubricPrompt === "string", "rubricPrompt must be a string");
1655
- return rubricPrompt;
1656
- }
1657
- function tryParse(content) {
1658
- try {
1659
- return JSON.parse(content);
1660
- } catch {}
1661
- return content;
1662
- }
1663
- function splitIntoSentences(text) {
1664
- return text.split("\n").filter((sentence) => sentence.trim() !== "");
1665
- }
1666
- function processContextForTemplating(context, enableObjectAccess) {
1667
- if (enableObjectAccess) return context;
1668
- return Object.fromEntries(Object.entries(context).map(([key, value]) => {
1669
- if (value && typeof value === "object") {
1670
- if (Array.isArray(value)) return [key, value.map((item) => item && typeof item === "object" ? JSON.stringify(item) : item)];
1671
- return [key, JSON.stringify(value)];
1672
- }
1673
- return [key, value];
1674
- }));
1439
+ A: grading.factuality?.subset ?? 1,
1440
+ B: grading.factuality?.superset ?? 1,
1441
+ C: grading.factuality?.agree ?? 1,
1442
+ D: grading.factuality?.disagree ?? 0,
1443
+ E: grading.factuality?.differButFactual ?? 1
1444
+ };
1675
1445
  }
1676
- async function renderLlmRubricPrompt(rubricPrompt, context) {
1677
- const processedContext = processContextForTemplating(context, getEnvBool("PROMPTFOO_DISABLE_OBJECT_STRINGIFY", false));
1678
- try {
1679
- const parsed = JSON.parse(rubricPrompt, (_k, v) => typeof v === "string" ? nunjucks.renderString(v, processedContext) : v);
1680
- return JSON.stringify(parsed);
1681
- } catch {}
1682
- return nunjucks.renderString(rubricPrompt, processedContext);
1446
+ function buildFactualityResult(option, reason, grading, resp) {
1447
+ const scoreLookup = getFactualityScoreLookup(grading);
1448
+ const passing = Object.keys(scoreLookup).filter((key) => scoreLookup[key] > 0);
1449
+ const failing = Object.keys(scoreLookup).filter((key) => scoreLookup[key] === 0);
1450
+ const pass = passing.includes(option) && !failing.includes(option);
1451
+ return {
1452
+ pass,
1453
+ score: scoreLookup[option] ?? (pass ? 1 : 0),
1454
+ reason,
1455
+ tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage)
1456
+ };
1683
1457
  }
1684
- function parseJsonGradingResponse(label, resp) {
1685
- let jsonObjects = [];
1686
- if (typeof resp.output === "string") try {
1687
- jsonObjects = extractJsonObjects(resp.output);
1688
- if (jsonObjects.length === 0) return { failure: fail(`Could not extract JSON from ${label} response`, resp.tokenUsage) };
1458
+ function parseFactualityJsonResponse(responseText) {
1459
+ try {
1460
+ const jsonData = extractFirstJsonObject(responseText);
1461
+ if (!jsonData?.category || typeof jsonData.category !== "string") return;
1462
+ const option = jsonData.category.trim().toUpperCase();
1463
+ if (!/^[A-E]$/.test(option)) throw new Error(`Invalid category value: ${option}`);
1464
+ return {
1465
+ option,
1466
+ reason: jsonData.reason?.trim() || `Category ${option}: ${FACTUALITY_CATEGORY_DESCRIPTIONS[option]}`
1467
+ };
1689
1468
  } catch (err) {
1690
- return { failure: fail(`${label} produced malformed response: ${err}\n\n${resp.output}`, resp.tokenUsage) };
1469
+ const error = err;
1470
+ if (error.message.startsWith("Invalid category value:")) throw error;
1471
+ logger.debug(`JSON parsing failed: ${error.message}`);
1472
+ return;
1691
1473
  }
1692
- else if (typeof resp.output === "object" && resp.output !== null && !Array.isArray(resp.output)) jsonObjects = [resp.output];
1693
- else return { failure: fail(`${label} produced malformed response - output must be string or object. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
1694
- const parsed = jsonObjects[0];
1695
- if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return { failure: fail(`${label} produced malformed response. We were not able to parse the response as JSON. Output: ${JSON.stringify(resp.output)}`, resp.tokenUsage) };
1696
- return { parsed };
1697
1474
  }
1698
- async function runJsonGradingPrompt({ assertion, checkName, defaultPrompt, grading, label, providerCallContext, throwOnError, vars }) {
1699
- const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading.rubricPrompt, defaultPrompt), vars);
1700
- const defaultProviders = await getDefaultProviders();
1701
- const defaultProvider = defaultProviders.llmRubricProvider || defaultProviders.gradingJsonProvider;
1702
- const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, defaultProvider, checkName), prompt, label, vars, providerCallContext);
1703
- if (resp.error || !resp.output) {
1704
- if (throwOnError) throw new Error(resp.error || "No output");
1705
- return fail(resp.error || "No output", resp.tokenUsage);
1706
- }
1707
- const { parsed, failure } = parseJsonGradingResponse(label, resp);
1708
- if (!parsed) return failure;
1709
- let pass = parsed.pass ?? true;
1710
- if (typeof pass !== "boolean") pass = /^(true|yes|pass|y)$/i.test(String(pass));
1711
- let score = parsed.score;
1712
- if (typeof score !== "number") score = Number.isFinite(Number(score)) ? Number(score) : Number(pass);
1713
- const threshold = typeof assertion?.threshold === "string" ? Number(assertion.threshold) : assertion?.threshold;
1714
- if (typeof threshold === "number" && Number.isFinite(threshold)) pass = pass && score >= threshold;
1715
- const reason = parsed.reason || (pass ? "Grading passed" : `Score ${score} below threshold ${threshold}`);
1716
- let responseMetadata = {};
1717
- if (resp.metadata && typeof resp.metadata === "object" && !Array.isArray(resp.metadata)) {
1718
- const serializedMetadata = safeJsonStringify(resp.metadata);
1719
- responseMetadata = serializedMetadata ? JSON.parse(serializedMetadata) : {};
1720
- }
1475
+ function parseLegacyFactualityResponse(responseText) {
1476
+ const answerMatch = responseText.match(/\s*\(?([a-eA-E])\)/);
1477
+ if (!answerMatch) throw new Error(`Factuality checker output did not match expected format: ${responseText}`);
1478
+ const option = answerMatch[1].toUpperCase();
1479
+ const reasonMatch = responseText.match(/\)\s*(.*)/s);
1721
1480
  return {
1722
- assertion,
1723
- pass,
1724
- score,
1725
- reason,
1726
- tokensUsed: {
1727
- total: resp.tokenUsage?.total || 0,
1728
- prompt: resp.tokenUsage?.prompt || 0,
1729
- completion: resp.tokenUsage?.completion || 0,
1730
- cached: resp.tokenUsage?.cached || 0,
1731
- numRequests: resp.tokenUsage?.numRequests || 0,
1732
- completionDetails: parsed.tokensUsed?.completionDetails || {
1733
- reasoning: 0,
1734
- acceptedPrediction: 0,
1735
- rejectedPrediction: 0
1736
- }
1737
- },
1738
- metadata: {
1739
- ...responseMetadata,
1740
- renderedGradingPrompt: prompt
1741
- }
1481
+ option,
1482
+ reason: reasonMatch?.[1] ? reasonMatch[1].trim() : responseText
1742
1483
  };
1743
1484
  }
1744
1485
  async function matchesLlmRubric(rubric, llmOutput, grading, vars, assertion, options, providerCallContext) {
1745
1486
  if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
1746
- if (!grading.rubricPrompt && !state.config?.redteam?.provider && state.config?.redteam && shouldGenerateRemote({ canUseCodexDefaultProvider: true })) return {
1747
- ...await doRemoteGrading({
1748
- task: "llm-rubric",
1749
- rubric,
1750
- output: llmOutput,
1751
- vars: vars || {}
1752
- }),
1753
- assertion
1754
- };
1487
+ const shouldPreferRemote = options?.preferRemote || grading.__promptfooPreferRemote || !grading.provider;
1488
+ if (!grading.rubricPrompt && shouldPreferRemote && !state.config?.redteam?.provider && state.config?.redteam && shouldGenerateRemote({ canUseCodexDefaultProvider: true })) try {
1489
+ return {
1490
+ ...await doRemoteGrading({
1491
+ task: "llm-rubric",
1492
+ rubric,
1493
+ output: llmOutput,
1494
+ vars: vars || {}
1495
+ }),
1496
+ assertion
1497
+ };
1498
+ } catch (error) {
1499
+ return {
1500
+ ...fail(`Could not perform remote grading: ${error}`),
1501
+ assertion
1502
+ };
1503
+ }
1755
1504
  try {
1756
1505
  return await runJsonGradingPrompt({
1757
1506
  assertion,
@@ -1799,89 +1548,42 @@ async function matchesPiScore(renderedValue, llmInput, llmOutput, assertion) {
1799
1548
  assertion
1800
1549
  };
1801
1550
  }
1802
- function isFactualityCategory(category) {
1803
- return /^[A-E]$/.test(category);
1804
- }
1805
- function getFactualityScoreLookup(grading) {
1806
- return {
1807
- A: grading.factuality?.subset ?? 1,
1808
- B: grading.factuality?.superset ?? 1,
1809
- C: grading.factuality?.agree ?? 1,
1810
- D: grading.factuality?.disagree ?? 0,
1811
- E: grading.factuality?.differButFactual ?? 1
1812
- };
1813
- }
1814
- function buildFactualityCategoryResult(category, reason, grading, tokensUsed) {
1815
- const option = category.trim().toUpperCase();
1816
- if (!isFactualityCategory(option)) return fail(`Invalid category value: ${option}`, tokensUsed);
1817
- const score = getFactualityScoreLookup(grading)[option];
1818
- return {
1819
- pass: score > 0,
1820
- score,
1821
- reason: reason?.trim() || `Category ${option}: ${FACTUALITY_CATEGORY_DESCRIPTIONS[option]}`,
1822
- tokensUsed: normalizeTokenUsage(tokensUsed)
1823
- };
1824
- }
1825
- function parseJsonFactualityOutput(output) {
1826
- try {
1827
- const jsonData = extractFirstJsonObject(output);
1828
- return typeof jsonData?.category === "string" ? {
1829
- category: jsonData.category,
1830
- reason: jsonData.reason
1831
- } : null;
1832
- } catch (err) {
1833
- logger.debug(`JSON parsing failed: ${err.message}`);
1834
- return null;
1835
- }
1836
- }
1837
- function parseLegacyFactualityOutput(output) {
1838
- const answerMatch = output.match(/\s*\(?([a-eA-E])\)/);
1839
- if (!answerMatch) return { failure: `Factuality checker output did not match expected format: ${output}` };
1840
- const reasonMatch = output.match(/\)\s*(.*)/s);
1841
- return {
1842
- category: answerMatch[1],
1843
- reason: reasonMatch?.[1]?.trim() || output
1844
- };
1845
- }
1846
- function gradeFactualityOutput(output, grading, tokensUsed) {
1847
- const jsonResult = parseJsonFactualityOutput(output);
1848
- if (jsonResult) return buildFactualityCategoryResult(jsonResult.category, jsonResult.reason, grading, tokensUsed);
1849
- logger.info("Falling back to legacy pattern matching for factuality check");
1850
- const legacyResult = parseLegacyFactualityOutput(output);
1851
- return "failure" in legacyResult ? fail(legacyResult.failure, tokensUsed) : buildFactualityCategoryResult(legacyResult.category, legacyResult.reason, grading, tokensUsed);
1852
- }
1853
1551
  async function matchesFactuality(input, expected, output, grading, vars, providerCallContext) {
1854
1552
  if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
1855
- const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, PROMPTFOO_FACTUALITY_PROMPT), {
1856
- input,
1857
- ideal: expected,
1858
- completion: tryParse(output),
1859
- ...vars || {}
1860
- });
1861
- const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "factuality check"), prompt, "factuality", {
1553
+ const templateVars = {
1862
1554
  input,
1863
1555
  ideal: expected,
1864
1556
  completion: tryParse(output),
1865
1557
  ...vars || {}
1866
- }, providerCallContext);
1558
+ };
1559
+ const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, PROMPTFOO_FACTUALITY_PROMPT), templateVars);
1560
+ const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "factuality check"), prompt, "factuality", templateVars, providerCallContext);
1867
1561
  if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
1868
1562
  invariant(typeof resp.output === "string", "factuality produced malformed response");
1869
- return gradeFactualityOutput(resp.output, grading, resp.tokenUsage);
1563
+ try {
1564
+ const parsedJson = parseFactualityJsonResponse(resp.output);
1565
+ if (parsedJson) return buildFactualityResult(parsedJson.option, parsedJson.reason, grading, resp);
1566
+ } catch (err) {
1567
+ return fail(err.message, resp.tokenUsage);
1568
+ }
1569
+ logger.info("Falling back to legacy pattern matching for factuality check");
1570
+ try {
1571
+ const parsedLegacy = parseLegacyFactualityResponse(resp.output);
1572
+ return buildFactualityResult(parsedLegacy.option, parsedLegacy.reason, grading, resp);
1573
+ } catch (err) {
1574
+ return fail(err.message, resp.tokenUsage);
1575
+ }
1870
1576
  }
1871
1577
  async function matchesClosedQa(input, expected, output, grading, vars, providerCallContext) {
1872
1578
  if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
1873
- const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, OPENAI_CLOSED_QA_PROMPT), {
1874
- input,
1875
- criteria: expected,
1876
- completion: tryParse(output),
1877
- ...vars || {}
1878
- });
1879
- const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "model-graded-closedqa check"), prompt, "model-graded-closedqa", {
1579
+ const templateVars = {
1880
1580
  input,
1881
1581
  criteria: expected,
1882
1582
  completion: tryParse(output),
1883
1583
  ...vars || {}
1884
- }, providerCallContext);
1584
+ };
1585
+ const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, OPENAI_CLOSED_QA_PROMPT), templateVars);
1586
+ const resp = await callProviderWithContext(await getAndCheckProvider("text", grading.provider, (await getDefaultProviders()).gradingProvider, "model-graded-closedqa check"), prompt, "model-graded-closedqa", templateVars, providerCallContext);
1885
1587
  if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
1886
1588
  invariant(typeof resp.output === "string", "model-graded-closedqa produced malformed response");
1887
1589
  try {
@@ -1894,18 +1596,7 @@ async function matchesClosedQa(input, expected, output, grading, vars, providerC
1894
1596
  pass,
1895
1597
  score: pass ? 1 : 0,
1896
1598
  reason,
1897
- tokensUsed: {
1898
- total: resp.tokenUsage?.total || 0,
1899
- prompt: resp.tokenUsage?.prompt || 0,
1900
- completion: resp.tokenUsage?.completion || 0,
1901
- cached: resp.tokenUsage?.cached || 0,
1902
- numRequests: resp.tokenUsage?.numRequests || 0,
1903
- completionDetails: resp.tokenUsage?.completionDetails || {
1904
- reasoning: 0,
1905
- acceptedPrediction: 0,
1906
- rejectedPrediction: 0
1907
- }
1908
- }
1599
+ tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage)
1909
1600
  };
1910
1601
  } catch (err) {
1911
1602
  return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
@@ -1915,490 +1606,51 @@ async function matchesGEval(criteria, input, output, threshold, grading, provide
1915
1606
  if (!input) throw Error("No source text to estimate reply");
1916
1607
  const maxScore = 10;
1917
1608
  const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "reply geval check");
1918
- const tokensUsed = {
1919
- total: 0,
1920
- prompt: 0,
1921
- completion: 0,
1922
- cached: 0,
1923
- numRequests: 0,
1924
- completionDetails: {
1925
- reasoning: 0,
1926
- acceptedPrediction: 0,
1927
- rejectedPrediction: 0
1928
- }
1929
- };
1609
+ const tokensUsed = normalizeMatcherTokenUsage(void 0);
1930
1610
  const respSteps = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(typeof grading?.rubricPrompt === "object" && !Array.isArray(grading?.rubricPrompt) ? grading?.rubricPrompt?.["steps"] : void 0, GEVAL_PROMPT_STEPS), { criteria }), "g-eval-steps", { criteria }, providerCallContext);
1931
- accumulateTokens(tokensUsed, respSteps.tokenUsage);
1611
+ accumulateTokenUsage(tokensUsed, respSteps.tokenUsage);
1612
+ if (respSteps.error) return fail(respSteps.error, tokensUsed);
1613
+ if (!respSteps.output) return fail("No output", tokensUsed);
1614
+ if (typeof respSteps.output !== "string") return fail("LLM-proposed evaluation steps response is not a string", tokensUsed);
1932
1615
  let steps;
1933
1616
  try {
1934
- steps = JSON.parse(respSteps.output.match(/\{"steps".+\}/g)[0]).steps;
1617
+ const stepsMatch = respSteps.output.match(/\{"steps".+\}/g);
1618
+ if (!stepsMatch) return fail(`LLM-proposed evaluation steps are not in JSON format: ${respSteps.output}`, tokensUsed);
1619
+ steps = JSON.parse(stepsMatch[0]).steps;
1935
1620
  if (!steps.length) return fail("LLM does not propose any evaluation step", tokensUsed);
1936
- } catch {
1937
- return fail(`LLM-proposed evaluation steps are not in JSON format: ${respSteps.output}`, tokensUsed);
1621
+ } catch (err) {
1622
+ return fail(`LLM-proposed evaluation steps are not in JSON format: ${err.message}\n\n${respSteps.output}`, tokensUsed);
1938
1623
  }
1939
- const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(typeof grading?.rubricPrompt === "object" && !Array.isArray(grading?.rubricPrompt) ? grading?.rubricPrompt?.["evaluate"] : void 0, GEVAL_PROMPT_EVALUATE), {
1940
- criteria,
1941
- steps: steps.join("\n- "),
1942
- maxScore: maxScore.toString(),
1943
- input: tryParse(input),
1944
- output: tryParse(output)
1945
- }), "g-eval", {
1624
+ const evalPrompt = await loadRubricPrompt(typeof grading?.rubricPrompt === "object" && !Array.isArray(grading?.rubricPrompt) ? grading?.rubricPrompt?.["evaluate"] : void 0, GEVAL_PROMPT_EVALUATE);
1625
+ const evalVars = {
1946
1626
  criteria,
1947
1627
  steps: steps.join("\n- "),
1948
1628
  maxScore: maxScore.toString(),
1949
1629
  input: tryParse(input),
1950
1630
  output: tryParse(output)
1951
- }, providerCallContext);
1952
- accumulateTokens(tokensUsed, resp.tokenUsage);
1631
+ };
1632
+ const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(evalPrompt, evalVars), "g-eval", evalVars, providerCallContext);
1633
+ accumulateTokenUsage(tokensUsed, resp.tokenUsage);
1634
+ if (resp.error) return fail(resp.error, tokensUsed);
1635
+ if (!resp.output) return fail("No output", tokensUsed);
1636
+ if (typeof resp.output !== "string") return fail("LLM-proposed evaluation result response is not a string", tokensUsed);
1953
1637
  let result;
1954
1638
  try {
1955
- result = JSON.parse(resp.output.match(/\{.+\}/g)[0]);
1956
- } catch {
1957
- return fail(`LLM-proposed evaluation result is not in JSON format: ${resp.output}`, tokensUsed);
1639
+ const resultMatch = resp.output.match(/\{.+\}/g);
1640
+ if (!resultMatch) return fail(`LLM-proposed evaluation result is not in JSON format: ${resp.output}`, tokensUsed);
1641
+ result = JSON.parse(resultMatch[0]);
1642
+ } catch (err) {
1643
+ return fail(`LLM-proposed evaluation result is not in JSON format: ${err.message}\n\n${resp.output}`, tokensUsed);
1958
1644
  }
1645
+ const rawScore = typeof result.score === "number" ? result.score : Number(result.score);
1646
+ if (!Number.isFinite(rawScore)) return fail(`G-Eval result has invalid or missing score: ${JSON.stringify(result.score)}`, tokensUsed);
1959
1647
  return {
1960
- pass: result.score / maxScore >= threshold,
1961
- score: result.score / maxScore,
1648
+ pass: rawScore / maxScore >= threshold,
1649
+ score: rawScore / maxScore,
1962
1650
  reason: result.reason,
1963
1651
  tokensUsed
1964
1652
  };
1965
1653
  }
1966
- async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
1967
- const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, (await getDefaultProviders()).embeddingProvider, "answer relevancy check");
1968
- const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "answer relevancy check");
1969
- const tokensUsed = {
1970
- total: 0,
1971
- prompt: 0,
1972
- completion: 0,
1973
- cached: 0,
1974
- numRequests: 0,
1975
- completionDetails: {
1976
- reasoning: 0,
1977
- acceptedPrediction: 0,
1978
- rejectedPrediction: 0
1979
- }
1980
- };
1981
- const candidateQuestions = [];
1982
- for (let i = 0; i < 3; i++) {
1983
- const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE), { answer: tryParse(output) }), "answer-relevance", { answer: tryParse(output) }, providerCallContext);
1984
- accumulateTokens(tokensUsed, resp.tokenUsage);
1985
- if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
1986
- invariant(typeof resp.output === "string", "answer relevancy check produced malformed response");
1987
- candidateQuestions.push(resp.output);
1988
- }
1989
- invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id} must implement callEmbeddingApi for similarity check`);
1990
- const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
1991
- accumulateTokens(tokensUsed, inputEmbeddingResp.tokenUsage);
1992
- if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
1993
- const inputEmbedding = inputEmbeddingResp.embedding;
1994
- const similarities = [];
1995
- const questionsWithScores = [];
1996
- for (const question of candidateQuestions) {
1997
- const resp = await embeddingProvider.callEmbeddingApi(question);
1998
- accumulateTokens(tokensUsed, resp.tokenUsage);
1999
- if (resp.error || !resp.embedding) return fail(resp.error || "No embedding", tokensUsed);
2000
- const questionSimilarity = cosineSimilarity(inputEmbedding, resp.embedding);
2001
- similarities.push(questionSimilarity);
2002
- questionsWithScores.push({
2003
- question,
2004
- similarity: questionSimilarity
2005
- });
2006
- }
2007
- const similarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
2008
- const pass = similarity >= threshold - Number.EPSILON;
2009
- const greaterThanReason = `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`;
2010
- const lessThanReason = `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
2011
- const metadata = {
2012
- generatedQuestions: questionsWithScores,
2013
- averageSimilarity: similarity,
2014
- threshold
2015
- };
2016
- if (pass) return {
2017
- pass: true,
2018
- score: similarity,
2019
- reason: greaterThanReason,
2020
- tokensUsed,
2021
- metadata
2022
- };
2023
- return {
2024
- pass: false,
2025
- score: similarity,
2026
- reason: lessThanReason,
2027
- tokensUsed,
2028
- metadata
2029
- };
2030
- }
2031
- async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
2032
- const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "context recall check");
2033
- const contextString = serializeContext(context);
2034
- const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL), {
2035
- context: contextString,
2036
- groundTruth,
2037
- ...vars || {}
2038
- }), "context-recall", {
2039
- context: contextString,
2040
- groundTruth,
2041
- ...vars || {}
2042
- }, providerCallContext);
2043
- if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
2044
- invariant(typeof resp.output === "string", "context-recall produced malformed response");
2045
- const attributedTokenLower = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
2046
- const notAttributedTokenLower = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
2047
- const sentences = splitIntoSentences(resp.output).filter((line) => {
2048
- const lowerLine = line.toLowerCase();
2049
- return lowerLine.includes(attributedTokenLower) || lowerLine.includes(notAttributedTokenLower);
2050
- });
2051
- const sentenceAttributions = [];
2052
- let numerator = 0;
2053
- for (const sentence of sentences) {
2054
- const isAttributed = sentence.toLowerCase().includes(attributedTokenLower);
2055
- if (isAttributed) numerator++;
2056
- const sentenceMatch = sentence.match(/^\d+\.\s*([^\.]+\.)/);
2057
- const cleanSentence = sentenceMatch ? sentenceMatch[1].trim() : sentence.split(".")[0].trim();
2058
- sentenceAttributions.push({
2059
- sentence: cleanSentence,
2060
- attributed: isAttributed
2061
- });
2062
- }
2063
- const score = sentences.length > 0 ? numerator / sentences.length : 0;
2064
- const pass = score >= threshold - Number.EPSILON;
2065
- const metadata = {
2066
- sentenceAttributions,
2067
- totalSentences: sentences.length,
2068
- attributedSentences: numerator,
2069
- score
2070
- };
2071
- return {
2072
- pass,
2073
- score,
2074
- reason: pass ? `Recall ${score.toFixed(2)} is >= ${threshold}` : `Recall ${score.toFixed(2)} is < ${threshold}`,
2075
- tokensUsed: {
2076
- total: resp.tokenUsage?.total || 0,
2077
- prompt: resp.tokenUsage?.prompt || 0,
2078
- completion: resp.tokenUsage?.completion || 0,
2079
- cached: resp.tokenUsage?.cached || 0,
2080
- numRequests: resp.tokenUsage?.numRequests || 0,
2081
- completionDetails: resp.tokenUsage?.completionDetails || {
2082
- reasoning: 0,
2083
- acceptedPrediction: 0,
2084
- rejectedPrediction: 0
2085
- }
2086
- },
2087
- metadata
2088
- };
2089
- }
2090
- async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
2091
- const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "context relevance check");
2092
- const contextString = serializeContext(context);
2093
- const resp = await callProviderWithContext(textProvider, await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE), {
2094
- context: contextString,
2095
- query: question
2096
- }), "context-relevance", {
2097
- context: contextString,
2098
- query: question
2099
- }, providerCallContext);
2100
- if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
2101
- invariant(typeof resp.output === "string", "context-relevance produced malformed response");
2102
- const contextUnits = Array.isArray(context) ? context.filter((chunk) => chunk.trim().length > 0) : splitIntoSentences(context);
2103
- const totalContextUnits = contextUnits.length;
2104
- const extractedSentences = splitIntoSentences(resp.output);
2105
- const relevantSentences = [];
2106
- const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
2107
- let numerator = 0;
2108
- if (insufficientInformation) numerator = 0;
2109
- else {
2110
- numerator = extractedSentences.length;
2111
- relevantSentences.push(...extractedSentences);
2112
- }
2113
- const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
2114
- const pass = score >= threshold - Number.EPSILON;
2115
- const metadata = {
2116
- extractedSentences: relevantSentences,
2117
- totalContextUnits,
2118
- totalContextSentences: totalContextUnits,
2119
- contextUnits,
2120
- relevantSentenceCount: numerator,
2121
- insufficientInformation,
2122
- score
2123
- };
2124
- return {
2125
- pass,
2126
- score,
2127
- reason: pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`,
2128
- tokensUsed: {
2129
- total: resp.tokenUsage?.total || 0,
2130
- prompt: resp.tokenUsage?.prompt || 0,
2131
- completion: resp.tokenUsage?.completion || 0,
2132
- cached: resp.tokenUsage?.cached || 0,
2133
- numRequests: resp.tokenUsage?.numRequests || 0,
2134
- completionDetails: resp.tokenUsage?.completionDetails || {
2135
- reasoning: 0,
2136
- acceptedPrediction: 0,
2137
- rejectedPrediction: 0
2138
- }
2139
- },
2140
- metadata
2141
- };
2142
- }
2143
- async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
2144
- const textProvider = await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "faithfulness check");
2145
- const tokensUsed = {
2146
- total: 0,
2147
- prompt: 0,
2148
- completion: 0,
2149
- cached: 0,
2150
- numRequests: 0,
2151
- completionDetails: {
2152
- reasoning: 0,
2153
- acceptedPrediction: 0,
2154
- rejectedPrediction: 0
2155
- }
2156
- };
2157
- if (grading?.rubricPrompt) invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
2158
- const rawLongformPrompt = typeof grading?.rubricPrompt?.[0] === "string" ? grading?.rubricPrompt?.[0] : grading?.rubricPrompt?.[0]?.content;
2159
- const rawNliPrompt = typeof grading?.rubricPrompt?.[1] === "string" ? grading?.rubricPrompt?.[1] : grading?.rubricPrompt?.[1]?.content;
2160
- const longformPrompt = await loadRubricPrompt(rawLongformPrompt, CONTEXT_FAITHFULNESS_LONGFORM);
2161
- const nliPrompt = await loadRubricPrompt(rawNliPrompt, CONTEXT_FAITHFULNESS_NLI_STATEMENTS);
2162
- let promptText = await renderLlmRubricPrompt(longformPrompt, {
2163
- question: query,
2164
- answer: tryParse(output),
2165
- ...vars || {}
2166
- });
2167
- let resp = await callProviderWithContext(textProvider, promptText, "context-faithfulness-longform", {
2168
- question: query,
2169
- answer: tryParse(output),
2170
- ...vars || {}
2171
- }, providerCallContext);
2172
- accumulateTokens(tokensUsed, resp.tokenUsage);
2173
- if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
2174
- invariant(typeof resp.output === "string", "context-faithfulness produced malformed response");
2175
- const contextString = serializeContext(context);
2176
- const statements = splitIntoSentences(resp.output);
2177
- promptText = await renderLlmRubricPrompt(nliPrompt, {
2178
- context: contextString,
2179
- statements,
2180
- ...vars || {}
2181
- });
2182
- resp = await callProviderWithContext(textProvider, promptText, "context-faithfulness-nli", {
2183
- context: contextString,
2184
- statements,
2185
- ...vars || {}
2186
- }, providerCallContext);
2187
- accumulateTokens(tokensUsed, resp.tokenUsage);
2188
- if (resp.error || !resp.output) return fail(resp.error || "No output", tokensUsed);
2189
- invariant(typeof resp.output === "string", "context-faithfulness produced malformed response");
2190
- let finalAnswer = "Final verdict for each statement in order:";
2191
- finalAnswer = finalAnswer.toLowerCase();
2192
- let verdicts = resp.output.toLowerCase().trim();
2193
- let score = 0;
2194
- if (statements.length > 0) if (verdicts.includes(finalAnswer)) {
2195
- verdicts = verdicts.slice(verdicts.indexOf(finalAnswer) + finalAnswer.length);
2196
- const parsedVerdicts = verdicts.split(".").filter((answer) => answer.trim() !== "");
2197
- if (parsedVerdicts.length > 0) score = 1 - parsedVerdicts.filter((answer) => !answer.includes("yes")).length / statements.length;
2198
- } else {
2199
- const noVerdictCount = verdicts.split("verdict: no").length - 1;
2200
- if (noVerdictCount + (verdicts.split("verdict: yes").length - 1) > 0) score = 1 - noVerdictCount / statements.length;
2201
- }
2202
- score = Math.min(1, Math.max(0, score));
2203
- const pass = score >= threshold - Number.EPSILON;
2204
- return {
2205
- pass,
2206
- score,
2207
- reason: pass ? `Faithfulness ${score.toFixed(2)} is >= ${threshold}` : `Faithfulness ${score.toFixed(2)} is < ${threshold}`,
2208
- tokensUsed
2209
- };
2210
- }
2211
- async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
2212
- invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");
2213
- const resp = await callProviderWithContext(await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "select-best check"), await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT), {
2214
- criteria,
2215
- outputs: outputs.map((o) => tryParse(o)),
2216
- ...vars || {}
2217
- }), "select-best", {
2218
- criteria,
2219
- outputs: outputs.map((o) => tryParse(o)),
2220
- ...vars || {}
2221
- }, providerCallContext);
2222
- if (resp.error || !resp.output) return new Array(outputs.length).fill(fail(resp.error || "No output", resp.tokenUsage));
2223
- invariant(typeof resp.output === "string", "select-best produced malformed response");
2224
- const firstDigitMatch = resp.output.trim().match(/\d/);
2225
- const verdict = firstDigitMatch ? Number.parseInt(firstDigitMatch[0], 10) : NaN;
2226
- if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) return new Array(outputs.length).fill(fail(`Invalid select-best verdict: ${verdict}`));
2227
- const tokensUsed = {
2228
- total: resp.tokenUsage?.total || 0,
2229
- prompt: resp.tokenUsage?.prompt || 0,
2230
- completion: resp.tokenUsage?.completion || 0,
2231
- cached: resp.tokenUsage?.cached || 0,
2232
- numRequests: resp.tokenUsage?.numRequests || 0,
2233
- completionDetails: resp.tokenUsage?.completionDetails || {
2234
- reasoning: 0,
2235
- acceptedPrediction: 0,
2236
- rejectedPrediction: 0
2237
- }
2238
- };
2239
- return outputs.map((_output, index) => {
2240
- if (index === verdict) return {
2241
- pass: true,
2242
- score: 1,
2243
- reason: `Output selected as the best: ${criteria}`,
2244
- tokensUsed
2245
- };
2246
- else return {
2247
- pass: false,
2248
- score: 0,
2249
- reason: `Output not selected: ${criteria}`,
2250
- tokensUsed
2251
- };
2252
- });
2253
- }
2254
- async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
2255
- invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
2256
- const value = assertion.value || {};
2257
- const options = {
2258
- method: typeof value === "object" && "method" in value ? value.method : "average",
2259
- weights: typeof value === "object" && "weights" in value ? value.weights : {},
2260
- threshold: typeof value === "object" && "threshold" in value ? value.threshold : void 0
2261
- };
2262
- const scores = resultsWithGradingResults.map((result, index) => {
2263
- const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
2264
- if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
2265
- let totalWeightedScore = 0;
2266
- let totalWeight = 0;
2267
- relevantResults.forEach((componentResult) => {
2268
- const assertionType = componentResult.assertion?.type || "unknown";
2269
- const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
2270
- const score = componentResult.score || 0;
2271
- totalWeightedScore += score * weight;
2272
- totalWeight += weight;
2273
- });
2274
- let aggregateScore;
2275
- if (options.method === "sum") aggregateScore = totalWeightedScore;
2276
- else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
2277
- return {
2278
- index,
2279
- score: aggregateScore,
2280
- componentCount: relevantResults.length,
2281
- totalWeight
2282
- };
2283
- });
2284
- let maxScore = -Infinity;
2285
- let winnerIndex = 0;
2286
- for (let i = 0; i < scores.length; i++) if (scores[i].score > maxScore) {
2287
- maxScore = scores[i].score;
2288
- winnerIndex = i;
2289
- }
2290
- const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
2291
- return scores.map(({ index, score, componentCount, totalWeight }) => {
2292
- const isWinner = index === winnerIndex && meetsThreshold;
2293
- return {
2294
- pass: isWinner,
2295
- score: isWinner ? 1 : 0,
2296
- reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
2297
- namedScores: {
2298
- maxScore: score,
2299
- assertionCount: componentCount,
2300
- totalWeight
2301
- }
2302
- };
2303
- });
2304
- }
2305
- async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
2306
- if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
2307
- const defaultProviders = await getDefaultProviders();
2308
- const defaultSearchProviders = [
2309
- defaultProviders.webSearchProvider,
2310
- defaultProviders.llmRubricProvider,
2311
- defaultProviders.gradingProvider
2312
- ];
2313
- let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
2314
- if (!hasWebSearchCapability(searchProvider)) {
2315
- const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
2316
- if (webSearchDefault) searchProvider = webSearchDefault;
2317
- }
2318
- if (!hasWebSearchCapability(searchProvider)) {
2319
- const webSearchProvider = await loadWebSearchProvider(true);
2320
- if (webSearchProvider) searchProvider = webSearchProvider;
2321
- }
2322
- if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error("search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:claude-sonnet-4, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider");
2323
- const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
2324
- output: tryParse(llmOutput),
2325
- rubric,
2326
- ...vars || {}
2327
- });
2328
- const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
2329
- output: tryParse(llmOutput),
2330
- rubric,
2331
- ...vars || {}
2332
- }, providerCallContext);
2333
- if (resp.error || !resp.output) return {
2334
- pass: false,
2335
- score: 0,
2336
- reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
2337
- tokensUsed: resp.tokenUsage,
2338
- assertion
2339
- };
2340
- try {
2341
- const result = extractFirstJsonObject(String(resp.output));
2342
- let pass = result.pass ?? false;
2343
- const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
2344
- if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
2345
- return {
2346
- pass,
2347
- score,
2348
- reason: result.reason || "No reason provided",
2349
- tokensUsed: resp.tokenUsage,
2350
- assertion,
2351
- metadata: {
2352
- searchResults: result.searchResults || [],
2353
- searchProvider: searchProvider.id()
2354
- }
2355
- };
2356
- } catch {
2357
- const outputLower = String(resp.output).toLowerCase();
2358
- const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
2359
- return {
2360
- pass,
2361
- score: pass ? 1 : 0,
2362
- reason: resp.output,
2363
- tokensUsed: resp.tokenUsage,
2364
- assertion
2365
- };
2366
- }
2367
- }
2368
- async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
2369
- if (!assistantResponse) return {
2370
- pass: true,
2371
- score: 1,
2372
- reason: "No output to moderate"
2373
- };
2374
- const defaultProviders = await getDefaultProviders();
2375
- const defaultModerationProvider = !getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN")) ? await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER) : defaultProviders.moderationProvider;
2376
- const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
2377
- invariant(moderationProvider, "Moderation provider must be defined");
2378
- const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
2379
- if (resp.error) return {
2380
- pass: false,
2381
- score: 0,
2382
- reason: `Moderation API error: ${resp.error}`
2383
- };
2384
- const { flags } = resp;
2385
- if (!flags || flags.length === 0) return {
2386
- pass: true,
2387
- score: 1,
2388
- reason: "No moderation flags detected"
2389
- };
2390
- const filteredFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
2391
- if (filteredFlags.length > 0) return {
2392
- pass: false,
2393
- score: 0,
2394
- reason: `Moderation flags detected: ${filteredFlags.map((flag) => flag.description).join(", ")}`
2395
- };
2396
- return {
2397
- pass: true,
2398
- score: 1,
2399
- reason: "No relevant moderation flags detected"
2400
- };
2401
- }
2402
1654
  //#endregion
2403
1655
  //#region src/integrations/huggingfaceDatasets.ts
2404
1656
  /**
@@ -2984,7 +2236,7 @@ var RedteamPluginBase = class RedteamPluginBase {
2984
2236
  const rejectedPromptLengths = [];
2985
2237
  let rejectedPromptLimit;
2986
2238
  for (const prompt of parsedPrompts) {
2987
- const violation = getGeneratedPromptOverLimit("__prompt" in prompt ? String(prompt.__prompt) : JSON.stringify(prompt), this.config.maxCharsPerMessage);
2239
+ const violation = getGeneratedPromptOverLimit("__prompt" in prompt ? prompt.__prompt : JSON.stringify(prompt), this.config.maxCharsPerMessage);
2988
2240
  if (violation) {
2989
2241
  rejectedPromptLengths.push(violation.length);
2990
2242
  rejectedPromptLimit = violation.limit;
@@ -3148,10 +2400,17 @@ var RedteamGraderBase = class {
3148
2400
  },
3149
2401
  rubric: finalRubric
3150
2402
  };
3151
- const grade = await matchesLlmRubric(finalRubric, llmOutput, {
2403
+ const defaultTest = typeof state.config?.defaultTest === "object" ? state.config.defaultTest : void 0;
2404
+ const hasConfiguredGradingProvider = Boolean(state.config?.redteam?.provider || defaultTest?.options?.provider);
2405
+ const grading = {
3152
2406
  ...test.options,
3153
2407
  provider: await redteamProviderManager.getGradingProvider({ jsonOnly: true })
3154
- });
2408
+ };
2409
+ if (!hasConfiguredGradingProvider) {
2410
+ Object.defineProperty(grading, "__promptfooPreferRemote", { value: true });
2411
+ logger.debug("[Redteam] No configured grading provider detected, preferring remote grading");
2412
+ }
2413
+ const grade = await matchesLlmRubric(finalRubric, llmOutput, grading);
3155
2414
  logger.debug(`Redteam grading result for ${this.id}: - ${JSON.stringify(grade)}`);
3156
2415
  let suggestions;
3157
2416
  if (!grade.pass) suggestions = this.getSuggestions({
@@ -15970,6 +15229,6 @@ function getGraderById(id) {
15970
15229
  return grader;
15971
15230
  }
15972
15231
  //#endregion
15973
- export { matchesGEval as $, DivergentRepetitionPlugin as A, sampleArray as B, getPiiLeakTestsForCategory as C, HarmbenchPlugin as D, ImitationPlugin as E, AegisPlugin as F, loadRubricPrompt as G, callProviderWithContext as H, RedteamGraderBase as I, matchesClosedQa as J, matchesAnswerRelevance as K, RedteamPluginBase as L, CrossSessionLeakPlugin as M, ContractPlugin as N, HallucinationPlugin as O, BeavertailsPlugin as P, matchesFactuality as Q, getCustomPolicies as R, PlinyPlugin as S, IntentPlugin as T, fail as U, fetchHuggingFaceDataset as V, getAndCheckProvider as W, matchesContextRecall as X, matchesContextFaithfulness as Y, matchesContextRelevance as Z, PoliticsPlugin as _, processFileReference as _t, UnverifiableClaimsPlugin as a, matchesSimilarity as at, isValidPolicyObject as b, ToolDiscoveryPlugin as c, withProviderCallExecutionContext as ct, TeenSafetyDangerousContentPlugin as d, readPrompts as dt, matchesLlmRubric as et, TeenSafetyAgeRestrictedGoodsAndServicesPlugin as f, readProviderPromptMap as ft, PromptExtractionPlugin as g, loadFromJavaScriptFile as gt, RbacPlugin as h, getFinalTest as ht, VLGuardPlugin as i, matchesSelectBest as it, DebugAccessPlugin as j, ExcessiveAgencyPlugin as k, TeenSafetyHarmfulBodyIdealsPlugin as l, getDefaultProviders as lt, ShellInjectionPlugin as m, coerceString as mt, getGraderById as n, matchesPiScore as nt, UnsafeBenchPlugin as o, matchesTrajectoryGoalSuccess as ot, SqlInjectionPlugin as p, SUGGEST_PROMPTS_SYSTEM_MESSAGE as pt, matchesClassification as q, VLSUPlugin as r, matchesSearchRubric as rt, ToxicChatPlugin as s, selectMaxScore as st, GRADERS as t, matchesModeration as tt, TeenSafetyDangerousRoleplayPlugin as u, processPrompts as ut, PolicyPlugin as v, resolveContext as vt, OverreliancePlugin as w, makeInlinePolicyIdSync as x, determinePolicyTypeFromId as y, retryWithDeduplication as z };
15232
+ export { SELECT_BEST_PROMPT as $, DivergentRepetitionPlugin as A, sampleArray as B, getPiiLeakTestsForCategory as C, DEFAULT_ANTHROPIC_MODEL as Ct, HarmbenchPlugin as D, withProviderCallExecutionContext as Dt, ImitationPlugin as E, getGradingProvider as Et, AegisPlugin as F, matchesLlmRubric as G, matchesClosedQa as H, RedteamGraderBase as I, doRemoteGrading as J, matchesPiScore as K, RedteamPluginBase as L, CrossSessionLeakPlugin as M, ContractPlugin as N, HallucinationPlugin as O, BeavertailsPlugin as P, DEFAULT_WEB_SEARCH_PROMPT as Q, getCustomPolicies as R, PlinyPlugin as S, getDefaultProviders as St, IntentPlugin as T, getAndCheckProvider as Tt, matchesFactuality as U, fetchHuggingFaceDataset as V, matchesGEval as W, readPrompts as X, processPrompts as Y, readProviderPromptMap as Z, PoliticsPlugin as _, tryParse as _t, UnverifiableClaimsPlugin as a, CONTEXT_RECALL_ATTRIBUTED_TOKEN as at, isValidPolicyObject as b, loadFromJavaScriptFile as bt, ToolDiscoveryPlugin as c, CONTEXT_RELEVANCE_BAD as ct, TeenSafetyDangerousContentPlugin as d, cosineSimilarity as dt, SUGGEST_PROMPTS_SYSTEM_MESSAGE as et, TeenSafetyAgeRestrictedGoodsAndServicesPlugin as f, dotProduct as ft, PromptExtractionPlugin as g, splitIntoSentences as gt, RbacPlugin as h, normalizeMatcherTokenUsage as ht, VLGuardPlugin as i, CONTEXT_RECALL as it, DebugAccessPlugin as j, ExcessiveAgencyPlugin as k, TeenSafetyHarmfulBodyIdealsPlugin as l, loadRubricPrompt as lt, ShellInjectionPlugin as m, fail as mt, getGraderById as n, CONTEXT_FAITHFULNESS_LONGFORM as nt, UnsafeBenchPlugin as o, CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN as ot, SqlInjectionPlugin as p, euclideanDistance as pt, matchesTrajectoryGoalSuccess as q, VLSUPlugin as r, CONTEXT_FAITHFULNESS_NLI_STATEMENTS as rt, ToxicChatPlugin as s, CONTEXT_RELEVANCE as st, GRADERS as t, ANSWER_RELEVANCY_GENERATE as tt, TeenSafetyDangerousRoleplayPlugin as u, renderLlmRubricPrompt as ut, PolicyPlugin as v, coerceString as vt, OverreliancePlugin as w, 
callProviderWithContext as wt, makeInlinePolicyIdSync as x, processFileReference as xt, determinePolicyTypeFromId as y, getFinalTest as yt, retryWithDeduplication as z };
15974
15233
 
15975
- //# sourceMappingURL=graders-Zy3x0zqX.js.map
15234
+ //# sourceMappingURL=graders-BoUqsCEm.js.map