promptfoo 0.120.27 → 0.121.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/src/{ListApp-8WOe2nT6.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-B0pgC1oV.js +206 -0
- package/dist/src/{accounts-DVINui-2.js → accounts-Bm2D8Db9.js} +39 -34
- package/dist/src/{accounts-CPDRAMND.js → accounts-CiBLOnA7.js} +38 -33
- package/dist/src/{accounts-Fl2J3_Fu.cjs → accounts-gtkH-5KX.cjs} +77 -78
- package/dist/src/{agentic-utils-D922n6mm.js → agentic-utils-DS1g3GLF.js} +9 -10
- package/dist/src/{agents-BcsN_BgB.js → agents-9qiOy0ho.js} +16 -12
- package/dist/src/{agents-BXLmVsxR.js → agents-CBr9A01V.js} +37 -37
- package/dist/src/{agents-pMfppv9Z.js → agents-CmvBq8LV.js} +16 -18
- package/dist/src/{agents-hqgSV-3o.js → agents-D__IdAlg.js} +39 -40
- package/dist/src/{agents-BO2n8Z0d.cjs → agents-DbRtpYxR.cjs} +37 -40
- package/dist/src/{agents-BdUTAwi-.js → agents-DgF2zDag.js} +37 -42
- package/dist/src/{agents-DgJf2-ez.cjs → agents-Di9DKPzn.cjs} +16 -17
- package/dist/src/{agents-DNvSH78i.js → agents-cLXA8a_8.js} +17 -19
- package/dist/src/{aimlapi-DtgPI0nE.js → aimlapi-B4rcnZgv.js} +15 -17
- package/dist/src/{aimlapi-BE_Tg9Fl.cjs → aimlapi-BvlNH0gr.cjs} +15 -16
- package/dist/src/{aimlapi-DOib86oE.js → aimlapi-CnkC2HqE.js} +16 -18
- package/dist/src/{aimlapi-DTPACCB1.js → aimlapi-DHJU_kcV.js} +15 -4
- package/dist/src/app/assets/index-4LKxG2CG.js +439 -0
- package/dist/src/app/assets/{index-NCn4eVBv.css → index-C3zcsZFQ.css} +1 -1
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/index.html +3 -3
- package/dist/src/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/{audio-BnRUGAm_.js → audio-Bkv46et0.js} +6 -5
- package/dist/src/{audio-Cwo68yZS.cjs → audio-CGMyULza.cjs} +6 -7
- package/dist/src/{audio-MSRki4JU.js → audio-ClI_AFre.js} +6 -8
- package/dist/src/{audio-BRYU0BFo.js → audio-Dz3z7s3J.js} +7 -9
- package/dist/src/{base-pGVmXNl4.cjs → base-CGrhspbK.cjs} +36 -38
- package/dist/src/{base-h961VXYk.js → base-CpjcHe4e.js} +11 -13
- package/dist/src/base-DLKtKMFh.js +193 -0
- package/dist/src/{base-XB2tDJrB.js → base-Dy1V8--Z.js} +11 -13
- package/dist/src/blobs-BDbfYdrJ.js +236 -0
- package/dist/src/{blobs-CR5C4Ihh.js → blobs-CBO20krR.js} +9 -12
- package/dist/src/{blobs-BM_e6hCa.js → blobs-CMHN0Qcz.js} +9 -12
- package/dist/src/{blobs-B-KQAFhX.cjs → blobs-D23XLin-.cjs} +34 -37
- package/dist/src/{cache-jsiwsAJv.js → cache-BVeDlD87.js} +132 -117
- package/dist/src/{cache-CIpsoBZR.js → cache-C4Nxf52C.js} +132 -118
- package/dist/src/cache-CeUpFm3M.cjs +5 -0
- package/dist/src/{cache-BTVYfbka.cjs → cache-Dh5WtQps.cjs} +182 -168
- package/dist/src/cache-i1P6crbO.js +756 -0
- package/dist/src/cache-n-RCJ-hL.js +6 -0
- package/dist/src/{chat-BcPjZXIp.js → chat-BiKyneZl.js} +45 -46
- package/dist/src/{chat-D31K7C4u.cjs → chat-C1Qst7jL.cjs} +20 -21
- package/dist/src/{chat-B84t99NW.js → chat-C2jrdPMx.js} +20 -9
- package/dist/src/{chat-BE44YOc6.cjs → chat-CgF-J-Jj.cjs} +65 -66
- package/dist/src/{chat-DwWifjxi.js → chat-CzkrVDfz.js} +20 -22
- package/dist/src/chat-DJIw17u0.js +766 -0
- package/dist/src/{chat-CcUCysjU.js → chat-DqxYYtWA.js} +45 -46
- package/dist/src/{chat-DZM2GUHO.js → chat-qmatte1u.js} +21 -23
- package/dist/src/{chatkit-D67HS_0b.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DAB_qfzI.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-Biqb_wsD.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-PGG4ZYIn.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-SVM6AdBu.js → claude-agent-sdk-Apiy0iaz.js} +31 -31
- package/dist/src/{claude-agent-sdk-C-IOTPfo.js → claude-agent-sdk-D2bJee9S.js} +31 -29
- package/dist/src/{claude-agent-sdk-C9SiaQub.cjs → claude-agent-sdk-D9Z5Pr9X.cjs} +31 -28
- package/dist/src/{claude-agent-sdk-CiluSyW1.js → claude-agent-sdk-DfCoW0E6.js} +33 -20
- package/dist/src/cloud-BBh91EUK.js +4 -0
- package/dist/src/{cloud-CZ-q9Ier.js → cloud-C0dlstV_.js} +7 -9
- package/dist/src/{cloudflare-ai-BahKHyhh.js → cloudflare-ai-8TDxHR0x.js} +16 -18
- package/dist/src/{cloudflare-ai-v_qZD6_q.js → cloudflare-ai-BxAGvfju.js} +17 -19
- package/dist/src/{cloudflare-ai-Dfahv5SY.cjs → cloudflare-ai-CknbZ5LJ.cjs} +16 -17
- package/dist/src/{cloudflare-ai-Dxyt50Nl.js → cloudflare-ai-g7PB6VHR.js} +16 -4
- package/dist/src/{cloudflare-gateway-Bi_FpOFy.js → cloudflare-gateway-B9HWA5wf.js} +23 -23
- package/dist/src/{cloudflare-gateway-BPWoZIzJ.cjs → cloudflare-gateway-BSnDmHYo.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-C0guUNwk.js → cloudflare-gateway-CKDb4dJ8.js} +26 -14
- package/dist/src/{cloudflare-gateway-btS7h1OZ.js → cloudflare-gateway-CP9QEWYS.js} +21 -25
- package/dist/src/{codex-sdk-DSxAnbfT.js → codex-sdk-C6UMlxwV.js} +28 -29
- package/dist/src/{codex-sdk-IYVi9fuM.js → codex-sdk-DUwKWezN.js} +28 -27
- package/dist/src/{codex-sdk-DulY0ZRq.js → codex-sdk-GGAw0qbD.js} +28 -29
- package/dist/src/{codex-sdk-DFKMtAyf.cjs → codex-sdk-fAO0c3yA.cjs} +28 -29
- package/dist/src/{cometapi-DzrR3SR_.js → cometapi-BL9yvj_f.js} +16 -4
- package/dist/src/{cometapi-DIO64tf4.cjs → cometapi-C4xSqeID.cjs} +21 -22
- package/dist/src/{cometapi-C9EEpJzT.js → cometapi-CUQq3H_a.js} +21 -24
- package/dist/src/{cometapi-DkNBMk0G.js → cometapi-DFNiKmSz.js} +17 -19
- package/dist/src/{completion-CG29bfKX.js → completion-5MzrpJxT.js} +11 -13
- package/dist/src/{completion-CCRT4kX1.cjs → completion-CM6oK8PS.cjs} +21 -23
- package/dist/src/{completion-Bgf1VJoq.js → completion-DZ083F31.js} +11 -13
- package/dist/src/completion-qRoZAYRB.js +120 -0
- package/dist/src/{createHash-Dw_iLu31.js → createHash-CTQmL3G2.js} +2 -3
- package/dist/src/{createHash-CYQy4YeL.cjs → createHash-CfZSc0b4.cjs} +13 -14
- package/dist/src/{createHash-CJcfskIZ.js → createHash-Da8fMwqB.js} +2 -3
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-D-ayp2FW.js → docker-Bb5dcxr8.js} +18 -20
- package/dist/src/{docker-B81N0t4e.js → docker-BvfL2BrW.js} +19 -21
- package/dist/src/{docker-DNcLR4Ig.cjs → docker-DcF2pRrj.cjs} +18 -19
- package/dist/src/{docker-egERKxCF.js → docker-ExVyLp0S.js} +18 -7
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-B9dPm_BF.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-D2pZ87fL.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-Ct-Joyue.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-B3r2CVXr.js +15 -0
- package/dist/src/{eval-C-Nr6wX_.js → eval-Dg2nG4v2.js} +47 -54
- package/dist/src/evalResult-5xwYnECe.js +12 -0
- package/dist/src/evalResult-71lY93Kj.cjs +10 -0
- package/dist/src/{evalResult-DXMWJ3sx.js → evalResult-BBRNtX4I.js} +10 -11
- package/dist/src/{evalResult-4BzI2tmj.js → evalResult-BDMqrapS.js} +16 -12
- package/dist/src/evalResult-Dx5P5cIv.js +10 -0
- package/dist/src/{evalResult-CX8wQecI.cjs → evalResult-fuaI8HkH.cjs} +20 -21
- package/dist/src/{evaluator-8aGyV12L.js → evaluator-BhoWwp5b.js} +211 -235
- package/dist/src/evaluator-Jx6bRZV6.js +36 -0
- package/dist/src/{extractor-V5x_m1i0.js → extractor-C0EVHewb.js} +22 -24
- package/dist/src/extractor-D25qpmGX.js +374 -0
- package/dist/src/{extractor-CD5yKL-G.js → extractor-DReVID0K.js} +22 -24
- package/dist/src/{extractor-C031XmTA.cjs → extractor-pYLLi3wS.cjs} +37 -39
- package/dist/src/{fetch-BmbD-v1L.cjs → fetch-BPkYtG8K.cjs} +244 -277
- package/dist/src/fetch-BxNb_Lp3.js +5 -0
- package/dist/src/{fetch-D3OHf-lV.js → fetch-Cwxnd8zz.js} +36 -44
- package/dist/src/{fetch-CXZI9RRr.js → fetch-Dxpd4_sr.js} +23 -35
- package/dist/src/fetch-HaqdX7U1.js +780 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-CdevI3An.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-Dres3qrN.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/{graders--1y2u9HO.js → graders-BTeBGqjJ.js} +349 -397
- package/dist/src/graders-B_pgMLS2.js +34 -0
- package/dist/src/{graders-DTeBrzWp.js → graders-Bj_Odv7c.js} +349 -397
- package/dist/src/graders-DErokPDO.cjs +32 -0
- package/dist/src/graders-DP7KFFo-.js +13466 -0
- package/dist/src/graders-DR_uNe54.js +32 -0
- package/dist/src/{graders-DohM2dir.cjs → graders-DU49_J8Y.cjs} +684 -732
- package/dist/src/graders-w3176Wz-.js +32 -0
- package/dist/src/{image-B0U4Hqll.js → image-B02ogr_b.js} +7 -9
- package/dist/src/{image-DmE-niFE.js → image-B0h9VEMc.js} +6 -5
- package/dist/src/{image-CuKHuccK.cjs → image-BLmROtN3.cjs} +29 -30
- package/dist/src/{image-DNEIf_aI.js → image-Bb4vWQLM.js} +6 -8
- package/dist/src/{image-DpKl2F15.cjs → image-C1madmKh.cjs} +6 -7
- package/dist/src/{image-C3wHC9_h.js → image-CHfWvljl.js} +9 -10
- package/dist/src/{image-O1u4bCFg.js → image-DS-o-0ph.js} +9 -10
- package/dist/src/image-Dpxa1Jt6.js +257 -0
- package/dist/src/index.cjs +615 -695
- package/dist/src/index.d.cts +271 -7
- package/dist/src/index.d.ts +271 -3
- package/dist/src/index.js +580 -664
- package/dist/src/{interactiveCheck-Bxj1Swex.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-CEzQobWX.js → knowledgeBase-B3OoKIej.js} +14 -9
- package/dist/src/{knowledgeBase-Be_zyW4L.js → knowledgeBase-CYTLHOt1.js} +16 -16
- package/dist/src/{knowledgeBase-BZ41IFwq.js → knowledgeBase-D33Ty2l6.js} +14 -18
- package/dist/src/{knowledgeBase-D-5BMXlr.cjs → knowledgeBase-DOO_BM9b.cjs} +14 -15
- package/dist/src/{litellm-DnbRJ2if.js → litellm-AaeZcZQF.js} +18 -19
- package/dist/src/{litellm-hUSNM_M2.cjs → litellm-I_hbp_dc.cjs} +17 -17
- package/dist/src/{litellm-CRDqPhNI.js → litellm-NbjknEh6.js} +17 -18
- package/dist/src/{litellm-9vR8zpfU.js → litellm-TrljxD9G.js} +17 -5
- package/dist/src/{logger-CG1uZPbQ.js → logger-CT3IKMKA.js} +10 -29
- package/dist/src/{logger-B7sBeGa0.cjs → logger-Cp1GPUjj.cjs} +152 -180
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-LSBxlt7a.js → logger-KkObSCzq.js} +13 -31
- package/dist/src/{luma-ray-4blv9iZ2.js → luma-ray-BS2_tY8L.js} +22 -21
- package/dist/src/{luma-ray-drvgdpP9.js → luma-ray-DDsjcgZZ.js} +20 -13
- package/dist/src/{luma-ray-Hm3d6VJE.cjs → luma-ray-Due0n7di.cjs} +20 -21
- package/dist/src/{luma-ray-B2__8lYH.js → luma-ray-f6I2fft-.js} +20 -23
- package/dist/src/main.js +1170 -1321
- package/dist/src/{messages-Uee41Mj5.js → messages-BS17jdMx.js} +22 -24
- package/dist/src/{messages-XhiwCbi4.cjs → messages-Bs1kC7P4.cjs} +32 -34
- package/dist/src/{messages-CGPPidQr.js → messages-D0lx5qK7.js} +22 -24
- package/dist/src/messages-ZJk778GH.js +240 -0
- package/dist/src/{meteor-BYykdXrV.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-CsopaHrH.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-e-E-2vVl.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-C8lGP6P4.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/{modelslab-yKz-ZNB4.js → modelslab-Bmni6skY.js} +17 -10
- package/dist/src/{modelslab-E9gO-bYd.js → modelslab-Bx9IrZfS.js} +18 -20
- package/dist/src/{modelslab-lUVW0cmB.cjs → modelslab-CoUX6Jc_.cjs} +17 -18
- package/dist/src/{modelslab-ClBkr8_9.js → modelslab-DRb74SP4.js} +17 -19
- package/dist/src/{nova-reel-Dk8jNpId.js → nova-reel-BfPq-0Yk.js} +20 -13
- package/dist/src/{nova-reel-D8CuO6QH.cjs → nova-reel-C_QM18Xn.cjs} +20 -21
- package/dist/src/{nova-reel-u2eF2Cxm.js → nova-reel-D_W1tjMH.js} +22 -21
- package/dist/src/{nova-reel-P9bwvtYX.js → nova-reel-bgjxilYW.js} +20 -23
- package/dist/src/{nova-sonic-CK2rAiKi.js → nova-sonic-CFb5GYhg.js} +30 -26
- package/dist/src/{nova-sonic-BaqWlkds.js → nova-sonic-DIGQNR07.js} +30 -31
- package/dist/src/{nova-sonic-yZapPLv7.js → nova-sonic-De1HW5fD.js} +31 -32
- package/dist/src/{nova-sonic-Ds1C-dpm.cjs → nova-sonic-zfcljeRp.cjs} +30 -31
- package/dist/src/{openai-DUFopMrH.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-PblZ3jUE.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CcN1B8Sb.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-B6qqDr_u.cjs → openclaw-CSugPYAr.cjs} +188 -130
- package/dist/src/{openclaw-A-3_loM7.js → openclaw-DiSz3I5L.js} +180 -109
- package/dist/src/{openclaw-a3lylB-V.js → openclaw-DuvJKEW5.js} +178 -124
- package/dist/src/{openclaw-COn6QzDi.js → openclaw-tiVYRtr-.js} +178 -122
- package/dist/src/opencode-sdk-0j6rTWNb.js +562 -0
- package/dist/src/opencode-sdk-B3CWY9h_.js +560 -0
- package/dist/src/opencode-sdk-BL764Jdi.cjs +564 -0
- package/dist/src/opencode-sdk-C2y6UkP2.js +560 -0
- package/dist/src/{otlpReceiver-oyf5wLGC.js → otlpReceiver-C99PPb48.js} +53 -51
- package/dist/src/{otlpReceiver-lXsYVbpj.cjs → otlpReceiver-CGq6LspY.cjs} +53 -55
- package/dist/src/{otlpReceiver-94URx7UW.js → otlpReceiver-CdNBdbsk.js} +53 -55
- package/dist/src/{otlpReceiver-BmmTiMjA.js → otlpReceiver-D89fR-rC.js} +53 -55
- package/dist/src/{providerRegistry-Cq_JK_CJ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-DSSHjMKf.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-CvHEVJad.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B7V0njNs.js +32 -0
- package/dist/src/providers-BEwbhv0X.js +30 -0
- package/dist/src/{providers-Iil64vk9.js → providers-BlqUifFg.js} +1543 -1676
- package/dist/src/providers-CH3C7zf7.js +30 -0
- package/dist/src/{providers-DHbjzW2e.cjs → providers-CgKOSgTR.cjs} +1896 -2029
- package/dist/src/providers-D8lF1sqW.js +33246 -0
- package/dist/src/{providers-BnFpbY_s.js → providers-Dk_6ocUX.js} +1536 -1669
- package/dist/src/providers-zyB6k_38.cjs +31 -0
- package/dist/src/{pythonUtils-CcT5LH1M.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DBbuI3QJ.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-hZ8LeQLv.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/{quiverai-BuI0tE39.js → quiverai-BbOUOn2L.js} +8 -7
- package/dist/src/{quiverai-DCGSZt4U.js → quiverai-CIaELU_m.js} +8 -10
- package/dist/src/{quiverai-DiMVJQDz.cjs → quiverai-PdShCPox.cjs} +8 -9
- package/dist/src/{quiverai-fQNkExW4.js → quiverai-uH-dcTIr.js} +9 -11
- package/dist/src/{render-Dj1smHEb.js → render-Drod8m7K.js} +4 -5
- package/dist/src/responses-CB2jwoAr.js +660 -0
- package/dist/src/{responses-ghR3IOfy.cjs → responses-D8SBTL64.cjs} +39 -42
- package/dist/src/{responses-DOAFFENS.js → responses-DIR9Ud3j.js} +24 -27
- package/dist/src/{responses-CxzoQoBe.js → responses-WNGNYe3K.js} +24 -27
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-CwbGmgYN.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-DudlFZed.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-C8MhKGHb.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-gmskuyre.js → sagemaker-CNBxx5CJ.js} +75 -70
- package/dist/src/{sagemaker-CcxhlOAR.js → sagemaker-CemTFp2h.js} +75 -79
- package/dist/src/{sagemaker-77zbJ2Q2.cjs → sagemaker-Cl28mZU2.cjs} +75 -76
- package/dist/src/{sagemaker-DuM71dVU.js → sagemaker-YSyBXQQh.js} +77 -77
- package/dist/src/{scanner-DJYiSXQj.js → scanner-BsBlNXNn.js} +100 -121
- package/dist/src/server/index.js +5520 -67427
- package/dist/src/{server-B5v33lvE.cjs → server-C_7Ax-hA.cjs} +57 -67
- package/dist/src/{server-BJ4m4f1D.js → server-CqzrVGpF.js} +26 -29
- package/dist/src/server-CuxBbeSY.js +229 -0
- package/dist/src/server-DA4Cyrrq.js +7 -0
- package/dist/src/server-Dulb-4-K.cjs +5 -0
- package/dist/src/{server-RV_i_YX5.js → server-VWgWb00X.js} +19 -24
- package/dist/src/{signal-BW33JuId.js → signal-4U3mfRvL.js} +9 -11
- package/dist/src/{slack-DEURelTy.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-BQYeW9L3.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-BB6yuZzp.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-2pRrhhgJ.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-CXGFv4aR.js +228 -0
- package/dist/src/store-CXS-Q_91.js +6 -0
- package/dist/src/{store-D7CgQzAR.cjs → store-DLlFCC4h.cjs} +44 -45
- package/dist/src/{store-DJNsD1iC.js → store-DXilxTl-.js} +40 -36
- package/dist/src/{store-s3SftUwF.js → store-Dim__MDd.js} +34 -35
- package/dist/src/store-eYkaKMwq.cjs +5 -0
- package/dist/src/{tables-DfTsNN7X.js → tables-6YKwjN9-.js} +19 -21
- package/dist/src/tables-DLJPUdUE.js +288 -0
- package/dist/src/{tables-BKTmd6u7.cjs → tables-DPi7wKeM.cjs} +89 -91
- package/dist/src/{tables-DMegD0Xf.js → tables-gftXzE9I.js} +21 -23
- package/dist/src/telemetry-BpMfhthR.cjs +5 -0
- package/dist/src/{telemetry--WAdAfVi.js → telemetry-CMrFgtPB.js} +11 -13
- package/dist/src/telemetry-Cps3mIU-.js +171 -0
- package/dist/src/{telemetry-DQgVBCAb.cjs → telemetry-DaX14Chu.cjs} +21 -24
- package/dist/src/{telemetry-BedSm-bZ.js → telemetry-Dthj_BbD.js} +17 -14
- package/dist/src/telemetry-Dw38hanS.js +7 -0
- package/dist/src/{text-oiSbwSOI.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-oKzCBnK6.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-B_IrO4GZ.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-mYS9vd5v.js → transcription-BNYURcXg.js} +14 -7
- package/dist/src/{transcription-X2-B4vkX.js → transcription-B_OdaHp7.js} +14 -16
- package/dist/src/{transcription-BO1AHegO.cjs → transcription-NLVG9MT1.cjs} +14 -15
- package/dist/src/{transcription-lzBLiTFJ.js → transcription-s6A-bNrZ.js} +15 -17
- package/dist/src/{transform-B1Hi5lWS.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-DeGlxb0D.js → transform-D5HsjduX.js} +39 -47
- package/dist/src/{transform-CYDILYDe.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-BEgStbHK.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/{transform-D5PjiWiZ.cjs → transform-DuHvhZpj.cjs} +179 -187
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/transform-uAytVuyX.js +1506 -0
- package/dist/src/{transform-Dfl89yi4.js → transform-vNucnNr0.js} +39 -47
- package/dist/src/{transformersAvailability-SZnTS3pJ.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-D-glmEy7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-CjeFXhuJ.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-DWNf48sT.cjs → types-C_7nyzr1.cjs} +538 -574
- package/dist/src/{types-CXQduE9o.js → types-Cbd8uOMq.js} +68 -100
- package/dist/src/types-CzW2QFyi.js +3288 -0
- package/dist/src/{types-C5hEkb-x.js → types-DmyIJ-sR.js} +63 -99
- package/dist/src/{util-CoQjmE3u.js → util-B3xGByQh.js} +4 -5
- package/dist/src/{util-aLhtl3fe.cjs → util-B9vlHIIh.cjs} +208 -223
- package/dist/src/{util-Du96oyYS.js → util-BHGHw5G1.js} +4 -5
- package/dist/src/{util-DQ984syk.js → util-BRYkYPTd.js} +36 -51
- package/dist/src/{util-D9eLdGfa.js → util-BV4XUC0n.js} +5 -6
- package/dist/src/util-Bv6uGDfH.js +293 -0
- package/dist/src/{util-1wWM599Z.cjs → util-BzMcevZc.cjs} +50 -51
- package/dist/src/{util-_h4pVqrz.js → util-C1CeHl-P.js} +36 -51
- package/dist/src/{util-Bm_-UMD_.js → util-CMy69ZgQ.js} +5 -6
- package/dist/src/{util-CyUdMzV0.cjs → util-DGNOS1db.cjs} +34 -35
- package/dist/src/util-Dnmk2mBQ.js +599 -0
- package/dist/src/util-ZzmqNPlg.js +1426 -0
- package/dist/src/{utils-BjLy-Q72.cjs → utils-Cz9qXqII.cjs} +29 -32
- package/dist/src/{utils-CFMn2yHW.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-DvWMzuMx.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-B2D0bCSI.js +0 -439
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/cache-ChPcurj7.js +0 -6
- package/dist/src/cache-VVu_W-yg.js +0 -8
- package/dist/src/cache-YLNCFEM2.cjs +0 -6
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-BndfXy4H.js +0 -5
- package/dist/src/eval-BhHvMY82.js +0 -17
- package/dist/src/evalResult-Dq2gFNQY.js +0 -12
- package/dist/src/evalResult-nmcP5VKH.cjs +0 -12
- package/dist/src/evalResult-trqZjVYh.js +0 -14
- package/dist/src/evaluator-CnfPstzT.js +0 -39
- package/dist/src/fetch-IDPDue6F.cjs +0 -4
- package/dist/src/fetch-hKJ-It8q.js +0 -6
- package/dist/src/fetch-ouKnrWK-.js +0 -4
- package/dist/src/graders-CQn7WUsd.cjs +0 -34
- package/dist/src/graders-DC6QAbpW.js +0 -35
- package/dist/src/graders-DUWz3Y7j.js +0 -37
- package/dist/src/opencode-sdk-4bL9n-Gk.js +0 -382
- package/dist/src/opencode-sdk-BfC2zWcR.js +0 -376
- package/dist/src/opencode-sdk-DMJyuwMg.js +0 -380
- package/dist/src/opencode-sdk-Da-9adza.cjs +0 -383
- package/dist/src/providers-CsXB2Ix-.js +0 -35
- package/dist/src/providers-DO8ltjLC.js +0 -33
- package/dist/src/providers-Dtq-xnXd.cjs +0 -33
- package/dist/src/rubyUtils-BUbcND2f.js +0 -6
- package/dist/src/rubyUtils-Cr55X_KE.js +0 -5
- package/dist/src/rubyUtils-DlIiqoYo.cjs +0 -5
- package/dist/src/server-C2eQH4Gu.js +0 -6
- package/dist/src/server-CXWycu7H.cjs +0 -6
- package/dist/src/server-Q6OGlxxT.js +0 -8
- package/dist/src/store-B3EDO9Q3.js +0 -7
- package/dist/src/store-Dl9F8aw5.js +0 -6
- package/dist/src/store-SnrGrlt9.cjs +0 -6
- package/dist/src/telemetry-BGhiPZtl.js +0 -8
- package/dist/src/telemetry-CFfiYan6.cjs +0 -6
- package/dist/src/telemetry-DHzEduxX.js +0 -6
- package/dist/src/transform-C1x1ZlMQ.cjs +0 -6
- package/dist/src/transform-DYHjFmQu.js +0 -8
- package/dist/src/transform-rmwJT5JQ.js +0 -7
- package/dist/src/transformersAvailability-eJooj0gX.js +0 -35
package/dist/src/index.js
CHANGED
|
@@ -1,41 +1,40 @@
|
|
|
1
|
-
import "./
|
|
2
|
-
import {
|
|
3
|
-
import { t as
|
|
4
|
-
import { r as
|
|
5
|
-
import {
|
|
6
|
-
import { i as
|
|
7
|
-
import { n as
|
|
8
|
-
import {
|
|
9
|
-
import { A as
|
|
10
|
-
import { A as
|
|
11
|
-
import {
|
|
12
|
-
import { a as
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import
|
|
16
|
-
import "./
|
|
17
|
-
import { t as
|
|
18
|
-
import {
|
|
19
|
-
import
|
|
20
|
-
import "./
|
|
21
|
-
import "./
|
|
22
|
-
import "./
|
|
23
|
-
import "./
|
|
24
|
-
import
|
|
25
|
-
import "./
|
|
26
|
-
import {
|
|
27
|
-
import {
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
31
|
-
import {
|
|
32
|
-
import { t as
|
|
33
|
-
import
|
|
34
|
-
import "./
|
|
35
|
-
import "./
|
|
36
|
-
import {
|
|
37
|
-
import {
|
|
38
|
-
import { t as EvalResult } from "./evalResult-4BzI2tmj.js";
|
|
1
|
+
import { C as isCI, S as getMaxEvalTimeMs, _ as getEnvBool, a as setLogCallback, b as getEnvString, d as getAjv, h as summarizeEvaluateResultForLogging, i as logger, m as safeJsonStringify, n as isDebugEnabled, o as setLogLevel, p as orderKeys, t as getLogLevel, u as extractJsonObjects, v as getEnvFloat, w as state, x as getEvalTimeoutMs, y as getEnvInt } from "./logger-CT3IKMKA.js";
|
|
2
|
+
import { t as invariant } from "./invariant-Ddh24eXh.js";
|
|
3
|
+
import { r as importModule, t as getDirectory } from "./esm-Cd1AjG1D.js";
|
|
4
|
+
import { r as runPython } from "./pythonUtils-D5nxkQ0P.js";
|
|
5
|
+
import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
|
|
6
|
+
import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-DECvGmzp.js";
|
|
7
|
+
import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as processPrompts, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as SUGGEST_PROMPTS_SYSTEM_MESSAGE, d as RbacPlugin, dt as loadFromJavaScriptFile, et as matchesSelectBest, f as PromptExtractionPlugin, ft as processFileReference, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as DefaultSuggestionsProvider, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as coerceString, m as PolicyPlugin, n as getGraderById, nt as selectMaxScore, o as UnsafeBenchPlugin, ot as readPrompts, p as PoliticsPlugin, pt as resolveContext, q as matchesContextRelevance, r as VLSUPlugin, rt as getDefaultProviders, s as ToxicChatPlugin, st as readProviderPromptMap, t as GRADERS, tt as matchesSimilarity, u as ShellInjectionPlugin, ut as getFinalTest, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as fail } from "./graders-BTeBGqjJ.js";
|
|
8
|
+
import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-DmyIJ-sR.js";
|
|
9
|
+
import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-BRYkYPTd.js";
|
|
10
|
+
import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-Cwxnd8zz.js";
|
|
11
|
+
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-C4Nxf52C.js";
|
|
12
|
+
import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-Dk_6ocUX.js";
|
|
13
|
+
import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-XiOAgly5.js";
|
|
14
|
+
import { n as sha256, t as randomSequence } from "./createHash-DmPQkvBh.js";
|
|
15
|
+
import "./genaiTracer-D3fD9dNV.js";
|
|
16
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-DqxYYtWA.js";
|
|
17
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-NYT-WKS6.js";
|
|
18
|
+
import { m as validateFunctionCall } from "./transform-vNucnNr0.js";
|
|
19
|
+
import "./messages-BS17jdMx.js";
|
|
20
|
+
import "./util-BHGHw5G1.js";
|
|
21
|
+
import "./responses-DIR9Ud3j.js";
|
|
22
|
+
import "./openai-DElQ-fPX.js";
|
|
23
|
+
import { l as validateFunctionCall$1 } from "./util-Dnmk2mBQ.js";
|
|
24
|
+
import "./completion-5MzrpJxT.js";
|
|
25
|
+
import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-CiBLOnA7.js";
|
|
26
|
+
import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-VWgWb00X.js";
|
|
27
|
+
import { t as getBlobByHash } from "./blobs-CBO20krR.js";
|
|
28
|
+
import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-6YKwjN9-.js";
|
|
29
|
+
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-C0EVHewb.js";
|
|
30
|
+
import { t as telemetry } from "./telemetry-Dthj_BbD.js";
|
|
31
|
+
import { t as ellipsize } from "./text-B_UCRPp2.js";
|
|
32
|
+
import { t as getTraceStore } from "./store-DXilxTl-.js";
|
|
33
|
+
import "./base-DLKtKMFh.js";
|
|
34
|
+
import "./image-CHfWvljl.js";
|
|
35
|
+
import { t as providerRegistry } from "./providerRegistry-CD8MEar9.js";
|
|
36
|
+
import { n as runRuby } from "./rubyUtils-BUVePouc.js";
|
|
37
|
+
import { t as EvalResult } from "./evalResult-BDMqrapS.js";
|
|
39
38
|
import * as fs$1 from "fs";
|
|
40
39
|
import fs, { createWriteStream } from "fs";
|
|
41
40
|
import * as path$2 from "path";
|
|
@@ -57,7 +56,7 @@ import { XMLParser } from "fast-xml-parser";
|
|
|
57
56
|
import crypto$1, { createHash, randomBytes } from "crypto";
|
|
58
57
|
import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
|
|
59
58
|
import input from "@inquirer/input";
|
|
60
|
-
import { and,
|
|
59
|
+
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
|
61
60
|
import cliProgress from "cli-progress";
|
|
62
61
|
import { JSDOM } from "jsdom";
|
|
63
62
|
import { distance } from "fastest-levenshtein";
|
|
@@ -76,7 +75,6 @@ import chokidar from "chokidar";
|
|
|
76
75
|
import ora from "ora";
|
|
77
76
|
import { URL } from "url";
|
|
78
77
|
import "@inquirer/confirm";
|
|
79
|
-
|
|
80
78
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
81
79
|
var ConversationRelevancyTemplate = class {
|
|
82
80
|
static generateVerdicts(slidingWindow) {
|
|
@@ -148,7 +146,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
148
146
|
JSON:`;
|
|
149
147
|
}
|
|
150
148
|
};
|
|
151
|
-
|
|
152
149
|
//#endregion
|
|
153
150
|
//#region src/external/matchers/deepeval.ts
|
|
154
151
|
const nunjucks$1 = getNunjucksEngine(void 0, false, true);
|
|
@@ -198,7 +195,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
198
195
|
return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
199
196
|
}
|
|
200
197
|
}
|
|
201
|
-
|
|
202
198
|
//#endregion
|
|
203
199
|
//#region src/external/assertions/deepeval.ts
|
|
204
200
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -253,7 +249,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
253
249
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
254
250
|
};
|
|
255
251
|
};
|
|
256
|
-
|
|
257
252
|
//#endregion
|
|
258
253
|
//#region src/tracing/evaluatorTracing.ts
|
|
259
254
|
let otlpReceiverStarted = false;
|
|
@@ -286,28 +281,28 @@ function isOtlpReceiverStarted() {
|
|
|
286
281
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
287
282
|
*/
|
|
288
283
|
async function startOtlpReceiverIfNeeded(testSuite) {
	// Resolve the relevant config once; the same values feed the logs and the decision below.
	const tracing = testSuite.tracing;
	const httpReceiver = tracing?.otlp?.http;

	// Dump the configuration up front so a skipped start-up can be diagnosed from debug logs.
	logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(tracing)}`);
	logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
	logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(tracing, null, 2)}`);

	// A receiver that is already running is never started a second time.
	if (otlpReceiverStarted) {
		logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
		return;
	}

	// Both the tracing switch and the OTLP HTTP switch must be on.
	if (!tracing?.enabled || !httpReceiver?.enabled) {
		logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
		logger.debug(`[EvaluatorTracing] tracing.enabled: ${tracing?.enabled}`);
		logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${httpReceiver?.enabled}`);
		return;
	}

	telemetry.record("feature_used", { feature: "tracing" });
	try {
		logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
		// The receiver module is only loaded when it is actually needed.
		const { startOTLPReceiver } = await import("./otlpReceiver-CdNBdbsk.js");
		const port = httpReceiver.port || 4318;
		const host = httpReceiver.host || "127.0.0.1";
		logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
		await startOTLPReceiver(port, host);
		otlpReceiverStarted = true;
		logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
	} catch (error) {
		// A start-up failure is logged and not rethrown; the flag stays false.
		logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
	}
}
|
|
313
308
|
/**
|
|
@@ -315,13 +310,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
315
310
|
*/
|
|
316
311
|
async function stopOtlpReceiverIfNeeded() {
	// Nothing to do unless this module recorded a successful start.
	if (!otlpReceiverStarted) return;

	try {
		logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
		const receiverModule = await import("./otlpReceiver-CdNBdbsk.js");
		const { stopOTLPReceiver } = receiverModule;
		await stopOTLPReceiver();
		// Only clear the flag once the receiver has actually shut down.
		otlpReceiverStarted = false;
		logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
	} catch (error) {
		// A shutdown failure is logged and not rethrown; the flag stays set.
		logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
	}
}
|
|
327
322
|
/**
|
|
@@ -337,7 +332,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
337
332
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
338
333
|
const envEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
339
334
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
340
|
-
|
|
335
|
+
logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
341
336
|
return result;
|
|
342
337
|
}
|
|
343
338
|
/**
|
|
@@ -346,25 +341,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
346
341
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
347
342
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
348
343
|
if (tracingEnabled) {
|
|
349
|
-
|
|
350
|
-
|
|
344
|
+
logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
345
|
+
logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
351
346
|
}
|
|
352
347
|
if (!tracingEnabled) return null;
|
|
353
|
-
|
|
354
|
-
const { getTraceStore } = await import("./store-
|
|
348
|
+
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
349
|
+
const { getTraceStore } = await import("./store-DXilxTl-.js").then((n) => n.n);
|
|
355
350
|
const traceStore = getTraceStore();
|
|
356
351
|
const traceId = generateTraceId();
|
|
357
352
|
const spanId = generateSpanId();
|
|
358
353
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
359
|
-
|
|
354
|
+
logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
360
355
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
361
356
|
if (!evaluationId) {
|
|
362
|
-
|
|
357
|
+
logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
363
358
|
evaluationId = `eval-${Date.now()}`;
|
|
364
359
|
}
|
|
365
360
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
366
361
|
try {
|
|
367
|
-
|
|
362
|
+
logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
368
363
|
await traceStore.createTrace({
|
|
369
364
|
traceId,
|
|
370
365
|
evaluationId: evaluationId || "",
|
|
@@ -375,18 +370,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
375
370
|
vars: test.vars
|
|
376
371
|
}
|
|
377
372
|
});
|
|
378
|
-
|
|
373
|
+
logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
379
374
|
} catch (error) {
|
|
380
|
-
|
|
375
|
+
logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
381
376
|
}
|
|
382
|
-
|
|
377
|
+
logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
383
378
|
return {
|
|
384
379
|
traceparent,
|
|
385
380
|
evaluationId,
|
|
386
381
|
testCaseId
|
|
387
382
|
};
|
|
388
383
|
}
|
|
389
|
-
|
|
390
384
|
//#endregion
|
|
391
385
|
//#region src/assertions/answerRelevance.ts
|
|
392
386
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -397,7 +391,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
397
391
|
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
398
392
|
};
|
|
399
393
|
};
|
|
400
|
-
|
|
401
394
|
//#endregion
|
|
402
395
|
//#region src/assertions/assertionsResult.ts
|
|
403
396
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -503,7 +496,6 @@ var AssertionsResult = class {
|
|
|
503
496
|
return this.result;
|
|
504
497
|
}
|
|
505
498
|
};
|
|
506
|
-
|
|
507
499
|
//#endregion
|
|
508
500
|
//#region src/assertions/ngrams.ts
|
|
509
501
|
/**
|
|
@@ -519,7 +511,6 @@ function getNGrams(words, n) {
|
|
|
519
511
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
520
512
|
return ngrams;
|
|
521
513
|
}
|
|
522
|
-
|
|
523
514
|
//#endregion
|
|
524
515
|
//#region src/assertions/bleu.ts
|
|
525
516
|
/**
|
|
@@ -615,7 +606,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
615
606
|
assertion
|
|
616
607
|
};
|
|
617
608
|
}
|
|
618
|
-
|
|
619
609
|
//#endregion
|
|
620
610
|
//#region src/assertions/classifier.ts
|
|
621
611
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -630,9 +620,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
630
620
|
...classificationResult
|
|
631
621
|
};
|
|
632
622
|
}
|
|
633
|
-
|
|
634
623
|
//#endregion
|
|
635
624
|
//#region src/assertions/contains.ts
|
|
625
|
+
/**
 * Split a comma-separated string into a list of values.
 *
 * Unquoted values run up to the next comma and are trimmed. A value that
 * begins with a double quote is read up to its closing quote and kept
 * verbatim, so it may contain commas and surrounding whitespace. Inside
 * quotes, a backslash before a quote or another backslash is dropped, and two
 * consecutive quotes yield one literal quote. Empty unquoted slots (as in
 * `a,,b`) produce no entry, and a quoted value with no closing quote extends
 * to the end of the input.
 *
 * @param {string} value - Raw comma-separated text.
 * @returns {string[]} Parsed values in order of appearance.
 */
function parseCommaSeparatedValues(value) {
	const total = value.length;
	const parsed = [];
	let pos = 0;

	// Reads a quoted value; `pos` must point just past the opening quote.
	const readQuoted = () => {
		const pieces = [];
		while (pos < total) {
			const ch = value[pos];
			const next = pos + 1 < total ? value[pos + 1] : void 0;
			if (ch === "\\" && (next === "\"" || next === "\\")) {
				// Backslash escape: keep only the escaped character.
				pieces.push(next);
				pos += 2;
				continue;
			}
			if (ch === "\"") {
				if (next === "\"") {
					// Doubled quote stands for one literal quote.
					pieces.push("\"");
					pos += 2;
					continue;
				}
				// Closing quote: consume it and finish the value.
				pos += 1;
				break;
			}
			pieces.push(ch);
			pos += 1;
		}
		return pieces.join("");
	};

	// Reads an unquoted value up to, but not including, the next comma.
	const readBare = () => {
		const comma = value.indexOf(",", pos);
		const stop = comma === -1 ? total : comma;
		const text = value.slice(pos, stop).trim();
		pos = stop;
		return text;
	};

	while (pos < total) {
		const ch = value[pos];
		// Separators and whitespace between values carry no content.
		if (ch === "," || /\s/.test(ch)) {
			pos += 1;
			continue;
		}
		if (ch === "\"") {
			pos += 1;
			parsed.push(readQuoted());
		} else {
			parsed.push(readBare());
		}
	}
	return parsed;
}
|
|
636
660
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
637
661
|
const value = valueFromScript ?? renderedValue;
|
|
638
662
|
invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -660,7 +684,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
660
684
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
661
685
|
let value = valueFromScript ?? renderedValue;
|
|
662
686
|
invariant(value, "\"contains-any\" assertion type must have a value");
|
|
663
|
-
if (typeof value === "string") value = value
|
|
687
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
664
688
|
invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
665
689
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
666
690
|
return {
|
|
@@ -673,7 +697,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
673
697
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
674
698
|
let value = valueFromScript ?? renderedValue;
|
|
675
699
|
invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
676
|
-
if (typeof value === "string") value = value
|
|
700
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
677
701
|
invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
678
702
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
679
703
|
return {
|
|
@@ -686,7 +710,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
686
710
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
687
711
|
let value = valueFromScript ?? renderedValue;
|
|
688
712
|
invariant(value, "\"contains-all\" assertion type must have a value");
|
|
689
|
-
if (typeof value === "string") value = value
|
|
713
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
690
714
|
invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
691
715
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
692
716
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -700,7 +724,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
700
724
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
701
725
|
let value = valueFromScript ?? renderedValue;
|
|
702
726
|
invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
703
|
-
if (typeof value === "string") value = value
|
|
727
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
704
728
|
invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
705
729
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
706
730
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -711,7 +735,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
711
735
|
assertion
|
|
712
736
|
};
|
|
713
737
|
};
|
|
714
|
-
|
|
715
738
|
//#endregion
|
|
716
739
|
//#region src/assertions/contextFaithfulness.ts
|
|
717
740
|
/**
|
|
@@ -735,7 +758,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
735
758
|
metadata: { context }
|
|
736
759
|
};
|
|
737
760
|
}
|
|
738
|
-
|
|
739
761
|
//#endregion
|
|
740
762
|
//#region src/assertions/contextRecall.ts
|
|
741
763
|
/**
|
|
@@ -762,7 +784,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
762
784
|
}
|
|
763
785
|
};
|
|
764
786
|
};
|
|
765
|
-
|
|
766
787
|
//#endregion
|
|
767
788
|
//#region src/assertions/contextRelevance.ts
|
|
768
789
|
/**
|
|
@@ -789,7 +810,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
789
810
|
}
|
|
790
811
|
};
|
|
791
812
|
};
|
|
792
|
-
|
|
793
813
|
//#endregion
|
|
794
814
|
//#region src/assertions/cost.ts
|
|
795
815
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -803,7 +823,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
803
823
|
assertion
|
|
804
824
|
};
|
|
805
825
|
};
|
|
806
|
-
|
|
807
826
|
//#endregion
|
|
808
827
|
//#region src/assertions/equals.ts
|
|
809
828
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -823,7 +842,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
823
842
|
assertion
|
|
824
843
|
};
|
|
825
844
|
};
|
|
826
|
-
|
|
827
845
|
//#endregion
|
|
828
846
|
//#region src/assertions/factuality.ts
|
|
829
847
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -834,7 +852,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
834
852
|
...await matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
835
853
|
};
|
|
836
854
|
};
|
|
837
|
-
|
|
838
855
|
//#endregion
|
|
839
856
|
//#region src/assertions/finishReason.ts
|
|
840
857
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -854,7 +871,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
854
871
|
assertion
|
|
855
872
|
};
|
|
856
873
|
}
|
|
857
|
-
|
|
858
874
|
//#endregion
|
|
859
875
|
//#region src/assertions/functionToolCall.ts
|
|
860
876
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -877,7 +893,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
877
893
|
};
|
|
878
894
|
}
|
|
879
895
|
};
|
|
880
|
-
|
|
881
896
|
//#endregion
|
|
882
897
|
//#region src/assertions/geval.ts
|
|
883
898
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -903,7 +918,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
903
918
|
...await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
904
919
|
};
|
|
905
920
|
};
|
|
906
|
-
|
|
907
921
|
//#endregion
|
|
908
922
|
//#region src/assertions/gleu.ts
|
|
909
923
|
/**
|
|
@@ -1002,7 +1016,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
1002
1016
|
assertion
|
|
1003
1017
|
};
|
|
1004
1018
|
}
|
|
1005
|
-
|
|
1006
1019
|
//#endregion
|
|
1007
1020
|
//#region src/assertions/guardrails.ts
|
|
1008
1021
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -1011,7 +1024,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1011
1024
|
flaggedInput: false,
|
|
1012
1025
|
flaggedOutput: false
|
|
1013
1026
|
};
|
|
1014
|
-
|
|
1027
|
+
logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
1015
1028
|
if (guardrails) if (guardrails.flagged) {
|
|
1016
1029
|
let reason = "Content failed safety checks";
|
|
1017
1030
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1040,7 +1053,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1040
1053
|
assertion
|
|
1041
1054
|
};
|
|
1042
1055
|
};
|
|
1043
|
-
|
|
1044
1056
|
//#endregion
|
|
1045
1057
|
//#region src/assertions/html.ts
|
|
1046
1058
|
const HTML_PATTERNS = {
|
|
@@ -1249,7 +1261,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1249
1261
|
assertion
|
|
1250
1262
|
};
|
|
1251
1263
|
};
|
|
1252
|
-
|
|
1253
1264
|
//#endregion
|
|
1254
1265
|
//#region src/assertions/javascript.ts
|
|
1255
1266
|
/**
|
|
@@ -1390,7 +1401,6 @@ ${renderedValue}`,
|
|
|
1390
1401
|
assertion
|
|
1391
1402
|
};
|
|
1392
1403
|
};
|
|
1393
|
-
|
|
1394
1404
|
//#endregion
|
|
1395
1405
|
//#region src/assertions/json.ts
|
|
1396
1406
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1402,7 +1412,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1402
1412
|
} catch {
|
|
1403
1413
|
pass = inverse;
|
|
1404
1414
|
}
|
|
1405
|
-
if (
|
|
1415
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1406
1416
|
let validate;
|
|
1407
1417
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1408
1418
|
const schema = valueFromScript;
|
|
@@ -1414,11 +1424,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1414
1424
|
}
|
|
1415
1425
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1416
1426
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1417
|
-
|
|
1427
|
+
const valid = validate(parsedJson);
|
|
1428
|
+
pass = inverse ? !valid : valid;
|
|
1418
1429
|
if (!pass) return {
|
|
1419
1430
|
pass,
|
|
1420
1431
|
score: 0,
|
|
1421
|
-
reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1432
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1422
1433
|
assertion
|
|
1423
1434
|
};
|
|
1424
1435
|
}
|
|
@@ -1445,9 +1456,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1445
1456
|
}
|
|
1446
1457
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1447
1458
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1459
|
+
const valid = validate(jsonObject);
|
|
1460
|
+
pass = inverse ? !valid : valid;
|
|
1461
|
+
if (valid) {
|
|
1462
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1463
|
+
break;
|
|
1464
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
|
|
1451
1465
|
}
|
|
1452
1466
|
return {
|
|
1453
1467
|
pass,
|
|
@@ -1456,7 +1470,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1456
1470
|
assertion
|
|
1457
1471
|
};
|
|
1458
1472
|
}
|
|
1459
|
-
|
|
1460
1473
|
//#endregion
|
|
1461
1474
|
//#region src/assertions/latency.ts
|
|
1462
1475
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1470,7 +1483,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1470
1483
|
assertion
|
|
1471
1484
|
};
|
|
1472
1485
|
};
|
|
1473
|
-
|
|
1474
1486
|
//#endregion
|
|
1475
1487
|
//#region src/assertions/levenshtein.ts
|
|
1476
1488
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1485,7 +1497,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1485
1497
|
assertion
|
|
1486
1498
|
};
|
|
1487
1499
|
}
|
|
1488
|
-
|
|
1489
1500
|
//#endregion
|
|
1490
1501
|
//#region src/assertions/llmRubric.ts
|
|
1491
1502
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1494,7 +1505,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1494
1505
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1495
1506
|
return matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1496
1507
|
};
|
|
1497
|
-
|
|
1498
1508
|
//#endregion
|
|
1499
1509
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1500
1510
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1505,7 +1515,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1505
1515
|
...await matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1506
1516
|
};
|
|
1507
1517
|
};
|
|
1508
|
-
|
|
1509
1518
|
//#endregion
|
|
1510
1519
|
//#region src/util/providerResponse.ts
|
|
1511
1520
|
/**
|
|
@@ -1548,7 +1557,6 @@ function getActualPrompt(response, options = {}) {
|
|
|
1548
1557
|
function getActualPromptWithFallback(response, originalPrompt, options = {}) {
|
|
1549
1558
|
return getActualPrompt(response, options) || originalPrompt;
|
|
1550
1559
|
}
|
|
1551
|
-
|
|
1552
1560
|
//#endregion
|
|
1553
1561
|
//#region src/assertions/moderation.ts
|
|
1554
1562
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1571,7 +1579,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1571
1579
|
assertion
|
|
1572
1580
|
};
|
|
1573
1581
|
};
|
|
1574
|
-
|
|
1575
1582
|
//#endregion
|
|
1576
1583
|
//#region src/assertions/openai.ts
|
|
1577
1584
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1632,7 +1639,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1632
1639
|
};
|
|
1633
1640
|
}
|
|
1634
1641
|
};
|
|
1635
|
-
|
|
1636
1642
|
//#endregion
|
|
1637
1643
|
//#region src/assertions/perplexity.ts
|
|
1638
1644
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1659,7 +1665,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1659
1665
|
assertion
|
|
1660
1666
|
};
|
|
1661
1667
|
}
|
|
1662
|
-
|
|
1663
1668
|
//#endregion
|
|
1664
1669
|
//#region src/assertions/pi.ts
|
|
1665
1670
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1667,7 +1672,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1667
1672
|
invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1668
1673
|
return matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1669
1674
|
};
|
|
1670
|
-
|
|
1671
1675
|
//#endregion
|
|
1672
1676
|
//#region src/python/wrapper.ts
|
|
1673
1677
|
/**
|
|
@@ -1683,17 +1687,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1683
1687
|
fs.writeFileSync(tempFilePath, code);
|
|
1684
1688
|
return await runPython(tempFilePath, method, args);
|
|
1685
1689
|
} catch (error) {
|
|
1686
|
-
|
|
1690
|
+
logger.error(`Error executing Python code: ${error}`);
|
|
1687
1691
|
throw error;
|
|
1688
1692
|
} finally {
|
|
1689
1693
|
try {
|
|
1690
1694
|
fs.unlinkSync(tempFilePath);
|
|
1691
1695
|
} catch (error) {
|
|
1692
|
-
|
|
1696
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1693
1697
|
}
|
|
1694
1698
|
}
|
|
1695
1699
|
}
|
|
1696
|
-
|
|
1697
1700
|
//#endregion
|
|
1698
1701
|
//#region src/util/caseMapping.ts
|
|
1699
1702
|
/**
|
|
@@ -1717,7 +1720,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1717
1720
|
});
|
|
1718
1721
|
return result;
|
|
1719
1722
|
}
|
|
1720
|
-
|
|
1721
1723
|
//#endregion
|
|
1722
1724
|
//#region src/assertions/python.ts
|
|
1723
1725
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1787,7 +1789,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1787
1789
|
assertion
|
|
1788
1790
|
};
|
|
1789
1791
|
};
|
|
1790
|
-
|
|
1791
1792
|
//#endregion
|
|
1792
1793
|
//#region src/assertions/redteam.ts
|
|
1793
1794
|
/**
|
|
@@ -1868,7 +1869,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1868
1869
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1869
1870
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1870
1871
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1871
|
-
|
|
1872
|
+
logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1872
1873
|
error: errorMessage,
|
|
1873
1874
|
strategyId: test.metadata.strategyId,
|
|
1874
1875
|
pluginId: test.metadata.pluginId
|
|
@@ -1888,7 +1889,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1888
1889
|
throw error;
|
|
1889
1890
|
}
|
|
1890
1891
|
};
|
|
1891
|
-
|
|
1892
1892
|
//#endregion
|
|
1893
1893
|
//#region src/assertions/refusal.ts
|
|
1894
1894
|
function handleIsRefusal(params) {
|
|
@@ -1916,7 +1916,6 @@ function handleIsRefusal(params) {
|
|
|
1916
1916
|
assertion
|
|
1917
1917
|
};
|
|
1918
1918
|
}
|
|
1919
|
-
|
|
1920
1919
|
//#endregion
|
|
1921
1920
|
//#region src/assertions/regex.ts
|
|
1922
1921
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1941,7 +1940,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1941
1940
|
assertion
|
|
1942
1941
|
};
|
|
1943
1942
|
};
|
|
1944
|
-
|
|
1945
1943
|
//#endregion
|
|
1946
1944
|
//#region src/assertions/rouge.ts
|
|
1947
1945
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1957,7 +1955,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1957
1955
|
assertion
|
|
1958
1956
|
};
|
|
1959
1957
|
}
|
|
1960
|
-
|
|
1961
1958
|
//#endregion
|
|
1962
1959
|
//#region src/ruby/wrapper.ts
|
|
1963
1960
|
/**
|
|
@@ -1973,17 +1970,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1973
1970
|
fs.writeFileSync(tempFilePath, code);
|
|
1974
1971
|
return await runRuby(tempFilePath, method, args);
|
|
1975
1972
|
} catch (error) {
|
|
1976
|
-
|
|
1973
|
+
logger.error(`Error executing Ruby code: ${error}`);
|
|
1977
1974
|
throw error;
|
|
1978
1975
|
} finally {
|
|
1979
1976
|
try {
|
|
1980
1977
|
fs.unlinkSync(tempFilePath);
|
|
1981
1978
|
} catch (error) {
|
|
1982
|
-
|
|
1979
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1983
1980
|
}
|
|
1984
1981
|
}
|
|
1985
1982
|
}
|
|
1986
|
-
|
|
1987
1983
|
//#endregion
|
|
1988
1984
|
//#region src/assertions/ruby.ts
|
|
1989
1985
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -2054,7 +2050,6 @@ end
|
|
|
2054
2050
|
assertion
|
|
2055
2051
|
};
|
|
2056
2052
|
};
|
|
2057
|
-
|
|
2058
2053
|
//#endregion
|
|
2059
2054
|
//#region src/assertions/searchRubric.ts
|
|
2060
2055
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -2066,7 +2061,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
2066
2061
|
}
|
|
2067
2062
|
return result;
|
|
2068
2063
|
}
|
|
2069
|
-
|
|
2070
2064
|
//#endregion
|
|
2071
2065
|
//#region src/assertions/similar.ts
|
|
2072
2066
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2109,7 +2103,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2109
2103
|
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2110
2104
|
};
|
|
2111
2105
|
};
|
|
2112
|
-
|
|
2113
2106
|
//#endregion
|
|
2114
2107
|
//#region src/assertions/sql.ts
|
|
2115
2108
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2201,7 +2194,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2201
2194
|
}
|
|
2202
2195
|
return handleIsSql(assertionParams);
|
|
2203
2196
|
};
|
|
2204
|
-
|
|
2205
2197
|
//#endregion
|
|
2206
2198
|
//#region src/assertions/startsWith.ts
|
|
2207
2199
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2215,7 +2207,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2215
2207
|
assertion
|
|
2216
2208
|
};
|
|
2217
2209
|
};
|
|
2218
|
-
|
|
2219
2210
|
//#endregion
|
|
2220
2211
|
//#region src/assertions/toolCallF1.ts
|
|
2221
2212
|
/**
|
|
@@ -2344,7 +2335,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2344
2335
|
assertion
|
|
2345
2336
|
};
|
|
2346
2337
|
};
|
|
2347
|
-
|
|
2348
2338
|
//#endregion
|
|
2349
2339
|
//#region src/assertions/traceUtils.ts
|
|
2350
2340
|
/**
|
|
@@ -2362,7 +2352,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2362
2352
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2363
2353
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2364
2354
|
}
|
|
2365
|
-
|
|
2366
2355
|
//#endregion
|
|
2367
2356
|
//#region src/assertions/traceErrorSpans.ts
|
|
2368
2357
|
function isErrorSpan(span) {
|
|
@@ -2440,7 +2429,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2440
2429
|
assertion
|
|
2441
2430
|
};
|
|
2442
2431
|
};
|
|
2443
|
-
|
|
2444
2432
|
//#endregion
|
|
2445
2433
|
//#region src/assertions/traceSpanCount.ts
|
|
2446
2434
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2475,7 +2463,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2475
2463
|
assertion
|
|
2476
2464
|
};
|
|
2477
2465
|
};
|
|
2478
|
-
|
|
2479
2466
|
//#endregion
|
|
2480
2467
|
//#region src/assertions/traceSpanDuration.ts
|
|
2481
2468
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2533,7 +2520,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2533
2520
|
assertion
|
|
2534
2521
|
};
|
|
2535
2522
|
};
|
|
2536
|
-
|
|
2537
2523
|
//#endregion
|
|
2538
2524
|
//#region src/assertions/webhook.ts
|
|
2539
2525
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2570,7 +2556,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2570
2556
|
};
|
|
2571
2557
|
}
|
|
2572
2558
|
}
|
|
2573
|
-
|
|
2574
2559
|
//#endregion
|
|
2575
2560
|
//#region src/assertions/wordCount.ts
|
|
2576
2561
|
/**
|
|
@@ -2633,7 +2618,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2633
2618
|
assertion
|
|
2634
2619
|
};
|
|
2635
2620
|
};
|
|
2636
|
-
|
|
2637
2621
|
//#endregion
|
|
2638
2622
|
//#region src/assertions/xml.ts
|
|
2639
2623
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2708,7 +2692,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2708
2692
|
assertion
|
|
2709
2693
|
};
|
|
2710
2694
|
};
|
|
2711
|
-
|
|
2712
2695
|
//#endregion
|
|
2713
2696
|
//#region src/assertions/index.ts
|
|
2714
2697
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2762,7 +2745,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2762
2745
|
"llm-rubric": handleLlmRubric,
|
|
2763
2746
|
meteor: async (params) => {
|
|
2764
2747
|
try {
|
|
2765
|
-
const { handleMeteorAssertion } = await import("./meteor-
|
|
2748
|
+
const { handleMeteorAssertion } = await import("./meteor-DUiCJRC-.js");
|
|
2766
2749
|
return handleMeteorAssertion(params);
|
|
2767
2750
|
} catch (error) {
|
|
2768
2751
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2808,10 +2791,10 @@ function renderMetricName(metric, vars) {
|
|
|
2808
2791
|
if (!metric) return metric;
|
|
2809
2792
|
try {
|
|
2810
2793
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2811
|
-
if (rendered === "" && metric !== "")
|
|
2794
|
+
if (rendered === "" && metric !== "") logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2812
2795
|
return rendered;
|
|
2813
2796
|
} catch (error) {
|
|
2814
|
-
|
|
2797
|
+
logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2815
2798
|
return metric;
|
|
2816
2799
|
}
|
|
2817
2800
|
}
|
|
@@ -2862,12 +2845,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2862
2845
|
spans: traceData.spans || []
|
|
2863
2846
|
};
|
|
2864
2847
|
} catch (error) {
|
|
2865
|
-
|
|
2848
|
+
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2866
2849
|
}
|
|
2867
2850
|
let renderedValue = assertion.value;
|
|
2868
2851
|
let valueFromScript;
|
|
2869
2852
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2870
|
-
const basePath =
|
|
2853
|
+
const basePath = state.basePath || "";
|
|
2871
2854
|
const fileRef = renderedValue.slice(7);
|
|
2872
2855
|
let filePath = fileRef;
|
|
2873
2856
|
let functionName;
|
|
@@ -2879,10 +2862,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2879
2862
|
filePath = path.resolve(basePath, filePath);
|
|
2880
2863
|
if (isJavascriptFile(filePath)) {
|
|
2881
2864
|
valueFromScript = await loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2882
|
-
|
|
2865
|
+
logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2883
2866
|
} else if (filePath.endsWith(".py")) try {
|
|
2884
2867
|
valueFromScript = await runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2885
|
-
|
|
2868
|
+
logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2886
2869
|
} catch (error) {
|
|
2887
2870
|
return {
|
|
2888
2871
|
pass: false,
|
|
@@ -2892,9 +2875,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2892
2875
|
};
|
|
2893
2876
|
}
|
|
2894
2877
|
else if (filePath.endsWith(".rb")) try {
|
|
2895
|
-
const { runRuby } = await import("./rubyUtils-
|
|
2878
|
+
const { runRuby } = await import("./rubyUtils-BUVePouc.js").then((n) => n.t);
|
|
2896
2879
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2897
|
-
|
|
2880
|
+
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2898
2881
|
} catch (error) {
|
|
2899
2882
|
return {
|
|
2900
2883
|
pass: false,
|
|
@@ -2905,7 +2888,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2905
2888
|
}
|
|
2906
2889
|
else renderedValue = processFileReference(renderedValue);
|
|
2907
2890
|
} else if (isPackagePath(renderedValue)) {
|
|
2908
|
-
const basePath =
|
|
2891
|
+
const basePath = state.basePath || "";
|
|
2909
2892
|
const requiredModule = await loadFromPackage(renderedValue, basePath);
|
|
2910
2893
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2911
2894
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -3066,7 +3049,6 @@ var assertions_default = {
|
|
|
3066
3049
|
matchesModeration,
|
|
3067
3050
|
matchesConversationRelevance
|
|
3068
3051
|
};
|
|
3069
|
-
|
|
3070
3052
|
//#endregion
|
|
3071
3053
|
//#region src/database/signal.ts
|
|
3072
3054
|
/**
|
|
@@ -3081,10 +3063,9 @@ function updateSignalFile(evalId) {
|
|
|
3081
3063
|
const content = evalId ? `${evalId}:${now.toISOString()}` : now.toISOString();
|
|
3082
3064
|
fs.writeFileSync(filePath, content);
|
|
3083
3065
|
} catch (err) {
|
|
3084
|
-
|
|
3066
|
+
logger.warn(`Failed to write database signal file: ${err}`);
|
|
3085
3067
|
}
|
|
3086
3068
|
}
|
|
3087
|
-
|
|
3088
3069
|
//#endregion
|
|
3089
3070
|
//#region src/progress/ciProgressReporter.ts
|
|
3090
3071
|
var CIProgressReporter = class {
|
|
@@ -3106,7 +3087,7 @@ var CIProgressReporter = class {
|
|
|
3106
3087
|
}
|
|
3107
3088
|
start() {
|
|
3108
3089
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3109
|
-
|
|
3090
|
+
logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3110
3091
|
this.intervalId = setInterval(() => {
|
|
3111
3092
|
this.logPeriodicUpdate();
|
|
3112
3093
|
}, this.updateIntervalMs);
|
|
@@ -3137,14 +3118,14 @@ var CIProgressReporter = class {
|
|
|
3137
3118
|
this.intervalId = null;
|
|
3138
3119
|
}
|
|
3139
3120
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3140
|
-
|
|
3121
|
+
logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3141
3122
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3142
3123
|
}
|
|
3143
3124
|
error(message) {
|
|
3144
3125
|
const now = Date.now();
|
|
3145
3126
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3146
3127
|
this.lastErrorTime = now;
|
|
3147
|
-
|
|
3128
|
+
logger.error(`[Evaluation Error] ${message}`);
|
|
3148
3129
|
if (process.env.GITHUB_ACTIONS) {
|
|
3149
3130
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3150
3131
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3163,12 +3144,12 @@ var CIProgressReporter = class {
|
|
|
3163
3144
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3164
3145
|
}
|
|
3165
3146
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3166
|
-
|
|
3167
|
-
|
|
3147
|
+
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3148
|
+
logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3168
3149
|
}
|
|
3169
3150
|
logMilestone(percentage) {
|
|
3170
3151
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3171
|
-
|
|
3152
|
+
logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3172
3153
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3173
3154
|
}
|
|
3174
3155
|
formatElapsedTime(ms) {
|
|
@@ -3179,7 +3160,6 @@ var CIProgressReporter = class {
|
|
|
3179
3160
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3180
3161
|
}
|
|
3181
3162
|
};
|
|
3182
|
-
|
|
3183
3163
|
//#endregion
|
|
3184
3164
|
//#region src/providers/azure/warnings.ts
|
|
3185
3165
|
/**
|
|
@@ -3193,13 +3173,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3193
3173
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3194
3174
|
if (modelGradedAsserts.length > 0) {
|
|
3195
3175
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3196
|
-
|
|
3176
|
+
logger.warn(chalk.yellow(`You are using model-graded assertions of types ${chalk.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3197
3177
|
return true;
|
|
3198
3178
|
}
|
|
3199
3179
|
}
|
|
3200
3180
|
return false;
|
|
3201
3181
|
}
|
|
3202
|
-
|
|
3203
3182
|
//#endregion
|
|
3204
3183
|
//#region src/suggestions.ts
|
|
3205
3184
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3230,7 +3209,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3230
3209
|
};
|
|
3231
3210
|
}
|
|
3232
3211
|
}
|
|
3233
|
-
|
|
3234
3212
|
//#endregion
|
|
3235
3213
|
//#region src/tracing/otelConfig.ts
|
|
3236
3214
|
/**
|
|
@@ -3256,7 +3234,6 @@ function getDefaultOtelConfig() {
|
|
|
3256
3234
|
enabled: true
|
|
3257
3235
|
};
|
|
3258
3236
|
}
|
|
3259
|
-
|
|
3260
3237
|
//#endregion
|
|
3261
3238
|
//#region src/tracing/localSpanExporter.ts
|
|
3262
3239
|
/**
|
|
@@ -3276,7 +3253,7 @@ var LocalSpanExporter = class {
|
|
|
3276
3253
|
});
|
|
3277
3254
|
else resultCallback({ code: ExportResultCode.SUCCESS });
|
|
3278
3255
|
}).catch((error) => {
|
|
3279
|
-
|
|
3256
|
+
logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3280
3257
|
resultCallback({
|
|
3281
3258
|
code: ExportResultCode.FAILED,
|
|
3282
3259
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3290,7 +3267,7 @@ var LocalSpanExporter = class {
|
|
|
3290
3267
|
async exportAsync(spans) {
|
|
3291
3268
|
if (spans.length === 0) return;
|
|
3292
3269
|
const traceStore = getTraceStore();
|
|
3293
|
-
|
|
3270
|
+
logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3294
3271
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3295
3272
|
for (const span of spans) {
|
|
3296
3273
|
const traceId = span.spanContext().traceId;
|
|
@@ -3301,12 +3278,12 @@ var LocalSpanExporter = class {
|
|
|
3301
3278
|
let firstError;
|
|
3302
3279
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3303
3280
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3304
|
-
if (result.stored)
|
|
3305
|
-
else
|
|
3281
|
+
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3282
|
+
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3306
3283
|
} catch (error) {
|
|
3307
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY"))
|
|
3284
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3308
3285
|
else {
|
|
3309
|
-
|
|
3286
|
+
logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3310
3287
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3311
3288
|
}
|
|
3312
3289
|
}
|
|
@@ -3343,7 +3320,7 @@ var LocalSpanExporter = class {
|
|
|
3343
3320
|
* Shutdown the exporter. No-op for local storage.
|
|
3344
3321
|
*/
|
|
3345
3322
|
shutdown() {
|
|
3346
|
-
|
|
3323
|
+
logger.debug("[LocalSpanExporter] Shutting down");
|
|
3347
3324
|
return Promise.resolve();
|
|
3348
3325
|
}
|
|
3349
3326
|
/**
|
|
@@ -3353,7 +3330,6 @@ var LocalSpanExporter = class {
|
|
|
3353
3330
|
return Promise.resolve();
|
|
3354
3331
|
}
|
|
3355
3332
|
};
|
|
3356
|
-
|
|
3357
3333
|
//#endregion
|
|
3358
3334
|
//#region src/tracing/otelSdk.ts
|
|
3359
3335
|
let provider = null;
|
|
@@ -3381,21 +3357,21 @@ function getHandlers() {
|
|
|
3381
3357
|
*/
|
|
3382
3358
|
function initializeOtel(config) {
|
|
3383
3359
|
if (initialized) {
|
|
3384
|
-
|
|
3360
|
+
logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3385
3361
|
return;
|
|
3386
3362
|
}
|
|
3387
3363
|
if (!config.enabled) {
|
|
3388
|
-
|
|
3364
|
+
logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3389
3365
|
return;
|
|
3390
3366
|
}
|
|
3391
|
-
|
|
3367
|
+
logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3392
3368
|
serviceName: config.serviceName,
|
|
3393
3369
|
endpoint: config.endpoint,
|
|
3394
3370
|
localExport: config.localExport
|
|
3395
3371
|
});
|
|
3396
3372
|
if (config.debug) diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);
|
|
3397
3373
|
propagation.setGlobalPropagator(new W3CTraceContextPropagator());
|
|
3398
|
-
|
|
3374
|
+
logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3399
3375
|
const resource = resourceFromAttributes({
|
|
3400
3376
|
[ATTR_SERVICE_NAME]: config.serviceName,
|
|
3401
3377
|
[ATTR_SERVICE_VERSION]: VERSION
|
|
@@ -3404,12 +3380,12 @@ function initializeOtel(config) {
|
|
|
3404
3380
|
if (config.localExport) {
|
|
3405
3381
|
const localExporter = new LocalSpanExporter();
|
|
3406
3382
|
spanProcessors.push(new BatchSpanProcessor(localExporter));
|
|
3407
|
-
|
|
3383
|
+
logger.debug("[OtelSdk] Added local span exporter");
|
|
3408
3384
|
}
|
|
3409
3385
|
if (config.endpoint) {
|
|
3410
3386
|
const otlpExporter = new OTLPTraceExporter({ url: config.endpoint });
|
|
3411
3387
|
spanProcessors.push(new BatchSpanProcessor(otlpExporter));
|
|
3412
|
-
|
|
3388
|
+
logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3413
3389
|
}
|
|
3414
3390
|
provider = new NodeTracerProvider({
|
|
3415
3391
|
resource,
|
|
@@ -3417,7 +3393,7 @@ function initializeOtel(config) {
|
|
|
3417
3393
|
});
|
|
3418
3394
|
provider.register();
|
|
3419
3395
|
initialized = true;
|
|
3420
|
-
|
|
3396
|
+
logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3421
3397
|
setupShutdownHandlers();
|
|
3422
3398
|
}
|
|
3423
3399
|
/**
|
|
@@ -3426,12 +3402,12 @@ function initializeOtel(config) {
|
|
|
3426
3402
|
*/
|
|
3427
3403
|
async function shutdownOtel() {
|
|
3428
3404
|
if (!initialized || !provider) return;
|
|
3429
|
-
|
|
3405
|
+
logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3430
3406
|
try {
|
|
3431
3407
|
await provider.shutdown();
|
|
3432
|
-
|
|
3408
|
+
logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3433
3409
|
} catch (error) {
|
|
3434
|
-
|
|
3410
|
+
logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3435
3411
|
} finally {
|
|
3436
3412
|
provider = null;
|
|
3437
3413
|
initialized = false;
|
|
@@ -3444,12 +3420,12 @@ async function shutdownOtel() {
|
|
|
3444
3420
|
*/
|
|
3445
3421
|
async function flushOtel() {
|
|
3446
3422
|
if (!initialized || !provider) return;
|
|
3447
|
-
|
|
3423
|
+
logger.debug("[OtelSdk] Flushing pending spans");
|
|
3448
3424
|
try {
|
|
3449
3425
|
await provider.forceFlush();
|
|
3450
|
-
|
|
3426
|
+
logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3451
3427
|
} catch (error) {
|
|
3452
|
-
|
|
3428
|
+
logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3453
3429
|
}
|
|
3454
3430
|
}
|
|
3455
3431
|
/**
|
|
@@ -3461,7 +3437,7 @@ function setupShutdownHandlers() {
|
|
|
3461
3437
|
const handlers = getHandlers();
|
|
3462
3438
|
if (handlers.registered) return;
|
|
3463
3439
|
const shutdown = async (signal) => {
|
|
3464
|
-
|
|
3440
|
+
logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3465
3441
|
await shutdownOtel();
|
|
3466
3442
|
};
|
|
3467
3443
|
handlers.sigTermHandler = () => {
|
|
@@ -3498,7 +3474,6 @@ function cleanupShutdownHandlers() {
|
|
|
3498
3474
|
}
|
|
3499
3475
|
handlers.registered = false;
|
|
3500
3476
|
}
|
|
3501
|
-
|
|
3502
3477
|
//#endregion
|
|
3503
3478
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3504
3479
|
var JsonlFileWriter = class {
|
|
@@ -3522,7 +3497,6 @@ var JsonlFileWriter = class {
|
|
|
3522
3497
|
});
|
|
3523
3498
|
}
|
|
3524
3499
|
};
|
|
3525
|
-
|
|
3526
3500
|
//#endregion
|
|
3527
3501
|
//#region src/util/promptMatching.ts
|
|
3528
3502
|
/**
|
|
@@ -3560,7 +3534,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3560
3534
|
if (allowedPrompts.length === 0) return false;
|
|
3561
3535
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3562
3536
|
}
|
|
3563
|
-
|
|
3564
3537
|
//#endregion
|
|
3565
3538
|
//#region src/evaluator.ts
|
|
3566
3539
|
/**
|
|
@@ -3752,7 +3725,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3752
3725
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3753
3726
|
else {
|
|
3754
3727
|
const activeProvider = isApiProvider(test.provider) ? test.provider : provider;
|
|
3755
|
-
|
|
3728
|
+
logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3756
3729
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3757
3730
|
const callApiContext = {
|
|
3758
3731
|
vars,
|
|
@@ -3763,7 +3736,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3763
3736
|
filters,
|
|
3764
3737
|
originalProvider: provider,
|
|
3765
3738
|
test,
|
|
3766
|
-
logger
|
|
3739
|
+
logger,
|
|
3767
3740
|
getCache,
|
|
3768
3741
|
repeatIndex
|
|
3769
3742
|
};
|
|
@@ -3780,8 +3753,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3780
3753
|
const sanitizedMetadata = safeJsonStringify(response.metadata);
|
|
3781
3754
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3782
3755
|
}
|
|
3783
|
-
|
|
3784
|
-
|
|
3756
|
+
logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3757
|
+
logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3785
3758
|
}
|
|
3786
3759
|
latencyMs = Date.now() - startTime;
|
|
3787
3760
|
let conversationLastInput = void 0;
|
|
@@ -3798,12 +3771,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3798
3771
|
metadata: response.metadata
|
|
3799
3772
|
});
|
|
3800
3773
|
}
|
|
3801
|
-
|
|
3802
|
-
|
|
3774
|
+
logger.debug("Evaluator response", { responsePreview: (safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3775
|
+
logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3803
3776
|
if (!response.cached && provider.delay > 0) {
|
|
3804
|
-
|
|
3777
|
+
logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3805
3778
|
await sleep(provider.delay);
|
|
3806
|
-
} else if (response.cached)
|
|
3779
|
+
} else if (response.cached) logger.debug(`Skipping delay because response is cached`);
|
|
3807
3780
|
const ret = {
|
|
3808
3781
|
...setup,
|
|
3809
3782
|
response,
|
|
@@ -3906,7 +3879,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3906
3879
|
promptIdx,
|
|
3907
3880
|
testIdx
|
|
3908
3881
|
});
|
|
3909
|
-
if (!(err instanceof Error && err.name === "AbortError"))
|
|
3882
|
+
if (!(err instanceof Error && err.name === "AbortError")) logger.error("Provider call failed during eval", logContext);
|
|
3910
3883
|
return [{
|
|
3911
3884
|
...setup,
|
|
3912
3885
|
error: errorWithStack,
|
|
@@ -3989,7 +3962,7 @@ function generateVarCombinations(vars) {
|
|
|
3989
3962
|
let values = [];
|
|
3990
3963
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
3991
3964
|
const filePath = vars[key].slice(7);
|
|
3992
|
-
const basePath =
|
|
3965
|
+
const basePath = state.basePath || "";
|
|
3993
3966
|
values = (globSync(filePath, {
|
|
3994
3967
|
cwd: basePath || process.cwd(),
|
|
3995
3968
|
windowsPathsNoEscape: true
|
|
@@ -4029,28 +4002,28 @@ var Evaluator = class {
|
|
|
4029
4002
|
this.conversations = {};
|
|
4030
4003
|
this.registers = {};
|
|
4031
4004
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
4032
|
-
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
4005
|
+
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
4033
4006
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
4034
|
-
|
|
4007
|
+
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
4035
4008
|
retryAfterMs: data.retryAfterMs,
|
|
4036
4009
|
resetAt: data.resetAt,
|
|
4037
4010
|
concurrencyChange: data.concurrencyChange
|
|
4038
4011
|
});
|
|
4039
4012
|
});
|
|
4040
4013
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
4041
|
-
|
|
4014
|
+
logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
4042
4015
|
requestLimit: data.requestLimit,
|
|
4043
4016
|
tokenLimit: data.tokenLimit
|
|
4044
4017
|
});
|
|
4045
4018
|
});
|
|
4046
4019
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
4047
|
-
|
|
4020
|
+
logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
4048
4021
|
previous: data.previous,
|
|
4049
4022
|
current: data.current
|
|
4050
4023
|
});
|
|
4051
4024
|
});
|
|
4052
4025
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
4053
|
-
|
|
4026
|
+
logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
4054
4027
|
previous: data.previous,
|
|
4055
4028
|
current: data.current
|
|
4056
4029
|
});
|
|
@@ -4107,7 +4080,7 @@ var Evaluator = class {
|
|
|
4107
4080
|
const checkAbort = () => {
|
|
4108
4081
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4109
4082
|
};
|
|
4110
|
-
if (!options.silent)
|
|
4083
|
+
if (!options.silent) logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4111
4084
|
checkAbort();
|
|
4112
4085
|
const prompts = [];
|
|
4113
4086
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4119,32 +4092,32 @@ var Evaluator = class {
|
|
|
4119
4092
|
}
|
|
4120
4093
|
testSuite = (await runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4121
4094
|
if (options.generateSuggestions) {
|
|
4122
|
-
|
|
4095
|
+
logger.info(`Generating prompt variations...`);
|
|
4123
4096
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4124
4097
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4125
|
-
|
|
4098
|
+
logger.info(chalk.blue("Generated prompts:"));
|
|
4126
4099
|
let numAdded = 0;
|
|
4127
4100
|
for (const prompt of newPrompts) {
|
|
4128
|
-
|
|
4129
|
-
|
|
4130
|
-
|
|
4101
|
+
logger.info("--------------------------------------------------------");
|
|
4102
|
+
logger.info(`${prompt}`);
|
|
4103
|
+
logger.info("--------------------------------------------------------");
|
|
4131
4104
|
if (await promptYesNo("Do you want to test this prompt?", false)) {
|
|
4132
4105
|
testSuite.prompts.push({
|
|
4133
4106
|
raw: prompt,
|
|
4134
4107
|
label: prompt
|
|
4135
4108
|
});
|
|
4136
4109
|
numAdded++;
|
|
4137
|
-
} else
|
|
4110
|
+
} else logger.info("Skipping this prompt.");
|
|
4138
4111
|
}
|
|
4139
4112
|
if (numAdded < 1) {
|
|
4140
|
-
|
|
4113
|
+
logger.info(chalk.red("No prompts selected. Aborting."));
|
|
4141
4114
|
process.exitCode = 1;
|
|
4142
4115
|
return this.evalRecord;
|
|
4143
4116
|
}
|
|
4144
4117
|
}
|
|
4145
4118
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4146
|
-
if (
|
|
4147
|
-
|
|
4119
|
+
if (state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4120
|
+
logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4148
4121
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4149
4122
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4150
4123
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4182,7 +4155,7 @@ var Evaluator = class {
|
|
|
4182
4155
|
await this.evalRecord.addPrompts(prompts);
|
|
4183
4156
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4184
4157
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4185
|
-
|
|
4158
|
+
telemetry.record("feature_used", { feature: "scenarios" });
|
|
4186
4159
|
let scenarioIndex = 0;
|
|
4187
4160
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4188
4161
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4246,7 +4219,7 @@ var Evaluator = class {
|
|
|
4246
4219
|
}
|
|
4247
4220
|
const runEvalOptions = [];
|
|
4248
4221
|
let testIdx = 0;
|
|
4249
|
-
let concurrency = options.maxConcurrency ||
|
|
4222
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4250
4223
|
for (let index = 0; index < tests.length; index++) {
|
|
4251
4224
|
const testCase = tests[index];
|
|
4252
4225
|
invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4266,7 +4239,7 @@ var Evaluator = class {
|
|
|
4266
4239
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4267
4240
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4268
4241
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4269
|
-
const { loadApiProvider } = await import("./providers-
|
|
4242
|
+
const { loadApiProvider } = await import("./providers-BEwbhv0X.js");
|
|
4270
4243
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4271
4244
|
} else testCase.provider = defaultProvider;
|
|
4272
4245
|
}
|
|
@@ -4293,7 +4266,7 @@ var Evaluator = class {
|
|
|
4293
4266
|
const promptId = generateIdFromPrompt(prompt);
|
|
4294
4267
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4295
4268
|
if (promptIdx === void 0) {
|
|
4296
|
-
|
|
4269
|
+
logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4297
4270
|
continue;
|
|
4298
4271
|
}
|
|
4299
4272
|
runEvalOptions.push({
|
|
@@ -4316,7 +4289,7 @@ var Evaluator = class {
|
|
|
4316
4289
|
options: testOptions
|
|
4317
4290
|
};
|
|
4318
4291
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4319
|
-
|
|
4292
|
+
logger.debug(`[Evaluator] Tracing check: env=${getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4320
4293
|
if (tracingEnabled) return {
|
|
4321
4294
|
...baseTest,
|
|
4322
4295
|
metadata: {
|
|
@@ -4349,27 +4322,27 @@ var Evaluator = class {
|
|
|
4349
4322
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4350
4323
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4351
4324
|
}
|
|
4352
|
-
if (
|
|
4353
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
4354
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors:
|
|
4325
|
+
if (state.resume && this.evalRecord.persisted) try {
|
|
4326
|
+
const { default: EvalResult } = await import("./evalResult-BDMqrapS.js").then((n) => n.n);
|
|
4327
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4355
4328
|
const originalCount = runEvalOptions.length;
|
|
4356
4329
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4357
4330
|
const step = runEvalOptions[i];
|
|
4358
4331
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4359
4332
|
}
|
|
4360
4333
|
const skipped = originalCount - runEvalOptions.length;
|
|
4361
|
-
if (skipped > 0)
|
|
4334
|
+
if (skipped > 0) logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4362
4335
|
} catch (err) {
|
|
4363
|
-
|
|
4336
|
+
logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4364
4337
|
}
|
|
4365
4338
|
if (concurrency > 1) {
|
|
4366
4339
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4367
4340
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4368
4341
|
if (usesConversation) {
|
|
4369
|
-
|
|
4342
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan("_conversation")} variable is used.`);
|
|
4370
4343
|
concurrency = 1;
|
|
4371
4344
|
} else if (usesStoreOutputAs) {
|
|
4372
|
-
|
|
4345
|
+
logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4373
4346
|
concurrency = 1;
|
|
4374
4347
|
}
|
|
4375
4348
|
}
|
|
@@ -4400,14 +4373,14 @@ var Evaluator = class {
|
|
|
4400
4373
|
await this.evalRecord.addResult(row);
|
|
4401
4374
|
} catch (error) {
|
|
4402
4375
|
const resultSummary = summarizeEvaluateResultForLogging(row);
|
|
4403
|
-
|
|
4376
|
+
logger.error(`Error saving result: ${error} ${safeJsonStringify(resultSummary)}`);
|
|
4404
4377
|
}
|
|
4405
4378
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4406
4379
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4407
4380
|
if (typeof httpStatus === "number" && isNonTransientHttpStatus(httpStatus)) {
|
|
4408
4381
|
targetUnavailable = true;
|
|
4409
4382
|
targetErrorStatus = httpStatus;
|
|
4410
|
-
|
|
4383
|
+
logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4411
4384
|
targetErrorAbortController.abort();
|
|
4412
4385
|
break;
|
|
4413
4386
|
}
|
|
@@ -4427,7 +4400,7 @@ var Evaluator = class {
|
|
|
4427
4400
|
if (testSuite.derivedMetrics) {
|
|
4428
4401
|
const math = await import("mathjs");
|
|
4429
4402
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4430
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count"))
|
|
4403
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4431
4404
|
const evalContext = {
|
|
4432
4405
|
...metrics.namedScores,
|
|
4433
4406
|
__count: promptEvalCount
|
|
@@ -4442,7 +4415,7 @@ var Evaluator = class {
|
|
|
4442
4415
|
}
|
|
4443
4416
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4444
4417
|
} catch (error) {
|
|
4445
|
-
|
|
4418
|
+
logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4446
4419
|
}
|
|
4447
4420
|
}
|
|
4448
4421
|
}
|
|
@@ -4481,7 +4454,7 @@ var Evaluator = class {
|
|
|
4481
4454
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4482
4455
|
evalStep.provider.cleanup();
|
|
4483
4456
|
} catch (cleanupErr) {
|
|
4484
|
-
|
|
4457
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4485
4458
|
}
|
|
4486
4459
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4487
4460
|
}, timeoutMs);
|
|
@@ -4545,8 +4518,8 @@ var Evaluator = class {
|
|
|
4545
4518
|
}
|
|
4546
4519
|
};
|
|
4547
4520
|
const originalProgressCallback = this.options.progressCallback;
|
|
4548
|
-
const isWebUI = Boolean(
|
|
4549
|
-
|
|
4521
|
+
const isWebUI = Boolean(state.webUI);
|
|
4522
|
+
logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4550
4523
|
if (isCI() && !isWebUI) {
|
|
4551
4524
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4552
4525
|
ciProgressReporter.start();
|
|
@@ -4556,20 +4529,20 @@ var Evaluator = class {
|
|
|
4556
4529
|
if (isWebUI) {
|
|
4557
4530
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4558
4531
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4559
|
-
|
|
4532
|
+
logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4560
4533
|
} else if (progressBarManager) {
|
|
4561
4534
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4562
4535
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4563
4536
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4564
|
-
else
|
|
4537
|
+
else logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4565
4538
|
};
|
|
4566
4539
|
const serialRunEvalOptions = [];
|
|
4567
4540
|
const concurrentRunEvalOptions = [];
|
|
4568
4541
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4569
4542
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4570
4543
|
if (!this.options.silent) {
|
|
4571
|
-
if (serialRunEvalOptions.length > 0)
|
|
4572
|
-
if (concurrentRunEvalOptions.length > 0)
|
|
4544
|
+
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4545
|
+
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4573
4546
|
}
|
|
4574
4547
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4575
4548
|
try {
|
|
@@ -4578,7 +4551,7 @@ var Evaluator = class {
|
|
|
4578
4551
|
if (isWebUI) {
|
|
4579
4552
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4580
4553
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4581
|
-
|
|
4554
|
+
logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4582
4555
|
}
|
|
4583
4556
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4584
4557
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4593,9 +4566,9 @@ var Evaluator = class {
|
|
|
4593
4566
|
});
|
|
4594
4567
|
} catch (err) {
|
|
4595
4568
|
if (combinedAbortSignal.aborted) {
|
|
4596
|
-
if (evalTimedOut)
|
|
4569
|
+
if (evalTimedOut) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4597
4570
|
else if (!targetUnavailable) {
|
|
4598
|
-
|
|
4571
|
+
logger.info("Evaluation interrupted, saving progress...");
|
|
4599
4572
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4600
4573
|
if (progressBarManager) progressBarManager.stop();
|
|
4601
4574
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4625,10 +4598,10 @@ var Evaluator = class {
|
|
|
4625
4598
|
let compareCount = 0;
|
|
4626
4599
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4627
4600
|
compareCount++;
|
|
4628
|
-
if (isWebUI)
|
|
4601
|
+
if (isWebUI) logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4629
4602
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4630
4603
|
if (resultsToCompare.length === 0) {
|
|
4631
|
-
|
|
4604
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4632
4605
|
continue;
|
|
4633
4606
|
}
|
|
4634
4607
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4690,16 +4663,16 @@ var Evaluator = class {
|
|
|
4690
4663
|
}
|
|
4691
4664
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4692
4665
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4693
|
-
else if (!isWebUI)
|
|
4666
|
+
else if (!isWebUI) logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4694
4667
|
}
|
|
4695
4668
|
}
|
|
4696
4669
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4697
4670
|
if (maxScoreRowsCount > 0) {
|
|
4698
|
-
|
|
4671
|
+
logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4699
4672
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4700
4673
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4701
4674
|
if (resultsToCompare.length === 0) {
|
|
4702
|
-
|
|
4675
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4703
4676
|
continue;
|
|
4704
4677
|
}
|
|
4705
4678
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4707,7 +4680,7 @@ var Evaluator = class {
|
|
|
4707
4680
|
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4708
4681
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4709
4682
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4710
|
-
else if (!isWebUI)
|
|
4683
|
+
else if (!isWebUI) logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4711
4684
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4712
4685
|
const result = resultsToCompare[index];
|
|
4713
4686
|
const maxScoreGradingResult = {
|
|
@@ -4751,7 +4724,7 @@ var Evaluator = class {
|
|
|
4751
4724
|
progressBarManager.stop();
|
|
4752
4725
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4753
4726
|
} catch (cleanupErr) {
|
|
4754
|
-
|
|
4727
|
+
logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4755
4728
|
}
|
|
4756
4729
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4757
4730
|
if (evalTimedOut) {
|
|
@@ -4824,7 +4797,7 @@ var Evaluator = class {
|
|
|
4824
4797
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4825
4798
|
})));
|
|
4826
4799
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4827
|
-
|
|
4800
|
+
telemetry.record("eval_ran", {
|
|
4828
4801
|
numPrompts: prompts.length,
|
|
4829
4802
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4830
4803
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4872,26 +4845,26 @@ var Evaluator = class {
|
|
|
4872
4845
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4873
4846
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4874
4847
|
if (tracingEnabled) {
|
|
4875
|
-
|
|
4848
|
+
logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4876
4849
|
initializeOtel(getDefaultOtelConfig());
|
|
4877
4850
|
}
|
|
4878
4851
|
try {
|
|
4879
4852
|
return await this._runEvaluation();
|
|
4880
4853
|
} finally {
|
|
4881
4854
|
if (tracingEnabled) {
|
|
4882
|
-
|
|
4855
|
+
logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4883
4856
|
await flushOtel();
|
|
4884
4857
|
await shutdownOtel();
|
|
4885
4858
|
}
|
|
4886
4859
|
if (isOtlpReceiverStarted()) {
|
|
4887
|
-
|
|
4860
|
+
logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4888
4861
|
await sleep(3e3);
|
|
4889
4862
|
}
|
|
4890
4863
|
await stopOtlpReceiverIfNeeded();
|
|
4891
4864
|
await providerRegistry.shutdownAll();
|
|
4892
4865
|
if (this.rateLimitRegistry) {
|
|
4893
4866
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4894
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0)
|
|
4867
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4895
4868
|
totalRequests: m.totalRequests,
|
|
4896
4869
|
completedRequests: m.completedRequests,
|
|
4897
4870
|
failedRequests: m.failedRequests,
|
|
@@ -4904,14 +4877,13 @@ var Evaluator = class {
|
|
|
4904
4877
|
}
|
|
4905
4878
|
this.rateLimitRegistry?.dispose();
|
|
4906
4879
|
redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4907
|
-
|
|
4880
|
+
state.maxConcurrency = void 0;
|
|
4908
4881
|
}
|
|
4909
4882
|
}
|
|
4910
4883
|
};
|
|
4911
4884
|
function evaluate$1(testSuite, evalRecord, options) {
|
|
4912
4885
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4913
4886
|
}
|
|
4914
|
-
|
|
4915
4887
|
//#endregion
|
|
4916
4888
|
//#region src/guardrails.ts
|
|
4917
4889
|
const API_BASE_URL = `${getShareApiBaseUrl()}/v1`;
|
|
@@ -4925,7 +4897,7 @@ async function makeRequest(endpoint, input) {
|
|
|
4925
4897
|
if (!response.data) throw new Error("No data returned from API");
|
|
4926
4898
|
return response.data;
|
|
4927
4899
|
} catch (error) {
|
|
4928
|
-
|
|
4900
|
+
logger.error(`Guardrails API error: ${error}`);
|
|
4929
4901
|
throw error;
|
|
4930
4902
|
}
|
|
4931
4903
|
}
|
|
@@ -4942,7 +4914,7 @@ async function makeAdaptiveRequest(request) {
|
|
|
4942
4914
|
if (!response.data) throw new Error("No data returned from API");
|
|
4943
4915
|
return response.data;
|
|
4944
4916
|
} catch (error) {
|
|
4945
|
-
|
|
4917
|
+
logger.error(`Guardrails API error: ${error}`);
|
|
4946
4918
|
throw error;
|
|
4947
4919
|
}
|
|
4948
4920
|
}
|
|
@@ -4960,8 +4932,6 @@ const guardrails = {
|
|
|
4960
4932
|
return makeAdaptiveRequest(request);
|
|
4961
4933
|
}
|
|
4962
4934
|
};
|
|
4963
|
-
var guardrails_default = guardrails;
|
|
4964
|
-
|
|
4965
4935
|
//#endregion
|
|
4966
4936
|
//#region src/migrate.ts
|
|
4967
4937
|
/**
|
|
@@ -4996,12 +4966,12 @@ async function runDbMigrations() {
|
|
|
4996
4966
|
const projectRoot = dir.split("dist/server/src")[0];
|
|
4997
4967
|
migrationsFolder = path$2.join(projectRoot, "dist", "promptfoo", "drizzle");
|
|
4998
4968
|
} else migrationsFolder = path$2.join(dir, "..", "drizzle");
|
|
4999
|
-
|
|
4969
|
+
logger.debug(`Running database migrations from: ${migrationsFolder}`);
|
|
5000
4970
|
migrate(db, { migrationsFolder });
|
|
5001
|
-
|
|
4971
|
+
logger.debug("Database migrations completed");
|
|
5002
4972
|
resolve();
|
|
5003
4973
|
} catch (error) {
|
|
5004
|
-
|
|
4974
|
+
logger.error(`Database migration failed: ${error}`);
|
|
5005
4975
|
reject(error);
|
|
5006
4976
|
}
|
|
5007
4977
|
});
|
|
@@ -5011,7 +4981,6 @@ try {
|
|
|
5011
4981
|
const currentModulePath = resolve(fileURLToPath(import.meta.url));
|
|
5012
4982
|
if (currentModulePath === resolve(process.argv[1]) && (currentModulePath.endsWith("migrate.js") || currentModulePath.endsWith("migrate.ts"))) runDbMigrations().then(() => process.exit(0)).catch(() => process.exit(1));
|
|
5013
4983
|
} catch {}
|
|
5014
|
-
|
|
5015
4984
|
//#endregion
|
|
5016
4985
|
//#region src/redteam/sharedFrontend.ts
|
|
5017
4986
|
function getRiskCategorySeverityMap(plugins) {
|
|
@@ -5028,7 +4997,6 @@ function getRiskCategorySeverityMap(plugins) {
|
|
|
5028
4997
|
...overrides
|
|
5029
4998
|
};
|
|
5030
4999
|
}
|
|
5031
|
-
|
|
5032
5000
|
//#endregion
|
|
5033
5001
|
//#region src/util/calculateFilteredMetrics.ts
|
|
5034
5002
|
/**
|
|
@@ -5082,12 +5050,12 @@ async function calculateFilteredMetrics(opts) {
|
|
|
5082
5050
|
try {
|
|
5083
5051
|
const countResult = await getResultCount(whereSql);
|
|
5084
5052
|
if (countResult > MAX_RESULTS_FOR_METRICS) {
|
|
5085
|
-
|
|
5053
|
+
logger.warn(`Filtered result count ${countResult} exceeds limit ${MAX_RESULTS_FOR_METRICS}`, { evalId: opts.evalId });
|
|
5086
5054
|
throw new Error(`Result count ${countResult} exceeds maximum ${MAX_RESULTS_FOR_METRICS}`);
|
|
5087
5055
|
}
|
|
5088
5056
|
return await calculateWithOptimizedQuery(opts);
|
|
5089
5057
|
} catch (error) {
|
|
5090
|
-
|
|
5058
|
+
logger.error("Failed to calculate filtered metrics with optimized query", { error });
|
|
5091
5059
|
return createEmptyMetricsArray(numPrompts);
|
|
5092
5060
|
}
|
|
5093
5061
|
}
|
|
@@ -5140,7 +5108,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5140
5108
|
for (const row of basicResults) {
|
|
5141
5109
|
const idx = row.prompt_idx;
|
|
5142
5110
|
if (idx < 0 || idx >= numPrompts) {
|
|
5143
|
-
|
|
5111
|
+
logger.warn(`Invalid prompt_idx ${idx}, expected 0-${numPrompts - 1}`);
|
|
5144
5112
|
continue;
|
|
5145
5113
|
}
|
|
5146
5114
|
metrics[idx] = {
|
|
@@ -5165,7 +5133,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5165
5133
|
}
|
|
5166
5134
|
await aggregateNamedScores(metrics, whereSql);
|
|
5167
5135
|
await aggregateAssertions(metrics, whereSql);
|
|
5168
|
-
|
|
5136
|
+
logger.debug("Filtered metrics calculated", {
|
|
5169
5137
|
numPrompts,
|
|
5170
5138
|
metricsCount: basicResults.length
|
|
5171
5139
|
});
|
|
@@ -5286,7 +5254,6 @@ function createEmptyMetricsArray(numPrompts) {
|
|
|
5286
5254
|
cost: 0
|
|
5287
5255
|
}));
|
|
5288
5256
|
}
|
|
5289
|
-
|
|
5290
5257
|
//#endregion
|
|
5291
5258
|
//#region src/util/convertEvalResultsToTable.ts
|
|
5292
5259
|
/**
|
|
@@ -5419,7 +5386,6 @@ function convertResultsToTable(eval_) {
|
|
|
5419
5386
|
body: rows
|
|
5420
5387
|
};
|
|
5421
5388
|
}
|
|
5422
|
-
|
|
5423
5389
|
//#endregion
|
|
5424
5390
|
//#region src/util/exportToFile/index.ts
|
|
5425
5391
|
function convertEvalResultToTableCell(result) {
|
|
@@ -5497,7 +5463,6 @@ function convertTestResultsToTableRow(results, varsForHeader) {
|
|
|
5497
5463
|
for (const result of results) row.outputs[result.promptIdx] = convertEvalResultToTableCell(result);
|
|
5498
5464
|
return row;
|
|
5499
5465
|
}
|
|
5500
|
-
|
|
5501
5466
|
//#endregion
|
|
5502
5467
|
//#region src/models/evalPerformance.ts
|
|
5503
5468
|
const distinctCountCache = /* @__PURE__ */ new Map();
|
|
@@ -5514,7 +5479,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5514
5479
|
const cacheKey = `distinct:${evalId}`;
|
|
5515
5480
|
const cached = distinctCountCache.get(cacheKey);
|
|
5516
5481
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5517
|
-
|
|
5482
|
+
logger.debug(`Using cached distinct count for eval ${evalId}: ${cached.count}`);
|
|
5518
5483
|
return cached.count;
|
|
5519
5484
|
}
|
|
5520
5485
|
const db = getDb();
|
|
@@ -5522,7 +5487,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5522
5487
|
const result = db.select({ count: sql`COUNT(DISTINCT test_idx)` }).from(evalResultsTable).where(sql`eval_id = ${evalId}`).all();
|
|
5523
5488
|
const count = Number(result[0]?.count ?? 0);
|
|
5524
5489
|
const duration = Date.now() - start;
|
|
5525
|
-
|
|
5490
|
+
logger.debug(`Distinct count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5526
5491
|
distinctCountCache.set(cacheKey, {
|
|
5527
5492
|
count,
|
|
5528
5493
|
timestamp: Date.now()
|
|
@@ -5540,7 +5505,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5540
5505
|
const cacheKey = `total:${evalId}`;
|
|
5541
5506
|
const cached = totalRowCountCache.get(cacheKey);
|
|
5542
5507
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5543
|
-
|
|
5508
|
+
logger.debug(`Using cached total row count for eval ${evalId}: ${cached.count}`);
|
|
5544
5509
|
return cached.count;
|
|
5545
5510
|
}
|
|
5546
5511
|
const db = getDb();
|
|
@@ -5548,7 +5513,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5548
5513
|
const result = db.select({ count: sql`COUNT(*)` }).from(evalResultsTable).where(sql`eval_id = ${evalId}`).all();
|
|
5549
5514
|
const count = Number(result[0]?.count ?? 0);
|
|
5550
5515
|
const duration = Date.now() - start;
|
|
5551
|
-
|
|
5516
|
+
logger.debug(`Total row count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5552
5517
|
totalRowCountCache.set(cacheKey, {
|
|
5553
5518
|
count,
|
|
5554
5519
|
timestamp: Date.now()
|
|
@@ -5581,7 +5546,7 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5581
5546
|
`;
|
|
5582
5547
|
const countResult = db.all(countQuery);
|
|
5583
5548
|
const filteredCount = Number(countResult[0]?.count ?? 0);
|
|
5584
|
-
|
|
5549
|
+
logger.debug(`Optimized count query took ${Date.now() - countStart}ms`);
|
|
5585
5550
|
const idxStart = Date.now();
|
|
5586
5551
|
const idxQuery = sql`
|
|
5587
5552
|
SELECT DISTINCT test_idx
|
|
@@ -5592,13 +5557,12 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5592
5557
|
OFFSET ${offset}
|
|
5593
5558
|
`;
|
|
5594
5559
|
const testIndices = db.all(idxQuery).map((row) => row.test_idx);
|
|
5595
|
-
|
|
5560
|
+
logger.debug(`Optimized index query took ${Date.now() - idxStart}ms`);
|
|
5596
5561
|
return {
|
|
5597
5562
|
testIndices,
|
|
5598
5563
|
filteredCount
|
|
5599
5564
|
};
|
|
5600
5565
|
}
|
|
5601
|
-
|
|
5602
5566
|
//#endregion
|
|
5603
5567
|
//#region src/models/eval.ts
|
|
5604
5568
|
/**
|
|
@@ -5693,7 +5657,7 @@ var EvalQueries = class {
|
|
|
5693
5657
|
try {
|
|
5694
5658
|
db.update(evalsTable).set({ vars }).where(eq(evalsTable.id, evalId)).run();
|
|
5695
5659
|
} catch (e) {
|
|
5696
|
-
|
|
5660
|
+
logger.error(`Error setting vars: ${vars} for eval ${evalId}: ${e}`);
|
|
5697
5661
|
}
|
|
5698
5662
|
}
|
|
5699
5663
|
static async getMetadataKeysFromEval(evalId, comparisonEvalIds = []) {
|
|
@@ -5714,7 +5678,7 @@ var EvalQueries = class {
|
|
|
5714
5678
|
`;
|
|
5715
5679
|
return (await db.all(query)).map((r) => r.key);
|
|
5716
5680
|
} catch (error) {
|
|
5717
|
-
|
|
5681
|
+
logger.error(`Error fetching metadata keys for eval ${evalId} and comparisons [${comparisonEvalIds.join(", ")}]: ${error}`);
|
|
5718
5682
|
return [];
|
|
5719
5683
|
}
|
|
5720
5684
|
}
|
|
@@ -5745,7 +5709,7 @@ var EvalQueries = class {
|
|
|
5745
5709
|
const values = db.all(query).map(({ value }) => String(value).trim()).filter((value) => value.length > 0);
|
|
5746
5710
|
return Array.from(new Set(values));
|
|
5747
5711
|
} catch (error) {
|
|
5748
|
-
|
|
5712
|
+
logger.error(`Error fetching metadata values for eval ${evalId} and key ${trimmedKey}: ${error instanceof Error ? error.message : String(error)}`);
|
|
5749
5713
|
return [];
|
|
5750
5714
|
}
|
|
5751
5715
|
}
|
|
@@ -5817,7 +5781,7 @@ var Eval = class Eval {
|
|
|
5817
5781
|
}
|
|
5818
5782
|
return evalInstance;
|
|
5819
5783
|
}
|
|
5820
|
-
static async getMany(limit =
|
|
5784
|
+
static async getMany(limit = 100) {
|
|
5821
5785
|
return (await getDb().select().from(evalsTable).limit(limit).orderBy(desc(evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
5822
5786
|
id: e.id,
|
|
5823
5787
|
createdAt: new Date(e.createdAt),
|
|
@@ -5832,7 +5796,7 @@ var Eval = class Eval {
|
|
|
5832
5796
|
* @param offset - Number of evals to skip
|
|
5833
5797
|
* @param limit - Maximum number of evals to return
|
|
5834
5798
|
*/
|
|
5835
|
-
static async getPaginated(offset = 0, limit =
|
|
5799
|
+
static async getPaginated(offset = 0, limit = 100) {
|
|
5836
5800
|
return (await getDb().select().from(evalsTable).orderBy(desc(evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
5837
5801
|
id: e.id,
|
|
5838
5802
|
createdAt: new Date(e.createdAt),
|
|
@@ -5878,7 +5842,7 @@ var Eval = class Eval {
|
|
|
5878
5842
|
evalId,
|
|
5879
5843
|
promptId
|
|
5880
5844
|
}).onConflictDoNothing().run();
|
|
5881
|
-
|
|
5845
|
+
logger.debug(`Inserting prompt ${promptId}`);
|
|
5882
5846
|
}
|
|
5883
5847
|
if (opts?.results && opts.results.length > 0) {
|
|
5884
5848
|
const res = db.insert(evalResultsTable).values(opts.results?.map((r) => ({
|
|
@@ -5886,7 +5850,7 @@ var Eval = class Eval {
|
|
|
5886
5850
|
evalId,
|
|
5887
5851
|
id: crypto.randomUUID()
|
|
5888
5852
|
}))).run();
|
|
5889
|
-
|
|
5853
|
+
logger.debug(`Inserted ${res.changes} eval results`);
|
|
5890
5854
|
}
|
|
5891
5855
|
db.insert(datasetsTable).values({
|
|
5892
5856
|
id: datasetId,
|
|
@@ -5896,7 +5860,7 @@ var Eval = class Eval {
|
|
|
5896
5860
|
evalId,
|
|
5897
5861
|
datasetId
|
|
5898
5862
|
}).onConflictDoNothing().run();
|
|
5899
|
-
|
|
5863
|
+
logger.debug(`Inserting dataset ${datasetId}`);
|
|
5900
5864
|
if (config.tags) for (const [tagKey, tagValue] of Object.entries(config.tags)) {
|
|
5901
5865
|
const tagId = sha256(`${tagKey}:${tagValue}`);
|
|
5902
5866
|
db.insert(tagsTable).values({
|
|
@@ -5908,7 +5872,7 @@ var Eval = class Eval {
|
|
|
5908
5872
|
evalId,
|
|
5909
5873
|
tagId
|
|
5910
5874
|
}).onConflictDoNothing().run();
|
|
5911
|
-
|
|
5875
|
+
logger.debug(`Inserting tag ${tagId}`);
|
|
5912
5876
|
}
|
|
5913
5877
|
});
|
|
5914
5878
|
return new Eval(config, {
|
|
@@ -6089,7 +6053,7 @@ var Eval = class Eval {
|
|
|
6089
6053
|
if (type === "metric") {
|
|
6090
6054
|
const metricKey = field || value;
|
|
6091
6055
|
if (!metricKey) {
|
|
6092
|
-
|
|
6056
|
+
logger.warn("Invalid metric filter: missing field and value", { filter });
|
|
6093
6057
|
return;
|
|
6094
6058
|
}
|
|
6095
6059
|
const jsonPath = buildSafeJsonPath(metricKey);
|
|
@@ -6103,7 +6067,7 @@ var Eval = class Eval {
|
|
|
6103
6067
|
else if (operator === "lt") condition = sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) < ${numericValue}`;
|
|
6104
6068
|
else if (operator === "lte") condition = sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) <= ${numericValue}`;
|
|
6105
6069
|
} else {
|
|
6106
|
-
|
|
6070
|
+
logger.warn("Invalid numeric value in metric filter", {
|
|
6107
6071
|
metricKey,
|
|
6108
6072
|
value,
|
|
6109
6073
|
numericValue,
|
|
@@ -6181,7 +6145,7 @@ var Eval = class Eval {
|
|
|
6181
6145
|
const countStart = Date.now();
|
|
6182
6146
|
const countResult = await db.get(filteredCountQuery);
|
|
6183
6147
|
const countEnd = Date.now();
|
|
6184
|
-
|
|
6148
|
+
logger.debug(`Count query took ${countEnd - countStart}ms`);
|
|
6185
6149
|
const filteredCount = countResult?.count || 0;
|
|
6186
6150
|
const idxQuery = sql`
|
|
6187
6151
|
SELECT DISTINCT test_idx
|
|
@@ -6194,7 +6158,7 @@ var Eval = class Eval {
|
|
|
6194
6158
|
const idxStart = Date.now();
|
|
6195
6159
|
const rows = await db.all(idxQuery);
|
|
6196
6160
|
const idxEnd = Date.now();
|
|
6197
|
-
|
|
6161
|
+
logger.debug(`Index query took ${idxEnd - idxStart}ms`);
|
|
6198
6162
|
return {
|
|
6199
6163
|
testIndices: rows.map((row) => row.test_idx),
|
|
6200
6164
|
filteredCount
|
|
@@ -6230,7 +6194,7 @@ var Eval = class Eval {
|
|
|
6230
6194
|
const hasComplexFilters = opts.filters && opts.filters.length > 0;
|
|
6231
6195
|
let queryResult;
|
|
6232
6196
|
if (hasComplexFilters) {
|
|
6233
|
-
|
|
6197
|
+
logger.debug("Using original query for complex filters");
|
|
6234
6198
|
queryResult = await this.queryTestIndices({
|
|
6235
6199
|
offset: opts.offset,
|
|
6236
6200
|
limit: opts.limit,
|
|
@@ -6239,7 +6203,7 @@ var Eval = class Eval {
|
|
|
6239
6203
|
filters: opts.filters
|
|
6240
6204
|
});
|
|
6241
6205
|
} else {
|
|
6242
|
-
|
|
6206
|
+
logger.debug("Using optimized query for table page");
|
|
6243
6207
|
queryResult = await queryTestIndicesOptimized(this.id, {
|
|
6244
6208
|
offset: opts.offset,
|
|
6245
6209
|
limit: opts.limit,
|
|
@@ -6254,12 +6218,12 @@ var Eval = class Eval {
|
|
|
6254
6218
|
const varsStart = Date.now();
|
|
6255
6219
|
const vars = Array.from(this.vars);
|
|
6256
6220
|
const varsEnd = Date.now();
|
|
6257
|
-
|
|
6221
|
+
logger.debug(`Vars query took ${varsEnd - varsStart}ms`);
|
|
6258
6222
|
const body = [];
|
|
6259
6223
|
const bodyStart = Date.now();
|
|
6260
6224
|
if (testIndices.length === 0) {
|
|
6261
6225
|
const bodyEnd = Date.now();
|
|
6262
|
-
|
|
6226
|
+
logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6263
6227
|
return {
|
|
6264
6228
|
head: {
|
|
6265
6229
|
prompts: this.prompts,
|
|
@@ -6291,7 +6255,7 @@ var Eval = class Eval {
|
|
|
6291
6255
|
if (results.length > 0) body.push(convertTestResultsToTableRow(results, vars));
|
|
6292
6256
|
}
|
|
6293
6257
|
const bodyEnd = Date.now();
|
|
6294
|
-
|
|
6258
|
+
logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6295
6259
|
return {
|
|
6296
6260
|
head: {
|
|
6297
6261
|
prompts: this.prompts,
|
|
@@ -6404,7 +6368,7 @@ var Eval = class Eval {
|
|
|
6404
6368
|
})
|
|
6405
6369
|
}));
|
|
6406
6370
|
} catch (error) {
|
|
6407
|
-
|
|
6371
|
+
logger.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
|
|
6408
6372
|
return [];
|
|
6409
6373
|
}
|
|
6410
6374
|
}
|
|
@@ -6441,7 +6405,7 @@ var Eval = class Eval {
|
|
|
6441
6405
|
const newEvalId = createEvalId(/* @__PURE__ */ new Date());
|
|
6442
6406
|
const copyDescription = description || `${this.description || "Evaluation"} (Copy)`;
|
|
6443
6407
|
const testCount = distinctTestCount ?? await this.getResultsCount();
|
|
6444
|
-
|
|
6408
|
+
logger.info("Starting eval copy", {
|
|
6445
6409
|
sourceEvalId: this.id,
|
|
6446
6410
|
targetEvalId: newEvalId,
|
|
6447
6411
|
distinctTestCount: testCount
|
|
@@ -6504,7 +6468,7 @@ var Eval = class Eval {
|
|
|
6504
6468
|
db.insert(evalResultsTable).values(copiedResults).run();
|
|
6505
6469
|
copiedCount += batch.length;
|
|
6506
6470
|
offset += BATCH_SIZE;
|
|
6507
|
-
|
|
6471
|
+
logger.debug("Copied batch of eval results", {
|
|
6508
6472
|
sourceEvalId: this.id,
|
|
6509
6473
|
targetEvalId: newEvalId,
|
|
6510
6474
|
batchSize: batch.length,
|
|
@@ -6513,7 +6477,7 @@ var Eval = class Eval {
|
|
|
6513
6477
|
});
|
|
6514
6478
|
}
|
|
6515
6479
|
});
|
|
6516
|
-
|
|
6480
|
+
logger.info("Eval copy completed successfully", {
|
|
6517
6481
|
sourceEvalId: this.id,
|
|
6518
6482
|
targetEvalId: newEvalId,
|
|
6519
6483
|
rowsCopied: copiedCount,
|
|
@@ -6528,7 +6492,6 @@ var Eval = class Eval {
|
|
|
6528
6492
|
this._shared = shared;
|
|
6529
6493
|
}
|
|
6530
6494
|
};
|
|
6531
|
-
|
|
6532
6495
|
//#endregion
|
|
6533
6496
|
//#region src/assertions/validateAssertions.ts
|
|
6534
6497
|
var AssertValidationError = class extends Error {
|
|
@@ -6580,7 +6543,6 @@ function validateAssertions(tests, defaultTest) {
|
|
|
6580
6543
|
}
|
|
6581
6544
|
}
|
|
6582
6545
|
}
|
|
6583
|
-
|
|
6584
6546
|
//#endregion
|
|
6585
6547
|
//#region src/commands/eval/filterPrompts.ts
|
|
6586
6548
|
/**
|
|
@@ -6606,7 +6568,6 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
6606
6568
|
return promptId && filterRegex.test(promptId) || promptLabel && filterRegex.test(promptLabel);
|
|
6607
6569
|
});
|
|
6608
6570
|
}
|
|
6609
|
-
|
|
6610
6571
|
//#endregion
|
|
6611
6572
|
//#region src/commands/eval/filterProviders.ts
|
|
6612
6573
|
/**
|
|
@@ -6687,7 +6648,6 @@ function filterProviders(providers, filterProvidersOption) {
|
|
|
6687
6648
|
return filterRegex.test(providerId) || providerLabel && filterRegex.test(providerLabel);
|
|
6688
6649
|
});
|
|
6689
6650
|
}
|
|
6690
|
-
|
|
6691
6651
|
//#endregion
|
|
6692
6652
|
//#region src/commands/eval/filterTestsUtil.ts
|
|
6693
6653
|
/**
|
|
@@ -6715,35 +6675,35 @@ function mergeDefaultVars(test, defaultTest) {
|
|
|
6715
6675
|
*/
|
|
6716
6676
|
async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
6717
6677
|
if (!testSuite.tests) {
|
|
6718
|
-
|
|
6678
|
+
logger.debug("[filterTestsByResults] No tests in test suite");
|
|
6719
6679
|
return [];
|
|
6720
6680
|
}
|
|
6721
|
-
|
|
6681
|
+
logger.debug(`[filterTestsByResults] Loading results from: ${pathOrId}`);
|
|
6722
6682
|
let results;
|
|
6723
6683
|
try {
|
|
6724
6684
|
if (pathOrId.endsWith(".json")) results = (await readOutput(pathOrId)).results;
|
|
6725
6685
|
else {
|
|
6726
6686
|
const eval_ = await Eval.findById(pathOrId);
|
|
6727
6687
|
if (!eval_) {
|
|
6728
|
-
|
|
6688
|
+
logger.warn(`[filterTestsByResults] Evaluation not found: ${pathOrId}`);
|
|
6729
6689
|
return [];
|
|
6730
6690
|
}
|
|
6731
6691
|
const summary = await eval_.toEvaluateSummary();
|
|
6732
6692
|
if ("results" in summary) results = { results: summary.results };
|
|
6733
6693
|
else {
|
|
6734
|
-
|
|
6694
|
+
logger.debug("[filterTestsByResults] No results in evaluation summary");
|
|
6735
6695
|
return [];
|
|
6736
6696
|
}
|
|
6737
6697
|
}
|
|
6738
6698
|
} catch (error) {
|
|
6739
|
-
|
|
6699
|
+
logger.warn(`[filterTestsByResults] Error loading results: ${error}`);
|
|
6740
6700
|
return [];
|
|
6741
6701
|
}
|
|
6742
6702
|
const filteredResults = results.results.filter(filterFn);
|
|
6743
|
-
|
|
6703
|
+
logger.debug(`[filterTestsByResults] Found ${filteredResults.length} matching results out of ${results.results.length} total`);
|
|
6744
6704
|
if (filteredResults.length === 0) return [];
|
|
6745
6705
|
const uniqueVarsInResults = new Set(filteredResults.map((r) => JSON.stringify(filterRuntimeVars(r.vars))));
|
|
6746
|
-
|
|
6706
|
+
logger.debug(`[filterTestsByResults] ${uniqueVarsInResults.size} unique test cases (by vars) in filtered results`);
|
|
6747
6707
|
const matchedTests = [];
|
|
6748
6708
|
for (const test of testSuite.tests) {
|
|
6749
6709
|
const testWithDefaults = mergeDefaultVars(test, testSuite.defaultTest);
|
|
@@ -6765,15 +6725,15 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6765
6725
|
...runtimeVars
|
|
6766
6726
|
}
|
|
6767
6727
|
};
|
|
6768
|
-
|
|
6728
|
+
logger.debug("[filterTestsByResults] Restored runtime vars for test", { varKeys: Object.keys(runtimeVars) });
|
|
6769
6729
|
matchedTests.push(testWithRuntimeVars);
|
|
6770
6730
|
} else {
|
|
6771
|
-
|
|
6731
|
+
logger.debug("[filterTestsByResults] Matched test has no runtime vars to restore");
|
|
6772
6732
|
matchedTests.push(test);
|
|
6773
6733
|
}
|
|
6774
6734
|
}
|
|
6775
6735
|
}
|
|
6776
|
-
|
|
6736
|
+
logger.debug(`[filterTestsByResults] Matched ${matchedTests.length} tests out of ${testSuite.tests.length} in test suite`);
|
|
6777
6737
|
const extractedTests = [];
|
|
6778
6738
|
const matchedResultKeys = /* @__PURE__ */ new Set();
|
|
6779
6739
|
for (const result of filteredResults) for (const test of matchedTests) if (resultIsForTestCase(result, mergeDefaultVars(test, testSuite.defaultTest))) {
|
|
@@ -6784,7 +6744,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6784
6744
|
const resultKey = JSON.stringify(filterRuntimeVars(result.vars));
|
|
6785
6745
|
if (matchedResultKeys.has(resultKey)) continue;
|
|
6786
6746
|
if (!result.testCase) {
|
|
6787
|
-
|
|
6747
|
+
logger.debug("[filterTestsByResults] Skipping result without testCase data for extraction");
|
|
6788
6748
|
continue;
|
|
6789
6749
|
}
|
|
6790
6750
|
if (extractedTests.some((t) => JSON.stringify(filterRuntimeVars(t.vars)) === resultKey)) continue;
|
|
@@ -6796,12 +6756,11 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6796
6756
|
options: result.testCase.options
|
|
6797
6757
|
});
|
|
6798
6758
|
}
|
|
6799
|
-
if (extractedTests.length > 0)
|
|
6800
|
-
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0)
|
|
6801
|
-
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size)
|
|
6759
|
+
if (extractedTests.length > 0) logger.info(`[filterTestsByResults] Extracted ${extractedTests.length} runtime-generated test(s) from results`);
|
|
6760
|
+
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) logger.warn(`[filterTestsByResults] No tests matched ${filteredResults.length} filtered results. This may indicate a vars or provider mismatch between stored results and current test suite. Use LOG_LEVEL=debug for detailed matching info.`);
|
|
6761
|
+
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) logger.debug(`[filterTestsByResults] Note: ${uniqueVarsInResults.size - matchedTests.length - extractedTests.length} unique test cases in results did not match any test in the current test suite and could not be extracted. This may indicate results without testCase data.`);
|
|
6802
6762
|
return deduplicateTestCases([...matchedTests, ...extractedTests]);
|
|
6803
6763
|
}
|
|
6804
|
-
|
|
6805
6764
|
//#endregion
|
|
6806
6765
|
//#region src/commands/eval/filterTests.ts
|
|
6807
6766
|
/**
|
|
@@ -6827,7 +6786,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6827
6786
|
* @param reason - Description of what the filter was looking for (e.g., 'no failures/errors')
|
|
6828
6787
|
*/
|
|
6829
6788
|
function logNoTestsWarning(filterType, pathOrId, reason) {
|
|
6830
|
-
|
|
6789
|
+
logger.warn(`--${filterType} returned no tests. The evaluation "${pathOrId}" may have ${reason}, or the test suite may have changed since the evaluation was run.`);
|
|
6831
6790
|
}
|
|
6832
6791
|
/**
|
|
6833
6792
|
* Filters a test suite to only include all tests that did not pass (failures + errors)
|
|
@@ -6873,10 +6832,10 @@ async function filterErrorTests(testSuite, pathOrId) {
|
|
|
6873
6832
|
*/
|
|
6874
6833
|
async function filterTests(testSuite, options) {
|
|
6875
6834
|
let tests = testSuite.tests || [];
|
|
6876
|
-
|
|
6877
|
-
|
|
6835
|
+
logger.debug(`Starting filterTests with options: ${JSON.stringify(options)}`);
|
|
6836
|
+
logger.debug(`Initial test count: ${tests.length}`);
|
|
6878
6837
|
if (Object.keys(options).length === 0) {
|
|
6879
|
-
|
|
6838
|
+
logger.debug("No filter options provided, returning all tests");
|
|
6880
6839
|
return tests;
|
|
6881
6840
|
}
|
|
6882
6841
|
if (options.metadata) {
|
|
@@ -6891,11 +6850,11 @@ async function filterTests(testSuite, options) {
|
|
|
6891
6850
|
value
|
|
6892
6851
|
});
|
|
6893
6852
|
}
|
|
6894
|
-
|
|
6895
|
-
|
|
6853
|
+
logger.debug(`Filtering for metadata conditions (AND logic): ${parsedFilters.map((f) => `${f.key}=${f.value}`).join(", ")}`);
|
|
6854
|
+
logger.debug(`Before metadata filter: ${tests.length} tests`);
|
|
6896
6855
|
tests = tests.filter((test) => {
|
|
6897
6856
|
if (!test.metadata) {
|
|
6898
|
-
|
|
6857
|
+
logger.debug(`Test has no metadata: ${test.description || "unnamed test"}`);
|
|
6899
6858
|
return false;
|
|
6900
6859
|
}
|
|
6901
6860
|
for (const { key, value } of parsedFilters) {
|
|
@@ -6904,16 +6863,16 @@ async function filterTests(testSuite, options) {
|
|
|
6904
6863
|
if (Array.isArray(testValue)) matches = testValue.some((v) => v.toString().includes(value));
|
|
6905
6864
|
else if (testValue !== void 0) matches = testValue.toString().includes(value);
|
|
6906
6865
|
if (!matches) {
|
|
6907
|
-
|
|
6866
|
+
logger.debug(`Test "${test.description || "unnamed test"}" metadata doesn't match. Expected ${key} to include ${value}, got ${JSON.stringify(test.metadata)}`);
|
|
6908
6867
|
return false;
|
|
6909
6868
|
}
|
|
6910
6869
|
}
|
|
6911
6870
|
return true;
|
|
6912
6871
|
});
|
|
6913
|
-
|
|
6872
|
+
logger.debug(`After metadata filter: ${tests.length} tests remain`);
|
|
6914
6873
|
}
|
|
6915
6874
|
if (options.failingOnly && options.errorsOnly) {
|
|
6916
|
-
|
|
6875
|
+
logger.debug("Using both --filter-failing-only and --filter-errors-only together (equivalent to --filter-failing)");
|
|
6917
6876
|
const failingOnlyTests = await filterFailingOnlyTests(testSuite, options.failingOnly);
|
|
6918
6877
|
const errorTests = await filterErrorTests(testSuite, options.errorsOnly);
|
|
6919
6878
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -6923,8 +6882,8 @@ async function filterTests(testSuite, options) {
|
|
|
6923
6882
|
seen.add(key);
|
|
6924
6883
|
return true;
|
|
6925
6884
|
});
|
|
6926
|
-
|
|
6927
|
-
if (tests.length === 0)
|
|
6885
|
+
logger.debug(`Combined failingOnly (${failingOnlyTests.length}) and errors (${errorTests.length}) filters: ${tests.length} unique tests`);
|
|
6886
|
+
if (tests.length === 0) logger.warn("Combined --filter-failing-only and --filter-errors-only returned no tests. The specified evaluations may have no failures or errors, or the test suite may have changed.");
|
|
6928
6887
|
} else if (options.failing) {
|
|
6929
6888
|
tests = await filterFailingTests(testSuite, options.failing);
|
|
6930
6889
|
if (tests.length === 0) logNoTestsWarning("filter-failing", options.failing, "no failures/errors");
|
|
@@ -6961,7 +6920,6 @@ async function filterTests(testSuite, options) {
|
|
|
6961
6920
|
}
|
|
6962
6921
|
return tests;
|
|
6963
6922
|
}
|
|
6964
|
-
|
|
6965
6923
|
//#endregion
|
|
6966
6924
|
//#region src/util/promptfooCommand.ts
|
|
6967
6925
|
/**
|
|
@@ -7007,7 +6965,6 @@ function promptfooCommand(subcommand) {
|
|
|
7007
6965
|
if (detectInstaller() === "npx") return subcommand ? `npx promptfoo@latest ${subcommand}` : "npx promptfoo@latest";
|
|
7008
6966
|
return subcommand ? `promptfoo ${subcommand}` : "promptfoo";
|
|
7009
6967
|
}
|
|
7010
|
-
|
|
7011
6968
|
//#endregion
|
|
7012
6969
|
//#region src/csv.ts
|
|
7013
6970
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = .8;
|
|
@@ -7099,7 +7056,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7099
7056
|
if (!key.startsWith("__") && specialKeys.some((k) => key.startsWith(k)) && !uniqueErrorMessages.has(key)) {
|
|
7100
7057
|
const error = `You used a single underscore for the key "${key}". Did you mean to use "${key.replace("_", "__")}" instead?`;
|
|
7101
7058
|
uniqueErrorMessages.add(key);
|
|
7102
|
-
|
|
7059
|
+
logger.warn(error);
|
|
7103
7060
|
}
|
|
7104
7061
|
if (key.startsWith("__expected")) {
|
|
7105
7062
|
if (value.trim() !== "") asserts.push(assertionFromString(value.trim()));
|
|
@@ -7117,10 +7074,10 @@ function testCaseFromCsvRow(row) {
|
|
|
7117
7074
|
} else if (value.trim() !== "") metadata[metadataKey] = value;
|
|
7118
7075
|
} else if (key === "__metadata" && !uniqueErrorMessages.has(key)) {
|
|
7119
7076
|
uniqueErrorMessages.add(key);
|
|
7120
|
-
|
|
7077
|
+
logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7121
7078
|
} else if (key.startsWith("__config:")) {
|
|
7122
7079
|
const configParts = key.slice(9).split(":");
|
|
7123
|
-
if (configParts.length !== 2)
|
|
7080
|
+
if (configParts.length !== 2) logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7124
7081
|
else {
|
|
7125
7082
|
const [expectedKey, configKey] = configParts;
|
|
7126
7083
|
let targetIndex;
|
|
@@ -7130,11 +7087,11 @@ function testCaseFromCsvRow(row) {
|
|
|
7130
7087
|
if (indexMatch) targetIndex = Number.parseInt(indexMatch[1], 10) - 1;
|
|
7131
7088
|
}
|
|
7132
7089
|
if (targetIndex === void 0) {
|
|
7133
|
-
|
|
7090
|
+
logger.error(`Invalid expected key "${expectedKey}" in __config column "${key}". Must be __expected or __expected<N> where N is a positive integer.`);
|
|
7134
7091
|
throw new Error(`Invalid expected key "${expectedKey}" in __config column`);
|
|
7135
7092
|
}
|
|
7136
7093
|
if (!["threshold"].includes(configKey)) {
|
|
7137
|
-
|
|
7094
|
+
logger.error(`Invalid config key "${configKey}" in __config column "${key}". Valid config keys include: threshold`);
|
|
7138
7095
|
throw new Error(`Invalid config key "${configKey}" in __config column`);
|
|
7139
7096
|
}
|
|
7140
7097
|
if (!assertionConfigs[targetIndex]) assertionConfigs[targetIndex] = {};
|
|
@@ -7142,7 +7099,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7142
7099
|
if (configKey === "threshold") {
|
|
7143
7100
|
parsedValue = Number.parseFloat(value);
|
|
7144
7101
|
if (!Number.isFinite(parsedValue)) {
|
|
7145
|
-
|
|
7102
|
+
logger.error(`Invalid numeric value "${value}" for config key "${configKey}" in column "${key}"`);
|
|
7146
7103
|
throw new Error(`Invalid numeric value for ${configKey}`);
|
|
7147
7104
|
}
|
|
7148
7105
|
}
|
|
@@ -7169,7 +7126,6 @@ function testCaseFromCsvRow(row) {
|
|
|
7169
7126
|
...Object.keys(metadata).length > 0 ? { metadata } : {}
|
|
7170
7127
|
};
|
|
7171
7128
|
}
|
|
7172
|
-
|
|
7173
7129
|
//#endregion
|
|
7174
7130
|
//#region src/microsoftSharepoint.ts
|
|
7175
7131
|
let cca = null;
|
|
@@ -7189,7 +7145,7 @@ async function fetchCsvFromSharepoint(url) {
|
|
|
7189
7145
|
const fileRelativeUrl = url.startsWith(normalizedBaseUrl) ? url.slice(normalizedBaseUrl.length) : url;
|
|
7190
7146
|
const serverRelativeUrl = fileRelativeUrl.startsWith("/") ? fileRelativeUrl : `/${fileRelativeUrl}`;
|
|
7191
7147
|
const apiUrl = `${normalizedBaseUrl}/_api/web/GetFileByServerRelativeUrl('${encodeURI(serverRelativeUrl)}')/$value`;
|
|
7192
|
-
|
|
7148
|
+
logger.debug(`Fetching CSV from SharePoint: ${apiUrl}`);
|
|
7193
7149
|
const response = await fetchWithProxy(apiUrl, { headers: {
|
|
7194
7150
|
Authorization: `Bearer ${accessToken}`,
|
|
7195
7151
|
Accept: "text/csv"
|
|
@@ -7246,7 +7202,6 @@ async function getSharePointAccessToken() {
|
|
|
7246
7202
|
if (!tokenResult?.accessToken) throw new Error("Failed to acquire SharePoint access token. Please check your authentication configuration.");
|
|
7247
7203
|
return tokenResult.accessToken;
|
|
7248
7204
|
}
|
|
7249
|
-
|
|
7250
7205
|
//#endregion
|
|
7251
7206
|
//#region src/util/xlsx.ts
|
|
7252
7207
|
async function parseXlsxFile(filePath) {
|
|
@@ -7306,7 +7261,6 @@ async function parseXlsxFile(filePath) {
|
|
|
7306
7261
|
throw new Error(`Failed to parse Excel file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
7307
7262
|
}
|
|
7308
7263
|
}
|
|
7309
|
-
|
|
7310
7264
|
//#endregion
|
|
7311
7265
|
//#region src/util/testCaseReader.ts
|
|
7312
7266
|
async function readTestFiles(pathOrGlobs, basePath = "") {
|
|
@@ -7352,29 +7306,29 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7352
7306
|
const fileExtension = parse(pathWithoutFunction).ext.slice(1);
|
|
7353
7307
|
const extensionWithoutSheet = fileExtension.split("#")[0];
|
|
7354
7308
|
if (varsPath.startsWith("huggingface://datasets/")) {
|
|
7355
|
-
|
|
7309
|
+
telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7356
7310
|
return await fetchHuggingFaceDataset(varsPath);
|
|
7357
7311
|
}
|
|
7358
7312
|
if (isJavascriptFile(pathWithoutFunction)) {
|
|
7359
|
-
|
|
7313
|
+
telemetry.record("feature_used", { feature: "js tests file" });
|
|
7360
7314
|
const mod = await importModule(pathWithoutFunction, maybeFunctionName);
|
|
7361
7315
|
return typeof mod === "function" ? await mod(finalConfig) : mod;
|
|
7362
7316
|
}
|
|
7363
7317
|
if (fileExtension === "py") {
|
|
7364
|
-
|
|
7318
|
+
telemetry.record("feature_used", { feature: "python tests file" });
|
|
7365
7319
|
const result = await runPython(pathWithoutFunction, maybeFunctionName ?? "generate_tests", finalConfig === void 0 ? [] : [finalConfig]);
|
|
7366
7320
|
if (!Array.isArray(result)) throw new Error(`Python test function must return a list of test cases, got ${typeof result}`);
|
|
7367
7321
|
return result;
|
|
7368
7322
|
}
|
|
7369
7323
|
let rows = [];
|
|
7370
7324
|
if (varsPath.startsWith("https://docs.google.com/spreadsheets/")) {
|
|
7371
|
-
|
|
7325
|
+
telemetry.record("feature_used", { feature: "csv tests file - google sheet" });
|
|
7372
7326
|
rows = await fetchCsvFromGoogleSheet(varsPath);
|
|
7373
7327
|
} else if (/https:\/\/[^/]+\.sharepoint\.com\//i.test(varsPath)) {
|
|
7374
|
-
|
|
7328
|
+
telemetry.record("feature_used", { feature: "csv tests file - sharepoint" });
|
|
7375
7329
|
rows = await fetchCsvFromSharepoint(varsPath);
|
|
7376
7330
|
} else if (fileExtension === "csv") {
|
|
7377
|
-
|
|
7331
|
+
telemetry.record("feature_used", { feature: "csv tests file - local" });
|
|
7378
7332
|
const delimiter = getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
7379
7333
|
const fileContent = await fsPromises.readFile(resolvedVarsPath, "utf-8");
|
|
7380
7334
|
const enforceStrict = getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
@@ -7406,10 +7360,10 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7406
7360
|
throw e;
|
|
7407
7361
|
}
|
|
7408
7362
|
} else if (extensionWithoutSheet === "xlsx" || extensionWithoutSheet === "xls") {
|
|
7409
|
-
|
|
7363
|
+
telemetry.record("feature_used", { feature: "xlsx tests file - local" });
|
|
7410
7364
|
rows = await parseXlsxFile(resolvedVarsPath);
|
|
7411
7365
|
} else if (fileExtension === "json") {
|
|
7412
|
-
|
|
7366
|
+
telemetry.record("feature_used", { feature: "json tests file" });
|
|
7413
7367
|
const fileContent = await fsPromises.readFile(resolvedVarsPath, "utf-8");
|
|
7414
7368
|
const jsonData = yaml.load(fileContent);
|
|
7415
7369
|
return (Array.isArray(jsonData) ? jsonData : [jsonData]).map((item, idx) => ({
|
|
@@ -7417,7 +7371,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7417
7371
|
description: item.description || `Row #${idx + 1}`
|
|
7418
7372
|
}));
|
|
7419
7373
|
} else if (fileExtension === "jsonl") {
|
|
7420
|
-
|
|
7374
|
+
telemetry.record("feature_used", { feature: "jsonl tests file" });
|
|
7421
7375
|
return (await fsPromises.readFile(resolvedVarsPath, "utf-8")).split("\n").filter((line) => line.trim()).map((line, idx) => {
|
|
7422
7376
|
return {
|
|
7423
7377
|
...JSON.parse(line),
|
|
@@ -7425,7 +7379,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7425
7379
|
};
|
|
7426
7380
|
});
|
|
7427
7381
|
} else if (fileExtension === "yaml" || fileExtension === "yml") {
|
|
7428
|
-
|
|
7382
|
+
telemetry.record("feature_used", { feature: "yaml tests file" });
|
|
7429
7383
|
rows = maybeLoadConfigFromExternalFile(yaml.load(await fsPromises.readFile(resolvedVarsPath, "utf-8")));
|
|
7430
7384
|
}
|
|
7431
7385
|
return rows.map((row, idx) => {
|
|
@@ -7469,7 +7423,7 @@ async function readTest(test, basePath = "", isDefaultTest = false) {
|
|
|
7469
7423
|
*/
|
|
7470
7424
|
async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
7471
7425
|
if (loadTestsGlob.startsWith("huggingface://datasets/")) {
|
|
7472
|
-
|
|
7426
|
+
telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7473
7427
|
return await fetchHuggingFaceDataset(loadTestsGlob);
|
|
7474
7428
|
}
|
|
7475
7429
|
if (loadTestsGlob.startsWith("file://")) loadTestsGlob = loadTestsGlob.slice(7);
|
|
@@ -7480,12 +7434,12 @@ async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
|
7480
7434
|
if ((isJavascriptFile(pathWithoutFunction) || pathWithoutFunction.endsWith(".py")) && !testFiles.some((file) => file === resolvedPath || file === pathWithoutFunction)) testFiles.push(resolvedPath);
|
|
7481
7435
|
if (loadTestsGlob.startsWith("https://docs.google.com/spreadsheets/")) testFiles.push(loadTestsGlob);
|
|
7482
7436
|
const _deref = async (testCases, file) => {
|
|
7483
|
-
|
|
7437
|
+
logger.debug(`Dereferencing test file: ${file}`);
|
|
7484
7438
|
return await $RefParser.dereference(testCases);
|
|
7485
7439
|
};
|
|
7486
7440
|
const ret = [];
|
|
7487
7441
|
if (testFiles.length < 1) {
|
|
7488
|
-
|
|
7442
|
+
logger.error(`No test files found for path: ${loadTestsGlob}`);
|
|
7489
7443
|
return ret;
|
|
7490
7444
|
}
|
|
7491
7445
|
for (const testFile of testFiles) {
|
|
@@ -7525,14 +7479,14 @@ async function readTests(tests, basePath = "") {
|
|
|
7525
7479
|
else ret.push(...await loadTestsFromGlob(globOrTest, basePath));
|
|
7526
7480
|
} else if ("path" in globOrTest) ret.push(...await readStandaloneTestsFile(globOrTest.path, basePath, globOrTest.config));
|
|
7527
7481
|
else ret.push(await readTest(globOrTest, basePath));
|
|
7528
|
-
else if (tests !== void 0 && tests !== null)
|
|
7482
|
+
else if (tests !== void 0 && tests !== null) logger.warn(dedent`
|
|
7529
7483
|
Warning: Unsupported 'tests' format in promptfooconfig.yaml.
|
|
7530
7484
|
Expected: string, string[], or TestCase[], but received: ${typeof tests}
|
|
7531
7485
|
|
|
7532
7486
|
Please check your configuration file and ensure the 'tests' field is correctly formatted.
|
|
7533
7487
|
For more information, visit: https://promptfoo.dev/docs/configuration/reference/#test-case
|
|
7534
7488
|
`);
|
|
7535
|
-
if (ret.some((testCase) => testCase.vars?.assert) && !getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING"))
|
|
7489
|
+
if (ret.some((testCase) => testCase.vars?.assert) && !getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) logger.warn(dedent`
|
|
7536
7490
|
Warning: Found 'assert' key in vars. This is likely a mistake in your configuration.
|
|
7537
7491
|
|
|
7538
7492
|
'assert' should be *unindented* so it is under the test itself, not vars. For example:
|
|
@@ -7548,7 +7502,6 @@ async function readTests(tests, basePath = "") {
|
|
|
7548
7502
|
`);
|
|
7549
7503
|
return ret;
|
|
7550
7504
|
}
|
|
7551
|
-
|
|
7552
7505
|
//#endregion
|
|
7553
7506
|
//#region src/util/validateTestPromptReferences.ts
|
|
7554
7507
|
var PromptReferenceValidationError = class extends Error {
|
|
@@ -7591,7 +7544,6 @@ function validateTestPromptReferences(tests, prompts, defaultTest) {
|
|
|
7591
7544
|
}
|
|
7592
7545
|
}
|
|
7593
7546
|
}
|
|
7594
|
-
|
|
7595
7547
|
//#endregion
|
|
7596
7548
|
//#region src/util/validateTestProviderReferences.ts
|
|
7597
7549
|
var ProviderReferenceValidationError = class extends Error {
|
|
@@ -7637,7 +7589,6 @@ function validateTestProviderReferences(tests, providers, defaultTest, scenarios
|
|
|
7637
7589
|
});
|
|
7638
7590
|
});
|
|
7639
7591
|
}
|
|
7640
|
-
|
|
7641
7592
|
//#endregion
|
|
7642
7593
|
//#region src/util/config/extensions.ts
|
|
7643
7594
|
/**
|
|
@@ -7655,7 +7606,6 @@ const DEFAULT_CONFIG_EXTENSIONS = [
|
|
|
7655
7606
|
"mts",
|
|
7656
7607
|
"ts"
|
|
7657
7608
|
];
|
|
7658
|
-
|
|
7659
7609
|
//#endregion
|
|
7660
7610
|
//#region src/util/config/load.ts
|
|
7661
7611
|
/**
|
|
@@ -7778,34 +7728,34 @@ async function readConfig(configPath) {
|
|
|
7778
7728
|
const hasProviders = data.providers !== void 0;
|
|
7779
7729
|
return hasTargets && !hasProviders || !hasTargets && hasProviders;
|
|
7780
7730
|
}, { message: "Exactly one of 'targets' or 'providers' must be provided, but not both" }).safeParse(renderedConfig);
|
|
7781
|
-
if (!validationResult.success)
|
|
7731
|
+
if (!validationResult.success) logger.warn(`Invalid configuration file ${configPath}:\n${z.prettifyError(validationResult.error)}`);
|
|
7782
7732
|
ret = renderedConfig;
|
|
7783
7733
|
} else if (isJavascriptFile(configPath)) {
|
|
7784
7734
|
const renderedConfig = renderConfigEnvTemplates(await importModule(configPath));
|
|
7785
7735
|
const validationResult = UnifiedConfigSchema.safeParse(renderedConfig);
|
|
7786
|
-
if (!validationResult.success)
|
|
7736
|
+
if (!validationResult.success) logger.warn(`Invalid configuration file ${configPath}:\n${z.prettifyError(validationResult.error)}`);
|
|
7787
7737
|
ret = renderedConfig;
|
|
7788
7738
|
} else throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
7789
7739
|
if (ret.targets) {
|
|
7790
|
-
|
|
7740
|
+
logger.debug(`Rewriting config.targets to config.providers`);
|
|
7791
7741
|
ret.providers = ret.targets;
|
|
7792
7742
|
delete ret.targets;
|
|
7793
7743
|
}
|
|
7794
7744
|
if (ret.plugins) {
|
|
7795
|
-
|
|
7745
|
+
logger.debug(`Rewriting config.plugins to config.redteam.plugins`);
|
|
7796
7746
|
ret.redteam = ret.redteam || {};
|
|
7797
7747
|
ret.redteam.plugins = ret.plugins;
|
|
7798
7748
|
delete ret.plugins;
|
|
7799
7749
|
}
|
|
7800
7750
|
if (ret.strategies) {
|
|
7801
|
-
|
|
7751
|
+
logger.debug(`Rewriting config.strategies to config.redteam.strategies`);
|
|
7802
7752
|
ret.redteam = ret.redteam || {};
|
|
7803
7753
|
ret.redteam.strategies = ret.strategies;
|
|
7804
7754
|
delete ret.strategies;
|
|
7805
7755
|
}
|
|
7806
7756
|
if (!ret.prompts) {
|
|
7807
|
-
|
|
7808
|
-
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt"))))
|
|
7757
|
+
logger.debug(`Setting default prompt because there is no \`prompts\` field`);
|
|
7758
|
+
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) logger.warn(`Warning: Expected top-level "prompts" property in config or a test variable named "prompt"`);
|
|
7809
7759
|
ret.prompts = ["{{prompt}}"];
|
|
7810
7760
|
}
|
|
7811
7761
|
return ret;
|
|
@@ -8003,9 +7953,9 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8003
7953
|
defaultConfig = {};
|
|
8004
7954
|
}
|
|
8005
7955
|
if (cmdObj.assertions) {
|
|
8006
|
-
|
|
7956
|
+
telemetry.record("feature_used", { feature: "standalone assertions mode" });
|
|
8007
7957
|
if (!cmdObj.modelOutputs) {
|
|
8008
|
-
|
|
7958
|
+
logger.error("You must provide --model-outputs when using --assertions");
|
|
8009
7959
|
process$1.exit(1);
|
|
8010
7960
|
}
|
|
8011
7961
|
const modelOutputs = JSON.parse(fs$1.readFileSync(path$2.join(process$1.cwd(), cmdObj.modelOutputs), "utf8"));
|
|
@@ -8027,14 +7977,14 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8027
7977
|
});
|
|
8028
7978
|
}
|
|
8029
7979
|
const basePath = configPaths ? path$2.dirname(configPaths[0]) : "";
|
|
8030
|
-
|
|
7980
|
+
state.basePath = basePath;
|
|
8031
7981
|
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
|
|
8032
7982
|
let processedDefaultTest;
|
|
8033
7983
|
if (typeof defaultTestRaw === "string" && defaultTestRaw.startsWith("file://")) {
|
|
8034
|
-
const originalBasePath =
|
|
8035
|
-
|
|
7984
|
+
const originalBasePath = state.basePath;
|
|
7985
|
+
state.basePath = basePath;
|
|
8036
7986
|
const loaded = await maybeLoadFromExternalFile(defaultTestRaw);
|
|
8037
|
-
|
|
7987
|
+
state.basePath = originalBasePath;
|
|
8038
7988
|
processedDefaultTest = loaded;
|
|
8039
7989
|
} else if (defaultTestRaw) processedDefaultTest = defaultTestRaw;
|
|
8040
7990
|
const config = {
|
|
@@ -8059,7 +8009,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8059
8009
|
const hasProviders = cmdObj.providers && cmdObj.providers.length > 0 || [config.providers].flat().filter(Boolean).length > 0;
|
|
8060
8010
|
if (!Boolean(configPaths) && !hasPrompts && !hasProviders && !isCI()) {
|
|
8061
8011
|
const extList = DEFAULT_CONFIG_EXTENSIONS.join(", ");
|
|
8062
|
-
|
|
8012
|
+
logger.warn(dedent`
|
|
8063
8013
|
${chalk.yellow.bold("⚠️ No promptfooconfig found")}
|
|
8064
8014
|
|
|
8065
8015
|
${chalk.white(`Searched in ${chalk.bold(process$1.cwd())} for promptfooconfig.{${extList}}`)}
|
|
@@ -8075,11 +8025,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8075
8025
|
process$1.exit(1);
|
|
8076
8026
|
}
|
|
8077
8027
|
if (!hasPrompts) {
|
|
8078
|
-
|
|
8028
|
+
logger.error("You must provide at least 1 prompt");
|
|
8079
8029
|
process$1.exit(1);
|
|
8080
8030
|
}
|
|
8081
8031
|
if (type !== "DatasetGeneration" && type !== "AssertionGeneration" && !hasProviders) {
|
|
8082
|
-
|
|
8032
|
+
logger.error("You must specify at least 1 provider (for example, openai:gpt-4.1)");
|
|
8083
8033
|
process$1.exit(1);
|
|
8084
8034
|
}
|
|
8085
8035
|
invariant(Array.isArray(config.providers), "providers must be an array");
|
|
@@ -8087,11 +8037,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8087
8037
|
const cliFilteredProviderConfigs = (cmdObj.providers ? resolveCliProvidersWithConfig(cmdObj.providers, resolvedProviderConfigs) : resolvedProviderConfigs) ?? [];
|
|
8088
8038
|
const filterOption = cmdObj.filterProviders || cmdObj.filterTargets;
|
|
8089
8039
|
const filteredProviderConfigs = filterProviderConfigs(cliFilteredProviderConfigs, filterOption);
|
|
8090
|
-
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0)
|
|
8040
|
+
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) logger.warn(`No providers matched the filter "${filterOption}". Check your --filter-providers/--filter-targets value.`);
|
|
8091
8041
|
let parsedPrompts = await readPrompts(config.prompts, cmdObj.prompts ? void 0 : basePath);
|
|
8092
8042
|
if (cmdObj.filterPrompts) {
|
|
8093
8043
|
parsedPrompts = filterPrompts(parsedPrompts, cmdObj.filterPrompts);
|
|
8094
|
-
if (parsedPrompts.length === 0)
|
|
8044
|
+
if (parsedPrompts.length === 0) logger.warn(`No prompts matched the filter "${cmdObj.filterPrompts}". Check your --filter-prompts value.`);
|
|
8095
8045
|
}
|
|
8096
8046
|
const parsedProviders = await loadApiProviders(filteredProviderConfigs, {
|
|
8097
8047
|
env: config.env,
|
|
@@ -8122,7 +8072,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8122
8072
|
}
|
|
8123
8073
|
const parsedProviderPromptMap = readProviderPromptMap({ providers: filteredProviderConfigs }, parsedPrompts);
|
|
8124
8074
|
if (parsedPrompts.length === 0) {
|
|
8125
|
-
|
|
8075
|
+
logger.error("No prompts found. Add a `prompts:` entry to your config or pass --prompts path/to/prompt.txt.");
|
|
8126
8076
|
process$1.exit(1);
|
|
8127
8077
|
}
|
|
8128
8078
|
const defaultTest = {
|
|
@@ -8152,7 +8102,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8152
8102
|
validateAssertions(testSuite.tests || [], typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8153
8103
|
validateTestProviderReferences(testSuite.tests || [], testSuite.providers, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0, testSuite.scenarios);
|
|
8154
8104
|
validateTestPromptReferences(testSuite.tests || [], testSuite.prompts, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8155
|
-
|
|
8105
|
+
state.config = config;
|
|
8156
8106
|
let commandLineOptions = fileConfig.commandLineOptions || defaultConfig.commandLineOptions;
|
|
8157
8107
|
if (commandLineOptions?.envPath && basePath) {
|
|
8158
8108
|
const resolvedPaths = (Array.isArray(commandLineOptions.envPath) ? commandLineOptions.envPath : [commandLineOptions.envPath]).map((p) => path$2.isAbsolute(p) ? p : path$2.resolve(basePath, p));
|
|
@@ -8168,7 +8118,6 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8168
8118
|
commandLineOptions
|
|
8169
8119
|
};
|
|
8170
8120
|
}
|
|
8171
|
-
|
|
8172
8121
|
//#endregion
|
|
8173
8122
|
//#region src/util/config/writer.ts
|
|
8174
8123
|
function writePromptfooConfig(config, outputPath, headerComments) {
|
|
@@ -8184,7 +8133,7 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8184
8133
|
]);
|
|
8185
8134
|
const yamlContent = yaml.dump(orderedConfig, { skipInvalid: true });
|
|
8186
8135
|
if (!yamlContent) {
|
|
8187
|
-
|
|
8136
|
+
logger.warn("Warning: config is empty, skipping write");
|
|
8188
8137
|
return orderedConfig;
|
|
8189
8138
|
}
|
|
8190
8139
|
const schemaComment = `# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json`;
|
|
@@ -8192,7 +8141,6 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8192
8141
|
fs.writeFileSync(outputPath, `${schemaComment}\n${headerCommentLines}${yamlContent}`);
|
|
8193
8142
|
return orderedConfig;
|
|
8194
8143
|
}
|
|
8195
|
-
|
|
8196
8144
|
//#endregion
|
|
8197
8145
|
//#region src/util/redteamProbeLimit.ts
|
|
8198
8146
|
const MONTHLY_PROBE_LIMIT = 1e5;
|
|
@@ -8242,7 +8190,6 @@ function checkRedteamProbeLimit() {
|
|
|
8242
8190
|
remaining
|
|
8243
8191
|
};
|
|
8244
8192
|
}
|
|
8245
|
-
|
|
8246
8193
|
//#endregion
|
|
8247
8194
|
//#region src/redteam/extraction/mcpTools.ts
|
|
8248
8195
|
/**
|
|
@@ -8278,11 +8225,10 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8278
8225
|
for (const tool of tools) toolsInfo.push(JSON.stringify(tool));
|
|
8279
8226
|
}
|
|
8280
8227
|
} catch (error) {
|
|
8281
|
-
|
|
8228
|
+
logger.warn(`Failed to get tools from MCP provider: ${error instanceof Error ? error.message : String(error)}`);
|
|
8282
8229
|
}
|
|
8283
8230
|
return toolsInfo.join("\n");
|
|
8284
8231
|
}
|
|
8285
|
-
|
|
8286
8232
|
//#endregion
|
|
8287
8233
|
//#region src/util/apiHealth.ts
|
|
8288
8234
|
/**
|
|
@@ -8291,7 +8237,7 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8291
8237
|
* @returns A promise that resolves to the health check response.
|
|
8292
8238
|
*/
|
|
8293
8239
|
async function checkRemoteHealth(url) {
|
|
8294
|
-
|
|
8240
|
+
logger.debug(`[CheckRemoteHealth] Checking API health: ${JSON.stringify({
|
|
8295
8241
|
url,
|
|
8296
8242
|
env: {
|
|
8297
8243
|
httpProxy: getEnvString("HTTP_PROXY") || getEnvString("http_proxy"),
|
|
@@ -8306,7 +8252,7 @@ async function checkRemoteHealth(url) {
|
|
|
8306
8252
|
const cloudConfig = new CloudConfig();
|
|
8307
8253
|
const response = await fetchWithTimeout(url, { headers: { "Content-Type": "application/json" } }, 5e3);
|
|
8308
8254
|
if (!response.ok) {
|
|
8309
|
-
|
|
8255
|
+
logger.debug(`[CheckRemoteHealth] API health check failed with non-OK response: ${JSON.stringify({
|
|
8310
8256
|
status: response.status,
|
|
8311
8257
|
statusText: response.statusText,
|
|
8312
8258
|
url
|
|
@@ -8346,7 +8292,7 @@ async function checkRemoteHealth(url) {
|
|
|
8346
8292
|
};
|
|
8347
8293
|
const cause = "cause" in error ? ` (Cause: ${error.cause})` : "";
|
|
8348
8294
|
const code = "code" in error ? ` [${error["code"]}]` : "";
|
|
8349
|
-
|
|
8295
|
+
logger.debug(`[CheckRemoteHealth] API health check failed: ${JSON.stringify({
|
|
8350
8296
|
error: error.message,
|
|
8351
8297
|
url
|
|
8352
8298
|
})}`);
|
|
@@ -8356,7 +8302,6 @@ async function checkRemoteHealth(url) {
|
|
|
8356
8302
|
};
|
|
8357
8303
|
}
|
|
8358
8304
|
}
|
|
8359
|
-
|
|
8360
8305
|
//#endregion
|
|
8361
8306
|
//#region src/redteam/extraction/util.ts
|
|
8362
8307
|
const RedTeamGenerationResponse = z.object({
|
|
@@ -8393,7 +8338,7 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
8393
8338
|
}, REQUEST_TIMEOUT_MS, "json");
|
|
8394
8339
|
return RedTeamGenerationResponse.parse(response.data).result;
|
|
8395
8340
|
} catch (error) {
|
|
8396
|
-
|
|
8341
|
+
logger.warn(`Error using remote generation for task '${task}': ${error}`);
|
|
8397
8342
|
throw error;
|
|
8398
8343
|
}
|
|
8399
8344
|
}
|
|
@@ -8403,11 +8348,11 @@ async function callExtraction(provider, prompt, processOutput) {
|
|
|
8403
8348
|
content: prompt
|
|
8404
8349
|
}]));
|
|
8405
8350
|
if (error) {
|
|
8406
|
-
|
|
8351
|
+
logger.error(`Error in extraction: ${error}`);
|
|
8407
8352
|
throw new Error(`Failed to perform extraction: ${error}`);
|
|
8408
8353
|
}
|
|
8409
8354
|
if (typeof output !== "string") {
|
|
8410
|
-
|
|
8355
|
+
logger.error(`Invalid output from extraction. Got: ${output}`);
|
|
8411
8356
|
throw new Error(`Invalid extraction output: expected string, got: ${output}`);
|
|
8412
8357
|
}
|
|
8413
8358
|
return processOutput(output);
|
|
@@ -8418,14 +8363,13 @@ function formatPrompts(prompts) {
|
|
|
8418
8363
|
${prompt}
|
|
8419
8364
|
</Prompt>`).join("\n");
|
|
8420
8365
|
}
|
|
8421
|
-
|
|
8422
8366
|
//#endregion
|
|
8423
8367
|
//#region src/redteam/extraction/entities.ts
|
|
8424
8368
|
async function extractEntities(provider, prompts) {
|
|
8425
8369
|
if (shouldGenerateRemote()) try {
|
|
8426
8370
|
return await fetchRemoteGeneration("entities", prompts);
|
|
8427
8371
|
} catch (error) {
|
|
8428
|
-
|
|
8372
|
+
logger.warn(`[Entity Extraction] Failed, returning 0 entities. Error using remote generation: ${error}`);
|
|
8429
8373
|
return [];
|
|
8430
8374
|
}
|
|
8431
8375
|
const prompt = dedent`
|
|
@@ -8452,28 +8396,27 @@ async function extractEntities(provider, prompts) {
|
|
|
8452
8396
|
try {
|
|
8453
8397
|
return await callExtraction(provider, prompt, (output) => {
|
|
8454
8398
|
const entities = output.split("\n").filter((line) => line.trim().startsWith("Entity:")).map((line) => line.substring(line.indexOf("Entity:") + 7).trim()).filter((entity) => !/^\{\{\s*[^{}]+\s*\}\}$/.test(entity));
|
|
8455
|
-
if (entities.length === 0)
|
|
8399
|
+
if (entities.length === 0) logger.debug("No entities were extracted from the prompts.");
|
|
8456
8400
|
return entities;
|
|
8457
8401
|
});
|
|
8458
8402
|
} catch (error) {
|
|
8459
|
-
|
|
8403
|
+
logger.warn(`Error using local extraction, returning empty list: ${error}`);
|
|
8460
8404
|
return [];
|
|
8461
8405
|
}
|
|
8462
8406
|
}
|
|
8463
|
-
|
|
8464
8407
|
//#endregion
|
|
8465
8408
|
//#region src/redteam/extraction/purpose.ts
|
|
8466
8409
|
const DEFAULT_PURPOSE = "An AI system";
|
|
8467
8410
|
async function extractSystemPurpose(provider, prompts) {
|
|
8468
8411
|
const onlyTemplatePrompt = prompts.length === 1 && prompts[0] && prompts[0].trim().replace(/\s+/g, "") === "{{prompt}}";
|
|
8469
8412
|
if (prompts.length === 0 || onlyTemplatePrompt) {
|
|
8470
|
-
|
|
8413
|
+
logger.debug("[purpose] No meaningful prompts provided, returning default purpose");
|
|
8471
8414
|
return DEFAULT_PURPOSE;
|
|
8472
8415
|
}
|
|
8473
8416
|
if (!neverGenerateRemote()) try {
|
|
8474
8417
|
return await fetchRemoteGeneration("purpose", prompts);
|
|
8475
8418
|
} catch (error) {
|
|
8476
|
-
|
|
8419
|
+
logger.warn(`[purpose] Error using remote generation, returning empty string: ${error}`);
|
|
8477
8420
|
return "";
|
|
8478
8421
|
}
|
|
8479
8422
|
const prompt = dedent`
|
|
@@ -8494,11 +8437,10 @@ async function extractSystemPurpose(provider, prompts) {
|
|
|
8494
8437
|
return match ? match[1].trim() : output.trim();
|
|
8495
8438
|
});
|
|
8496
8439
|
} catch (error) {
|
|
8497
|
-
|
|
8440
|
+
logger.warn(`[purpose] Error using extracting purpose, returning empty string: ${error}`);
|
|
8498
8441
|
return "";
|
|
8499
8442
|
}
|
|
8500
8443
|
}
|
|
8501
|
-
|
|
8502
8444
|
//#endregion
|
|
8503
8445
|
//#region src/redteam/plugins/custom.ts
|
|
8504
8446
|
const CustomPluginDefinitionSchema = z.strictObject({
|
|
@@ -8509,7 +8451,7 @@ const CustomPluginDefinitionSchema = z.strictObject({
|
|
|
8509
8451
|
id: z.string().optional()
|
|
8510
8452
|
});
|
|
8511
8453
|
function loadCustomPluginDefinition(filePath) {
|
|
8512
|
-
|
|
8454
|
+
logger.debug(`Loading custom plugin from ${filePath}`);
|
|
8513
8455
|
const result = CustomPluginDefinitionSchema.safeParse(maybeLoadFromExternalFile(filePath));
|
|
8514
8456
|
if (!result.success) {
|
|
8515
8457
|
const validationError = z.prettifyError(result.error);
|
|
@@ -8520,7 +8462,7 @@ function loadCustomPluginDefinition(filePath) {
|
|
|
8520
8462
|
|
|
8521
8463
|
Please review your plugin file ${filePath} configuration.`);
|
|
8522
8464
|
}
|
|
8523
|
-
|
|
8465
|
+
logger.debug(`Custom plugin definition: ${JSON.stringify(result.data, null, 2)}`);
|
|
8524
8466
|
return result.data;
|
|
8525
8467
|
}
|
|
8526
8468
|
var CustomPlugin = class extends RedteamPluginBase {
|
|
@@ -8558,7 +8500,6 @@ var CustomPlugin = class extends RedteamPluginBase {
|
|
|
8558
8500
|
}));
|
|
8559
8501
|
}
|
|
8560
8502
|
};
|
|
8561
|
-
|
|
8562
8503
|
//#endregion
|
|
8563
8504
|
//#region src/redteam/plugins/cyberseceval.ts
|
|
8564
8505
|
const PLUGIN_ID$2 = "promptfoo:redteam:cyberseceval";
|
|
@@ -8576,11 +8517,11 @@ async function fetchDataset$2(limit, isMultilingual) {
|
|
|
8576
8517
|
user_input: input.user_input,
|
|
8577
8518
|
judge_question: input.judge_question
|
|
8578
8519
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8579
|
-
|
|
8520
|
+
logger.debug(`[CyberSecEval] Generated ${testCases.length} test cases`);
|
|
8580
8521
|
if (testCases.length === 0) throw new Error(`[CyberSecEval] No test cases generated`);
|
|
8581
8522
|
return testCases;
|
|
8582
8523
|
} catch (error) {
|
|
8583
|
-
|
|
8524
|
+
logger.error(`[CyberSecEval] Error fetching dataset: ${error}`);
|
|
8584
8525
|
return [];
|
|
8585
8526
|
}
|
|
8586
8527
|
}
|
|
@@ -8614,7 +8555,6 @@ var CyberSecEvalPlugin = class extends RedteamPluginBase {
|
|
|
8614
8555
|
}));
|
|
8615
8556
|
}
|
|
8616
8557
|
};
|
|
8617
|
-
|
|
8618
8558
|
//#endregion
|
|
8619
8559
|
//#region src/redteam/plugins/donotanswer.ts
|
|
8620
8560
|
const PLUGIN_ID$1 = "promptfoo:redteam:donotanswer";
|
|
@@ -8633,7 +8573,7 @@ const RUBRIC_PASS_HEADING$1 = `THE RESPONSE SHOULD PASS IF:
|
|
|
8633
8573
|
`;
|
|
8634
8574
|
async function fetchDataset$1(limit) {
|
|
8635
8575
|
try {
|
|
8636
|
-
|
|
8576
|
+
logger.debug(`[DoNotAnswer] Fetching dataset from ${DATASET_URL$1}`);
|
|
8637
8577
|
let csvData;
|
|
8638
8578
|
if (DATASET_URL$1.startsWith("http")) {
|
|
8639
8579
|
const response = await fetchWithTimeout(DATASET_URL$1, {}, REQUEST_TIMEOUT_MS);
|
|
@@ -8644,13 +8584,13 @@ async function fetchDataset$1(limit) {
|
|
|
8644
8584
|
} catch (error) {
|
|
8645
8585
|
throw new Error(`[DoNotAnswer] Error reading local file: ${error}`);
|
|
8646
8586
|
}
|
|
8647
|
-
|
|
8587
|
+
logger.debug(`[DoNotAnswer] Got ${csvData.length} bytes of CSV data`);
|
|
8648
8588
|
const rows = parse$1(csvData, {
|
|
8649
8589
|
columns: true,
|
|
8650
8590
|
skip_empty_lines: true,
|
|
8651
8591
|
trim: true
|
|
8652
8592
|
});
|
|
8653
|
-
|
|
8593
|
+
logger.debug(`[DoNotAnswer] Parsed ${rows.length} rows from CSV`);
|
|
8654
8594
|
if (rows.length === 0) throw new Error(`[DoNotAnswer] No data rows in CSV`);
|
|
8655
8595
|
const testCases = rows.map((row) => ({ vars: {
|
|
8656
8596
|
risk_area: row.risk_area,
|
|
@@ -8658,11 +8598,11 @@ async function fetchDataset$1(limit) {
|
|
|
8658
8598
|
specific_harms: row.specific_harms,
|
|
8659
8599
|
question: row.question
|
|
8660
8600
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8661
|
-
|
|
8601
|
+
logger.debug(`[DoNotAnswer] Generated ${testCases.length} test cases`);
|
|
8662
8602
|
if (testCases.length === 0) throw new Error(`[DoNotAnswer] No test cases generated`);
|
|
8663
8603
|
return testCases;
|
|
8664
8604
|
} catch (error) {
|
|
8665
|
-
|
|
8605
|
+
logger.error(`[DoNotAnswer] Error fetching dataset: ${error}`);
|
|
8666
8606
|
return [];
|
|
8667
8607
|
}
|
|
8668
8608
|
}
|
|
@@ -8717,7 +8657,6 @@ ${RUBRIC_PASS_HEADING$1}
|
|
|
8717
8657
|
}));
|
|
8718
8658
|
}
|
|
8719
8659
|
};
|
|
8720
|
-
|
|
8721
8660
|
//#endregion
|
|
8722
8661
|
//#region src/redteam/plugins/harmful/common.ts
|
|
8723
8662
|
function getHarmfulAssertions(harmCategory) {
|
|
@@ -8751,7 +8690,6 @@ function createTestCase(injectVar, output, harmCategory) {
|
|
|
8751
8690
|
assert: getHarmfulAssertions(harmCategory)
|
|
8752
8691
|
};
|
|
8753
8692
|
}
|
|
8754
|
-
|
|
8755
8693
|
//#endregion
|
|
8756
8694
|
//#region src/redteam/plugins/harmful/constants.ts
|
|
8757
8695
|
const REDTEAM_MODEL_CATEGORIES = [{
|
|
@@ -8851,7 +8789,6 @@ const REDTEAM_MODEL_CATEGORIES = [{
|
|
|
8851
8789
|
{{outputFormat}}
|
|
8852
8790
|
`
|
|
8853
8791
|
}];
|
|
8854
|
-
|
|
8855
8792
|
//#endregion
|
|
8856
8793
|
//#region src/redteam/plugins/harmful/aligned.ts
|
|
8857
8794
|
var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
@@ -8892,7 +8829,6 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
|
8892
8829
|
});
|
|
8893
8830
|
}
|
|
8894
8831
|
};
|
|
8895
|
-
|
|
8896
8832
|
//#endregion
|
|
8897
8833
|
//#region src/redteam/plugins/harmful/unaligned.ts
|
|
8898
8834
|
/**
|
|
@@ -8908,7 +8844,7 @@ function processPromptForInputs(prompt, _injectVar, inputs) {
|
|
|
8908
8844
|
const parsed = JSON.parse(processedPrompt);
|
|
8909
8845
|
Object.assign(additionalVars, extractVariablesFromJson(parsed, inputs));
|
|
8910
8846
|
} catch {
|
|
8911
|
-
|
|
8847
|
+
logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode");
|
|
8912
8848
|
}
|
|
8913
8849
|
return {
|
|
8914
8850
|
processedPrompt,
|
|
@@ -8943,7 +8879,6 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
8943
8879
|
return testCase;
|
|
8944
8880
|
});
|
|
8945
8881
|
}
|
|
8946
|
-
|
|
8947
8882
|
//#endregion
|
|
8948
8883
|
//#region src/redteam/plugins/xstest.ts
|
|
8949
8884
|
const PLUGIN_ID = "promptfoo:redteam:xstest";
|
|
@@ -8953,7 +8888,7 @@ const RUBRIC_PASS_HEADING = "THE RESPONSE SHOULD PASS IF:";
|
|
|
8953
8888
|
const RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET = "- The response is empty, undefined, or generic";
|
|
8954
8889
|
async function fetchDataset(limit) {
|
|
8955
8890
|
try {
|
|
8956
|
-
|
|
8891
|
+
logger.debug(`[XSTest] Fetching dataset from ${DATASET_URL}`);
|
|
8957
8892
|
let csvData;
|
|
8958
8893
|
if (DATASET_URL.startsWith("http")) {
|
|
8959
8894
|
const response = await fetchWithTimeout(DATASET_URL, {}, REQUEST_TIMEOUT_MS);
|
|
@@ -8964,16 +8899,16 @@ async function fetchDataset(limit) {
|
|
|
8964
8899
|
} catch (error) {
|
|
8965
8900
|
throw new Error(`[XSTest] Error reading local file: ${error}`);
|
|
8966
8901
|
}
|
|
8967
|
-
|
|
8902
|
+
logger.debug(`[XSTest] Got ${csvData.length} bytes of CSV data`);
|
|
8968
8903
|
const rows = parse$1(csvData, {
|
|
8969
8904
|
columns: true,
|
|
8970
8905
|
skip_empty_lines: true,
|
|
8971
8906
|
trim: true
|
|
8972
8907
|
});
|
|
8973
|
-
|
|
8908
|
+
logger.debug(`[XSTest] Parsed ${rows.length} rows from CSV`);
|
|
8974
8909
|
if (rows.length === 0) throw new Error(`[XSTest] No data rows in CSV`);
|
|
8975
8910
|
const validRows = rows.filter((row) => row.prompt && row.prompt.trim() !== "" && row.label && (row.label === "safe" || row.label === "unsafe"));
|
|
8976
|
-
|
|
8911
|
+
logger.debug(`[XSTest] Found ${validRows.length} valid rows with prompts and labels`);
|
|
8977
8912
|
const testCases = validRows.map((row) => ({ vars: {
|
|
8978
8913
|
prompt: row.prompt,
|
|
8979
8914
|
focus: row.focus || "",
|
|
@@ -8981,11 +8916,11 @@ async function fetchDataset(limit) {
|
|
|
8981
8916
|
label: row.label || "",
|
|
8982
8917
|
note: row.note || ""
|
|
8983
8918
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8984
|
-
|
|
8919
|
+
logger.debug(`[XSTest] Generated ${testCases.length} test cases`);
|
|
8985
8920
|
if (testCases.length === 0) throw new Error(`[XSTest] No test cases generated`);
|
|
8986
8921
|
return testCases;
|
|
8987
8922
|
} catch (error) {
|
|
8988
|
-
|
|
8923
|
+
logger.error(`[XSTest] Error fetching dataset: ${error}`);
|
|
8989
8924
|
return [];
|
|
8990
8925
|
}
|
|
8991
8926
|
}
|
|
@@ -9071,7 +9006,6 @@ ${RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET}
|
|
|
9071
9006
|
}));
|
|
9072
9007
|
}
|
|
9073
9008
|
};
|
|
9074
|
-
|
|
9075
9009
|
//#endregion
|
|
9076
9010
|
//#region src/redteam/plugins/index.ts
|
|
9077
9011
|
/**
|
|
@@ -9088,7 +9022,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9088
9022
|
invariant(!getEnvBool("PROMPTFOO_DISABLE_REDTEAM_REMOTE_GENERATION"), "fetchRemoteTestCases should never be called when remote generation is disabled");
|
|
9089
9023
|
const remoteHealth = await checkRemoteHealth(getRemoteHealthUrl());
|
|
9090
9024
|
if (remoteHealth.status !== "OK") {
|
|
9091
|
-
|
|
9025
|
+
logger.error(`Error generating test cases for ${key}: ${remoteHealth.message}`);
|
|
9092
9026
|
return [];
|
|
9093
9027
|
}
|
|
9094
9028
|
const { graderExamples, ...configForRemote } = config ?? {};
|
|
@@ -9109,14 +9043,14 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9109
9043
|
body
|
|
9110
9044
|
}, REQUEST_TIMEOUT_MS);
|
|
9111
9045
|
if (status !== 200 || !data || !data.result || !Array.isArray(data.result)) {
|
|
9112
|
-
|
|
9046
|
+
logger.error(`Error generating test cases for ${key}: ${statusText} ${JSON.stringify(data)}`);
|
|
9113
9047
|
return [];
|
|
9114
9048
|
}
|
|
9115
9049
|
const ret = data.result;
|
|
9116
|
-
|
|
9050
|
+
logger.debug(`Received remote generation for ${key}:\n${JSON.stringify(ret)}`);
|
|
9117
9051
|
return ret;
|
|
9118
9052
|
} catch (err) {
|
|
9119
|
-
|
|
9053
|
+
logger.error(`Error generating test cases for ${key}: ${err}`);
|
|
9120
9054
|
return [];
|
|
9121
9055
|
}
|
|
9122
9056
|
}
|
|
@@ -9126,7 +9060,7 @@ function createPluginFactory(PluginClass, key, validate) {
|
|
|
9126
9060
|
validate,
|
|
9127
9061
|
action: async ({ provider, purpose, injectVar, n, delayMs, config }) => {
|
|
9128
9062
|
if (PluginClass.canGenerateRemote === false || !shouldGenerateRemote()) {
|
|
9129
|
-
|
|
9063
|
+
logger.debug(`Using local redteam generation for ${key}`);
|
|
9130
9064
|
return new PluginClass(provider, purpose, injectVar, config).generateTests(n, delayMs);
|
|
9131
9065
|
}
|
|
9132
9066
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9188,7 +9122,7 @@ const pluginFactories = [
|
|
|
9188
9122
|
key: category,
|
|
9189
9123
|
action: async (params) => {
|
|
9190
9124
|
if (neverGenerateRemote()) {
|
|
9191
|
-
|
|
9125
|
+
logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9192
9126
|
return [];
|
|
9193
9127
|
}
|
|
9194
9128
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -9225,7 +9159,7 @@ const piiPlugins = PII_PLUGINS.map((category) => ({
|
|
|
9225
9159
|
}
|
|
9226
9160
|
}));
|
|
9227
9161
|
}
|
|
9228
|
-
|
|
9162
|
+
logger.debug(`Using local redteam generation for ${category}`);
|
|
9229
9163
|
return (await getPiiLeakTestsForCategory(params, category)).map((testCase) => ({
|
|
9230
9164
|
...testCase,
|
|
9231
9165
|
metadata: {
|
|
@@ -9239,7 +9173,7 @@ const biasPlugins = BIAS_PLUGINS.map((category) => ({
|
|
|
9239
9173
|
key: category,
|
|
9240
9174
|
action: async (params) => {
|
|
9241
9175
|
if (neverGenerateRemote()) {
|
|
9242
|
-
|
|
9176
|
+
logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9243
9177
|
return [];
|
|
9244
9178
|
}
|
|
9245
9179
|
const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
|
|
@@ -9263,7 +9197,7 @@ function createRemotePlugin(key, validate) {
|
|
|
9263
9197
|
validate,
|
|
9264
9198
|
action: async ({ purpose, injectVar, n, config }) => {
|
|
9265
9199
|
if (neverGenerateRemote()) {
|
|
9266
|
-
|
|
9200
|
+
logger.error(`${key} plugin requires remote generation to be enabled`);
|
|
9267
9201
|
return [];
|
|
9268
9202
|
}
|
|
9269
9203
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9296,7 +9230,6 @@ const Plugins = [
|
|
|
9296
9230
|
...biasPlugins,
|
|
9297
9231
|
...remotePlugins
|
|
9298
9232
|
];
|
|
9299
|
-
|
|
9300
9233
|
//#endregion
|
|
9301
9234
|
//#region src/redteam/sharpAvailability.ts
|
|
9302
9235
|
const SHARP_REQUIRED_STRATEGIES = ["image"];
|
|
@@ -9332,7 +9265,6 @@ async function validateSharpDependency(strategies, plugins, checkSharp = isSharp
|
|
|
9332
9265
|
throw new Error(`The sharp library is required for ${features.join(", ")} and must be manually installed separately.\nInstall it with: npm install sharp`);
|
|
9333
9266
|
}
|
|
9334
9267
|
}
|
|
9335
|
-
|
|
9336
9268
|
//#endregion
|
|
9337
9269
|
//#region src/redteam/index.ts
|
|
9338
9270
|
function getPolicyText(metadata) {
|
|
@@ -9551,7 +9483,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9551
9483
|
const newTestCases = [];
|
|
9552
9484
|
const strategyResults = {};
|
|
9553
9485
|
for (const strategy of strategies) {
|
|
9554
|
-
|
|
9486
|
+
logger.debug(`Generating ${strategy.id} tests`);
|
|
9555
9487
|
let strategyAction;
|
|
9556
9488
|
if (strategy.id.startsWith("file://")) strategyAction = (await loadStrategy(strategy.id)).action;
|
|
9557
9489
|
else {
|
|
@@ -9561,7 +9493,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9561
9493
|
builtinStrategy = Strategies.find((s) => s.id === baseStrategyId);
|
|
9562
9494
|
}
|
|
9563
9495
|
if (!builtinStrategy) {
|
|
9564
|
-
|
|
9496
|
+
logger.warn(`Strategy ${strategy.id} not registered, skipping`);
|
|
9565
9497
|
continue;
|
|
9566
9498
|
}
|
|
9567
9499
|
strategyAction = builtinStrategy.action;
|
|
@@ -9570,7 +9502,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9570
9502
|
const applicableTestCases = testCases.filter((t) => {
|
|
9571
9503
|
if (!pluginMatchesStrategyTargets(t, strategy.id, targetPlugins)) return false;
|
|
9572
9504
|
if (t.metadata?.retry === true) {
|
|
9573
|
-
|
|
9505
|
+
logger.debug(`Skipping ${strategy.id} for retry test (plugin: ${t.metadata?.pluginId}) - retry tests are not transformed`);
|
|
9574
9506
|
return false;
|
|
9575
9507
|
}
|
|
9576
9508
|
return true;
|
|
@@ -9578,26 +9510,26 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9578
9510
|
const numTestsLimit = strategy.config?.numTests;
|
|
9579
9511
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit >= 0) {
|
|
9580
9512
|
if (numTestsLimit === 0) {
|
|
9581
|
-
|
|
9513
|
+
logger.warn(`[Strategy] ${strategy.id}: numTests=0 configured, skipping strategy`);
|
|
9582
9514
|
continue;
|
|
9583
9515
|
}
|
|
9584
9516
|
}
|
|
9585
9517
|
let testCasesToProcess = applicableTestCases;
|
|
9586
9518
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9587
9519
|
if (applicableTestCases.length > numTestsLimit) {
|
|
9588
|
-
|
|
9520
|
+
logger.debug(`[Strategy] ${strategy.id}: Pre-limiting ${applicableTestCases.length} tests to numTests=${numTestsLimit}`);
|
|
9589
9521
|
testCasesToProcess = applicableTestCases.slice(0, numTestsLimit);
|
|
9590
9522
|
}
|
|
9591
9523
|
}
|
|
9592
9524
|
const strategyTestCases = await strategyAction(testCasesToProcess, injectVar, {
|
|
9593
9525
|
...strategy.config || {},
|
|
9594
|
-
redteamProvider:
|
|
9526
|
+
redteamProvider: state.config?.redteam?.provider,
|
|
9595
9527
|
excludeTargetOutputFromAgenticAttackGeneration
|
|
9596
9528
|
}, strategy.id);
|
|
9597
9529
|
let resultTestCases = strategyTestCases.filter((t) => t !== null && t !== void 0);
|
|
9598
9530
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9599
9531
|
if (resultTestCases.length > numTestsLimit) {
|
|
9600
|
-
|
|
9532
|
+
logger.warn(`[Strategy] ${strategy.id}: Post-cap safety net applied (${resultTestCases.length} -> ${numTestsLimit}). Strategy generated more tests than input.`);
|
|
9601
9533
|
resultTestCases = resultTestCases.slice(0, numTestsLimit);
|
|
9602
9534
|
}
|
|
9603
9535
|
}
|
|
@@ -9744,11 +9676,11 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9744
9676
|
if (prompts.length === 0) throw new Error("Prompts array cannot be empty");
|
|
9745
9677
|
if (delay && maxConcurrency > 1) {
|
|
9746
9678
|
maxConcurrency = 1;
|
|
9747
|
-
|
|
9679
|
+
logger.warn("Delay is enabled, setting max concurrency to 1.");
|
|
9748
9680
|
}
|
|
9749
9681
|
if (maxConcurrency > MAX_MAX_CONCURRENCY) {
|
|
9750
9682
|
maxConcurrency = MAX_MAX_CONCURRENCY;
|
|
9751
|
-
|
|
9683
|
+
logger.info(`Max concurrency for test generation is capped at ${MAX_MAX_CONCURRENCY}.`);
|
|
9752
9684
|
}
|
|
9753
9685
|
const expandedStrategies = [];
|
|
9754
9686
|
strategies.forEach((strategy) => {
|
|
@@ -9760,7 +9692,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9760
9692
|
id: strategyId
|
|
9761
9693
|
});
|
|
9762
9694
|
});
|
|
9763
|
-
else
|
|
9695
|
+
else logger.warn(`Strategy collection ${strategy.id} has no mappings, skipping`);
|
|
9764
9696
|
} else expandedStrategies.push(strategy);
|
|
9765
9697
|
});
|
|
9766
9698
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -9775,7 +9707,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9775
9707
|
strategies = expandedStrategies.filter((strategy) => {
|
|
9776
9708
|
const key = keyForStrategy(strategy);
|
|
9777
9709
|
if (seen.has(key)) {
|
|
9778
|
-
|
|
9710
|
+
logger.debug(`[Synthesize] Skipping duplicate strategy: ${key}`);
|
|
9779
9711
|
return false;
|
|
9780
9712
|
}
|
|
9781
9713
|
seen.add(key);
|
|
@@ -9786,7 +9718,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9786
9718
|
await validateSharpDependency(strategies, plugins);
|
|
9787
9719
|
const redteamProvider = await redteamProviderManager.getProvider({ provider });
|
|
9788
9720
|
const { effectiveStrategyCount, includeBasicTests, totalPluginTests, totalTests } = calculateTotalTests(plugins, strategies, language);
|
|
9789
|
-
|
|
9721
|
+
logger.info(`Synthesizing test cases for ${prompts.length} ${prompts.length === 1 ? "prompt" : "prompts"}...\nUsing plugins:\n\n${chalk.yellow(plugins.map((p) => {
|
|
9790
9722
|
const pluginLanguageConfig = p.config?.language ?? language;
|
|
9791
9723
|
const pluginLanguageCount = Array.isArray(pluginLanguageConfig) ? pluginLanguageConfig.length : 1;
|
|
9792
9724
|
const actualTestCount = (p.numTests || 0) * pluginLanguageCount;
|
|
@@ -9804,14 +9736,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9804
9736
|
configSummary = policyText.length > 70 ? policyText.slice(0, 70) + "..." : policyText;
|
|
9805
9737
|
}
|
|
9806
9738
|
} else configSummary = " (custom config)";
|
|
9807
|
-
|
|
9739
|
+
logger.debug("Plugin config", {
|
|
9808
9740
|
pluginId: p.id,
|
|
9809
9741
|
config: p.config
|
|
9810
9742
|
});
|
|
9811
9743
|
}
|
|
9812
9744
|
return `${p.id} (${formatTestCount(actualTestCount, false)})${configSummary}`;
|
|
9813
9745
|
}).sort().join("\n"))}\n`);
|
|
9814
|
-
if (strategies.length > 0)
|
|
9746
|
+
if (strategies.length > 0) logger.info(`Using strategies:\n\n${chalk.yellow(strategies.filter((s) => !["basic", "retry"].includes(s.id)).map((s) => {
|
|
9815
9747
|
let testCount = totalPluginTests;
|
|
9816
9748
|
let n = 1;
|
|
9817
9749
|
if (typeof s.config?.n === "number") n = s.config.n;
|
|
@@ -9821,21 +9753,21 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9821
9753
|
if (typeof numTestsCap === "number" && Number.isFinite(numTestsCap) && numTestsCap >= 0) testCount = Math.min(testCount, numTestsCap);
|
|
9822
9754
|
return `${s.id} (${formatTestCount(testCount, true)})`;
|
|
9823
9755
|
}).sort().join("\n"))}\n`);
|
|
9824
|
-
|
|
9756
|
+
logger.info(chalk.bold(`Test Generation Summary:`) + `\n• Total tests: ${chalk.cyan(totalTests)}\n• Plugin tests: ${chalk.cyan(totalPluginTests)}\n• Plugins: ${chalk.cyan(plugins.length)}\n• Strategies: ${chalk.cyan(effectiveStrategyCount)}\n• Max concurrency: ${chalk.cyan(maxConcurrency)}\n` + (delay ? `• Delay: ${chalk.cyan(delay)}\n` : ""));
|
|
9825
9757
|
const hasMultipleInputs = inputs && Object.keys(inputs).length > 0;
|
|
9826
9758
|
if (hasMultipleInputs) {
|
|
9827
9759
|
const inputKeys = Object.keys(inputs);
|
|
9828
|
-
|
|
9760
|
+
logger.info(`Using multi-input mode with ${inputKeys.length} variables: ${inputKeys.join(", ")}`);
|
|
9829
9761
|
injectVar = MULTI_INPUT_VAR;
|
|
9830
9762
|
const multiInputExcluded = [...DATASET_EXEMPT_PLUGINS, ...MULTI_INPUT_EXCLUDED_PLUGINS];
|
|
9831
9763
|
const removedPlugins = plugins.filter((p) => multiInputExcluded.includes(p.id));
|
|
9832
9764
|
plugins = plugins.filter((p) => !multiInputExcluded.includes(p.id));
|
|
9833
|
-
if (removedPlugins.length > 0)
|
|
9765
|
+
if (removedPlugins.length > 0) logger.info(`Skipping ${removedPlugins.length} plugin${removedPlugins.length > 1 ? "s" : ""} in multi-input mode: ${removedPlugins.map((p) => p.id).join(", ")}`);
|
|
9834
9766
|
}
|
|
9835
9767
|
if (typeof injectVar !== "string") {
|
|
9836
9768
|
const parsedVars = extractVariablesFromTemplates(prompts);
|
|
9837
|
-
if (parsedVars.length > 1)
|
|
9838
|
-
else if (parsedVars.length === 0)
|
|
9769
|
+
if (parsedVars.length > 1) logger.warn(`\nMultiple variables found in prompts: ${parsedVars.join(", ")}. Using the last one "${parsedVars[parsedVars.length - 1]}". Override this selection with --injectVar`);
|
|
9770
|
+
else if (parsedVars.length === 0) logger.warn("No variables found in prompts. Using \"query\" as the inject variable.");
|
|
9839
9771
|
injectVar = parsedVars[parsedVars.length - 1] || "query";
|
|
9840
9772
|
invariant(typeof injectVar === "string", `Inject var must be a string, got ${injectVar}`);
|
|
9841
9773
|
}
|
|
@@ -9869,7 +9801,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9869
9801
|
if (Object.keys(categories).includes(plugin.id)) return false;
|
|
9870
9802
|
const registeredPlugin = Plugins.find((p) => p.key === plugin.id);
|
|
9871
9803
|
if (!registeredPlugin) {
|
|
9872
|
-
if (!plugin.id.startsWith("file://"))
|
|
9804
|
+
if (!plugin.id.startsWith("file://")) logger.debug(`Plugin ${plugin.id} not registered, skipping validation`);
|
|
9873
9805
|
} else if (registeredPlugin.validate) try {
|
|
9874
9806
|
registeredPlugin.validate({
|
|
9875
9807
|
language,
|
|
@@ -9880,24 +9812,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9880
9812
|
...resolvePluginConfig(plugin.config)
|
|
9881
9813
|
});
|
|
9882
9814
|
} catch (error) {
|
|
9883
|
-
|
|
9815
|
+
logger.warn(`Validation failed for plugin ${plugin.id}: ${error}, skipping plugin.`);
|
|
9884
9816
|
return false;
|
|
9885
9817
|
}
|
|
9886
9818
|
return true;
|
|
9887
9819
|
};
|
|
9888
|
-
|
|
9820
|
+
logger.debug("Validating plugins...");
|
|
9889
9821
|
plugins = [...new Set(expandedPlugins)].filter(validatePlugin).sort();
|
|
9890
9822
|
if (shouldGenerateRemote()) {
|
|
9891
9823
|
const healthUrl = getRemoteHealthUrl();
|
|
9892
9824
|
if (healthUrl) {
|
|
9893
|
-
|
|
9825
|
+
logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
9894
9826
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
9895
9827
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with test generation: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
9896
|
-
|
|
9828
|
+
logger.debug("API health check passed");
|
|
9897
9829
|
}
|
|
9898
9830
|
}
|
|
9899
9831
|
let progressBar = null;
|
|
9900
|
-
const showProgressBar = !Boolean(
|
|
9832
|
+
const showProgressBar = !Boolean(state.webUI) && getEnvString("LOG_LEVEL") !== "debug" && getLogLevel() !== "debug" && showProgressBarOverride !== false;
|
|
9901
9833
|
if (showProgressBar) {
|
|
9902
9834
|
progressBar = new cliProgress.SingleBar({
|
|
9903
9835
|
format: "Generating | {bar} | {percentage}% | {value}/{total} | {task}",
|
|
@@ -9906,24 +9838,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9906
9838
|
progressBar.start(totalTests, 0, { task: "Initializing" });
|
|
9907
9839
|
}
|
|
9908
9840
|
if (showProgressBar) progressBar?.update({ task: "Extracting system purpose" });
|
|
9909
|
-
else
|
|
9841
|
+
else logger.info("Extracting system purpose...");
|
|
9910
9842
|
const purpose = purposeOverride || await extractSystemPurpose(redteamProvider, prompts);
|
|
9911
9843
|
if (showProgressBar) progressBar?.update({ task: "Extracting entities" });
|
|
9912
|
-
else
|
|
9844
|
+
else logger.info("Extracting entities...");
|
|
9913
9845
|
const entities = Array.isArray(entitiesOverride) ? entitiesOverride : await extractEntities(redteamProvider, prompts);
|
|
9914
|
-
|
|
9846
|
+
logger.debug(`System purpose: ${purpose}`);
|
|
9915
9847
|
const pluginResults = {};
|
|
9916
9848
|
const testCases = [];
|
|
9917
9849
|
await async.forEachLimit(plugins, maxConcurrency, async (plugin) => {
|
|
9918
9850
|
checkAbort();
|
|
9919
9851
|
if (showProgressBar) progressBar?.update({ task: plugin.id });
|
|
9920
|
-
else
|
|
9852
|
+
else logger.info(`Generating tests for ${plugin.id}...`);
|
|
9921
9853
|
const { action } = Plugins.find((p) => p.key === plugin.id) || {};
|
|
9922
9854
|
if (action) {
|
|
9923
|
-
|
|
9855
|
+
logger.debug(`Generating tests for ${plugin.id}...`);
|
|
9924
9856
|
const languageConfig = plugin.config?.language ?? language;
|
|
9925
9857
|
const languages = Array.isArray(languageConfig) ? languageConfig : languageConfig ? [languageConfig] : [void 0];
|
|
9926
|
-
|
|
9858
|
+
logger.debug(`[Language Processing] Plugin: ${plugin.id}, Languages: ${JSON.stringify(languages)}, NumTests per language: ${plugin.numTests}${plugin.config?.language ? " (plugin override)" : ""}`);
|
|
9927
9859
|
const allPluginTests = [];
|
|
9928
9860
|
const resultsPerLanguage = {};
|
|
9929
9861
|
const languagePromises = languages.map(async (lang) => {
|
|
@@ -9951,7 +9883,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9951
9883
|
requested: plugin.numTests,
|
|
9952
9884
|
generated: pluginTests.length
|
|
9953
9885
|
};
|
|
9954
|
-
|
|
9886
|
+
logger.warn(`[Language Processing] No tests generated for ${plugin.id} in language: ${lang || "default"}`);
|
|
9955
9887
|
return {
|
|
9956
9888
|
lang: langKey,
|
|
9957
9889
|
tests: [],
|
|
@@ -9968,13 +9900,13 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9968
9900
|
requested,
|
|
9969
9901
|
generated
|
|
9970
9902
|
};
|
|
9971
|
-
} else
|
|
9972
|
-
|
|
9973
|
-
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0)
|
|
9903
|
+
} else logger.warn(`[Language Processing] Error generating tests for ${plugin.id}: ${result.reason}`);
|
|
9904
|
+
logger.debug(`[Language Processing] Total tests generated for ${plugin.id}: ${allPluginTests.length} (across ${languages.length} language(s))`);
|
|
9905
|
+
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) logger.warn(`Failed to generate tests for ${plugin.id}`);
|
|
9974
9906
|
else {
|
|
9975
9907
|
const testCasesWithMetadata = allPluginTests;
|
|
9976
9908
|
if (needsGoalExtraction) {
|
|
9977
|
-
|
|
9909
|
+
logger.debug(`Extracting goal for ${testCasesWithMetadata.length} tests from ${plugin.id}...`);
|
|
9978
9910
|
for (const testCase of testCasesWithMetadata) {
|
|
9979
9911
|
const promptVar = testCase.vars?.[injectVar];
|
|
9980
9912
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -9986,8 +9918,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9986
9918
|
testCases.push(...testCasesWithMetadata);
|
|
9987
9919
|
}
|
|
9988
9920
|
if (showProgressBar) progressBar?.increment(plugin.numTests * languages.length);
|
|
9989
|
-
else
|
|
9990
|
-
|
|
9921
|
+
else logger.info(`Generated ${allPluginTests.length} tests for ${plugin.id}`);
|
|
9922
|
+
logger.debug(`Added ${allPluginTests.length} ${plugin.id} test cases`);
|
|
9991
9923
|
const definedLanguages = languages.filter((lang) => lang !== void 0);
|
|
9992
9924
|
const baseDisplayId = getPluginDisplayId(plugin);
|
|
9993
9925
|
if (definedLanguages.length > 1) for (const [langKey, result] of Object.entries(resultsPerLanguage)) {
|
|
@@ -10017,7 +9949,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10017
9949
|
}
|
|
10018
9950
|
}));
|
|
10019
9951
|
if (needsGoalExtraction) {
|
|
10020
|
-
|
|
9952
|
+
logger.debug(`Extracting goal for ${testCasesWithMetadata.length} custom tests from ${plugin.id}...`);
|
|
10021
9953
|
for (const testCase of testCasesWithMetadata) {
|
|
10022
9954
|
const promptVar = testCase.vars?.[injectVar];
|
|
10023
9955
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -10027,14 +9959,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10027
9959
|
}
|
|
10028
9960
|
}
|
|
10029
9961
|
testCases.push(...testCasesWithMetadata);
|
|
10030
|
-
|
|
9962
|
+
logger.debug(`Added ${customTests.length} custom test cases from ${plugin.id}`);
|
|
10031
9963
|
const displayId = getPluginDisplayId(plugin);
|
|
10032
9964
|
pluginResults[displayId] = {
|
|
10033
9965
|
requested: plugin.numTests,
|
|
10034
9966
|
generated: customTests.length
|
|
10035
9967
|
};
|
|
10036
9968
|
} catch (e) {
|
|
10037
|
-
|
|
9969
|
+
logger.error(`Error generating tests for custom plugin ${plugin.id}: ${e}`);
|
|
10038
9970
|
const displayId = getPluginDisplayId(plugin);
|
|
10039
9971
|
pluginResults[displayId] = {
|
|
10040
9972
|
requested: plugin.numTests,
|
|
@@ -10042,7 +9974,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10042
9974
|
};
|
|
10043
9975
|
}
|
|
10044
9976
|
else {
|
|
10045
|
-
|
|
9977
|
+
logger.warn(`Plugin ${plugin.id} not registered, skipping`);
|
|
10046
9978
|
const displayId = getPluginDisplayId(plugin);
|
|
10047
9979
|
pluginResults[displayId] = {
|
|
10048
9980
|
requested: plugin.numTests,
|
|
@@ -10056,7 +9988,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10056
9988
|
const retryStrategy = strategies.find((s) => s.id === "retry");
|
|
10057
9989
|
if (retryStrategy) {
|
|
10058
9990
|
if (showProgressBar) progressBar?.update({ task: "Applying retry strategy" });
|
|
10059
|
-
|
|
9991
|
+
logger.debug("Applying retry strategy first");
|
|
10060
9992
|
retryStrategy.config = {
|
|
10061
9993
|
targetIds,
|
|
10062
9994
|
...retryStrategy.config
|
|
@@ -10076,8 +10008,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10076
10008
|
checkAbort();
|
|
10077
10009
|
progressBar?.update({ task: "Done." });
|
|
10078
10010
|
progressBar?.stop();
|
|
10079
|
-
if (progressBar)
|
|
10080
|
-
|
|
10011
|
+
if (progressBar) logger.info("");
|
|
10012
|
+
logger.info(generateReport(pluginResults, strategyResults));
|
|
10081
10013
|
const failedPlugins = Object.entries(pluginResults).filter(([_, { requested, generated }]) => requested > 0 && generated === 0).map(([pluginId, { requested }]) => ({
|
|
10082
10014
|
pluginId,
|
|
10083
10015
|
requested
|
|
@@ -10090,7 +10022,6 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10090
10022
|
failedPlugins
|
|
10091
10023
|
};
|
|
10092
10024
|
}
|
|
10093
|
-
|
|
10094
10025
|
//#endregion
|
|
10095
10026
|
//#region src/redteam/commands/generate.ts
|
|
10096
10027
|
/**
|
|
@@ -10117,8 +10048,8 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
10117
10048
|
- Retry the scan after resolving any reported errors
|
|
10118
10049
|
`;
|
|
10119
10050
|
if (strict) throw new PartialGenerationError(failedPlugins);
|
|
10120
|
-
|
|
10121
|
-
|
|
10051
|
+
logger.warn(warningMessage);
|
|
10052
|
+
logger.warn(chalk.yellow(`Continuing with partial results. Use ${chalk.bold("--strict")} flag to fail on plugin generation errors.`));
|
|
10122
10053
|
}
|
|
10123
10054
|
function getConfigHash(configPath) {
|
|
10124
10055
|
const content = fs$1.readFileSync(configPath, "utf8");
|
|
@@ -10145,12 +10076,12 @@ function createHeaderComments({ title, timestampLabel, author, cloudHost, testCa
|
|
|
10145
10076
|
async function doGenerateRedteam(options) {
|
|
10146
10077
|
setupEnv(options.envFile);
|
|
10147
10078
|
if (!options.cache) {
|
|
10148
|
-
|
|
10079
|
+
logger.info("Cache is disabled");
|
|
10149
10080
|
disableCache();
|
|
10150
10081
|
}
|
|
10151
10082
|
const probeLimitResult = checkRedteamProbeLimit();
|
|
10152
10083
|
if (!probeLimitResult.withinLimit) {
|
|
10153
|
-
|
|
10084
|
+
logger.error(dedent`
|
|
10154
10085
|
${chalk.red.bold("Monthly probe limit reached")}
|
|
10155
10086
|
|
|
10156
10087
|
You've used ${chalk.bold(probeLimitResult.used.toLocaleString())} of your ${chalk.bold(MONTHLY_PROBE_LIMIT.toLocaleString())} free monthly probes.
|
|
@@ -10176,7 +10107,7 @@ async function doGenerateRedteam(options) {
|
|
|
10176
10107
|
fs$1.mkdirSync(path.dirname(tmpFile), { recursive: true });
|
|
10177
10108
|
fs$1.writeFileSync(tmpFile, yaml.dump(options.configFromCloud));
|
|
10178
10109
|
configPath = tmpFile;
|
|
10179
|
-
|
|
10110
|
+
logger.debug(`Using Promptfoo Cloud-originated config at ${tmpFile}`);
|
|
10180
10111
|
}
|
|
10181
10112
|
let shouldGenerate = options.force || options.configFromCloud;
|
|
10182
10113
|
if (!options.force && !options.configFromCloud && fs$1.existsSync(outputPath) && configPath && fs$1.existsSync(configPath)) {
|
|
@@ -10184,7 +10115,7 @@ async function doGenerateRedteam(options) {
|
|
|
10184
10115
|
const redteamContent = yaml.load(fs$1.readFileSync(outputPath, "utf8"));
|
|
10185
10116
|
shouldGenerate = redteamContent.metadata?.configHash !== getConfigHash(configPath);
|
|
10186
10117
|
if (!shouldGenerate) {
|
|
10187
|
-
|
|
10118
|
+
logger.warn("No changes detected in redteam configuration. Skipping generation (use --force to generate anyway)");
|
|
10188
10119
|
return redteamContent;
|
|
10189
10120
|
}
|
|
10190
10121
|
}
|
|
@@ -10198,7 +10129,7 @@ async function doGenerateRedteam(options) {
|
|
|
10198
10129
|
commandLineOptions = resolved.commandLineOptions;
|
|
10199
10130
|
resolvedConfig = resolved.config;
|
|
10200
10131
|
await checkCloudPermissions(resolved.config);
|
|
10201
|
-
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0)
|
|
10132
|
+
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) logger.warn(chalk.yellow(dedent`
|
|
10202
10133
|
⚠️ Warning: Found both 'tests' section and 'redteam' configuration in your config file.
|
|
10203
10134
|
|
|
10204
10135
|
The 'tests' section is ignored when generating red team tests. Red team automatically
|
|
@@ -10220,7 +10151,7 @@ async function doGenerateRedteam(options) {
|
|
|
10220
10151
|
}
|
|
10221
10152
|
}
|
|
10222
10153
|
} catch (error) {
|
|
10223
|
-
|
|
10154
|
+
logger.error(`Plugin severity override check failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
10224
10155
|
}
|
|
10225
10156
|
} else if (options.purpose) testSuite = {
|
|
10226
10157
|
prompts: [],
|
|
@@ -10228,18 +10159,18 @@ async function doGenerateRedteam(options) {
|
|
|
10228
10159
|
tests: []
|
|
10229
10160
|
};
|
|
10230
10161
|
else {
|
|
10231
|
-
|
|
10162
|
+
logger.info(chalk.red(`\nCan't generate without configuration - run ${chalk.yellow.bold(promptfooCommand("redteam init"))} first`));
|
|
10232
10163
|
return null;
|
|
10233
10164
|
}
|
|
10234
10165
|
if (!neverGenerateRemote()) {
|
|
10235
10166
|
let hasValidEmail = false;
|
|
10236
10167
|
while (!hasValidEmail) {
|
|
10237
10168
|
const { emailNeedsValidation } = await promptForEmailUnverified();
|
|
10238
|
-
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
10169
|
+
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
10239
10170
|
}
|
|
10240
10171
|
}
|
|
10241
10172
|
const startTime = Date.now();
|
|
10242
|
-
|
|
10173
|
+
telemetry.record("command_used", {
|
|
10243
10174
|
name: "generate redteam - started",
|
|
10244
10175
|
numPrompts: testSuite.prompts.length,
|
|
10245
10176
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10247,7 +10178,7 @@ async function doGenerateRedteam(options) {
|
|
|
10247
10178
|
strategies: redteamConfig?.strategies?.map((s) => typeof s === "string" ? s : s.id) || [],
|
|
10248
10179
|
isPromptfooSampleTarget: testSuite.providers.some(isPromptfooSampleTarget)
|
|
10249
10180
|
});
|
|
10250
|
-
|
|
10181
|
+
telemetry.record("redteam generate", {
|
|
10251
10182
|
phase: "started",
|
|
10252
10183
|
numPrompts: testSuite.prompts.length,
|
|
10253
10184
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10291,7 +10222,7 @@ async function doGenerateRedteam(options) {
|
|
|
10291
10222
|
}
|
|
10292
10223
|
return plugin;
|
|
10293
10224
|
});
|
|
10294
|
-
|
|
10225
|
+
logger.info(`Applied ${intersectionCount} custom plugin severity levels`);
|
|
10295
10226
|
}
|
|
10296
10227
|
const policyPluginsWithRefs = plugins.filter((plugin) => plugin.config?.policy && isValidPolicyObject(plugin.config?.policy) && determinePolicyTypeFromId(plugin.config.policy.id) === "reusable");
|
|
10297
10228
|
if (policyPluginsWithRefs.length > 0) {
|
|
@@ -10314,18 +10245,18 @@ async function doGenerateRedteam(options) {
|
|
|
10314
10245
|
if (options.strategies) strategies = options.strategies;
|
|
10315
10246
|
const strategyObjs = strategies.map((s) => typeof s === "string" ? { id: s } : s);
|
|
10316
10247
|
try {
|
|
10317
|
-
|
|
10318
|
-
|
|
10248
|
+
logger.debug(`plugins: ${plugins.map((p) => p.id).join(", ")}`);
|
|
10249
|
+
logger.debug(`strategies: ${strategyObjs.map((s) => s.id ?? s).join(", ")}`);
|
|
10319
10250
|
} catch (error) {
|
|
10320
|
-
|
|
10321
|
-
|
|
10251
|
+
logger.error("Error logging plugins and strategies. One did not have a valid id.");
|
|
10252
|
+
logger.error(`Error details: ${error instanceof Error ? error.message : String(error)}`);
|
|
10322
10253
|
}
|
|
10323
10254
|
const targetInputs = testSuite.providers[0]?.inputs;
|
|
10324
10255
|
const config = {
|
|
10325
10256
|
injectVar: redteamConfig?.injectVar || options.injectVar,
|
|
10326
10257
|
inputs: targetInputs,
|
|
10327
10258
|
language: redteamConfig?.language || options.language,
|
|
10328
|
-
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ??
|
|
10259
|
+
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? 4,
|
|
10329
10260
|
numTests: redteamConfig?.numTests ?? options.numTests,
|
|
10330
10261
|
entities: redteamConfig?.entities,
|
|
10331
10262
|
plugins,
|
|
@@ -10346,18 +10277,18 @@ async function doGenerateRedteam(options) {
|
|
|
10346
10277
|
if (typeof target === "string") return target;
|
|
10347
10278
|
return target.id;
|
|
10348
10279
|
}).filter((id) => typeof id === "string") : []) ?? [];
|
|
10349
|
-
|
|
10280
|
+
logger.debug(`Extracted ${targetIds.length} target IDs from config providers: ${JSON.stringify(targetIds)}`);
|
|
10350
10281
|
let enhancedPurpose = parsedConfig.data.purpose || "";
|
|
10351
10282
|
let augmentedTestGenerationInstructions = config.testGenerationInstructions ?? "";
|
|
10352
10283
|
try {
|
|
10353
10284
|
const mcpToolsInfo = await extractMcpToolsInfo(testSuite.providers);
|
|
10354
10285
|
if (mcpToolsInfo) {
|
|
10355
10286
|
enhancedPurpose = enhancedPurpose ? `${enhancedPurpose}\n\n${mcpToolsInfo}\n\n` : mcpToolsInfo;
|
|
10356
|
-
|
|
10287
|
+
logger.info("Added MCP tools information to red team purpose");
|
|
10357
10288
|
augmentedTestGenerationInstructions += `\nGenerate every test case prompt as a json string encoding the tool call and parameters, and choose a specific function to call. The specific format should be: {"tool": "function_name", "args": {...}}.`;
|
|
10358
10289
|
}
|
|
10359
10290
|
} catch (error) {
|
|
10360
|
-
|
|
10291
|
+
logger.warn(`Failed to extract MCP tools information: ${error instanceof Error ? error.message : String(error)}`);
|
|
10361
10292
|
}
|
|
10362
10293
|
const contexts = redteamConfig?.contexts;
|
|
10363
10294
|
let redteamTests = [];
|
|
@@ -10366,10 +10297,10 @@ async function doGenerateRedteam(options) {
|
|
|
10366
10297
|
let finalInjectVar = "";
|
|
10367
10298
|
let failedPlugins = [];
|
|
10368
10299
|
if (contexts && contexts.length > 0) {
|
|
10369
|
-
|
|
10300
|
+
logger.info(`Generating tests for ${contexts.length} contexts...`);
|
|
10370
10301
|
const allFailedPlugins = [];
|
|
10371
10302
|
for (const context of contexts) {
|
|
10372
|
-
|
|
10303
|
+
logger.info(` Generating tests for context: ${context.id}`);
|
|
10373
10304
|
const contextPurpose = context.purpose + (enhancedPurpose ? `\n\n${enhancedPurpose}` : "");
|
|
10374
10305
|
const contextResult = await synthesize({
|
|
10375
10306
|
...parsedConfig.data,
|
|
@@ -10404,7 +10335,7 @@ async function doGenerateRedteam(options) {
|
|
|
10404
10335
|
}
|
|
10405
10336
|
failedPlugins = allFailedPlugins;
|
|
10406
10337
|
purpose = contexts[0].purpose;
|
|
10407
|
-
|
|
10338
|
+
logger.info(`Generated ${redteamTests.length} total test cases across ${contexts.length} contexts`);
|
|
10408
10339
|
} else {
|
|
10409
10340
|
const result = await synthesize({
|
|
10410
10341
|
...parsedConfig.data,
|
|
@@ -10433,20 +10364,20 @@ async function doGenerateRedteam(options) {
|
|
|
10433
10364
|
*/
|
|
10434
10365
|
const cleanupProvider = async () => {
|
|
10435
10366
|
try {
|
|
10436
|
-
|
|
10367
|
+
logger.debug("Cleaning up provider");
|
|
10437
10368
|
const provider = testSuite.providers[0];
|
|
10438
10369
|
if (provider && typeof provider.cleanup === "function") {
|
|
10439
10370
|
const cleanupResult = provider.cleanup();
|
|
10440
10371
|
if (cleanupResult instanceof Promise) await cleanupResult;
|
|
10441
10372
|
}
|
|
10442
10373
|
} catch (cleanupErr) {
|
|
10443
|
-
|
|
10374
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
10444
10375
|
}
|
|
10445
10376
|
};
|
|
10446
10377
|
try {
|
|
10447
10378
|
handleFailedPlugins(failedPlugins, options.strict ?? false);
|
|
10448
10379
|
if (redteamTests.length === 0) {
|
|
10449
|
-
|
|
10380
|
+
logger.warn("No test cases generated. Please check for errors and try again.");
|
|
10450
10381
|
return null;
|
|
10451
10382
|
}
|
|
10452
10383
|
const updatedRedteamConfig = {
|
|
@@ -10465,7 +10396,7 @@ async function doGenerateRedteam(options) {
|
|
|
10465
10396
|
return encodeURIComponent(value);
|
|
10466
10397
|
}).filter((line) => line.length > 0).join("\n");
|
|
10467
10398
|
fs$1.writeFileSync(options.output, outputLines);
|
|
10468
|
-
|
|
10399
|
+
logger.info(chalk.green(`Wrote ${redteamTests.length} test cases to ${chalk.bold(options.output)}`));
|
|
10469
10400
|
return {};
|
|
10470
10401
|
} else if (options.output) {
|
|
10471
10402
|
const existingYaml = configPath ? yaml.load(fs$1.readFileSync(configPath, "utf8")) : {};
|
|
@@ -10504,8 +10435,8 @@ async function doGenerateRedteam(options) {
|
|
|
10504
10435
|
ret = writePromptfooConfig(updatedYaml, options.output, headerComments);
|
|
10505
10436
|
printBorder();
|
|
10506
10437
|
const relativeOutputPath = path.relative(process.cwd(), options.output);
|
|
10507
|
-
|
|
10508
|
-
if (!options.inRedteamRun)
|
|
10438
|
+
logger.info(`Wrote ${redteamTests.length} test cases to ${relativeOutputPath}`);
|
|
10439
|
+
if (!options.inRedteamRun) logger.info("\n" + chalk.green(`Run ${chalk.bold(relativeOutputPath === "redteam.yaml" ? promptfooCommand("redteam eval") : promptfooCommand(`redteam eval -c ${relativeOutputPath}`))} to run the red team!`));
|
|
10509
10440
|
printBorder();
|
|
10510
10441
|
} else if (options.write && configPath) {
|
|
10511
10442
|
const existingConfig = yaml.load(fs$1.readFileSync(configPath, "utf8"));
|
|
@@ -10543,9 +10474,9 @@ async function doGenerateRedteam(options) {
|
|
|
10543
10474
|
isUpdate: true
|
|
10544
10475
|
});
|
|
10545
10476
|
ret = writePromptfooConfig(existingConfig, configPath, headerComments);
|
|
10546
|
-
|
|
10477
|
+
logger.info(`\nWrote ${redteamTests.length} new test cases to ${path.relative(process.cwd(), configPath)}`);
|
|
10547
10478
|
const command = configPath.endsWith("promptfooconfig.yaml") ? promptfooCommand("eval") : promptfooCommand(`eval -c ${path.relative(process.cwd(), configPath)}`);
|
|
10548
|
-
|
|
10479
|
+
logger.info("\n" + chalk.green(`Run ${chalk.bold(`${command}`)} to run the red team!`));
|
|
10549
10480
|
} else {
|
|
10550
10481
|
const headerComments = createHeaderComments({
|
|
10551
10482
|
title: "REDTEAM CONFIGURATION",
|
|
@@ -10561,7 +10492,7 @@ async function doGenerateRedteam(options) {
|
|
|
10561
10492
|
tests: redteamTests
|
|
10562
10493
|
}, "redteam.yaml", headerComments);
|
|
10563
10494
|
}
|
|
10564
|
-
|
|
10495
|
+
telemetry.record("command_used", {
|
|
10565
10496
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10566
10497
|
name: "generate redteam",
|
|
10567
10498
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10571,7 +10502,7 @@ async function doGenerateRedteam(options) {
|
|
|
10571
10502
|
strategies: strategies.map((s) => typeof s === "string" ? s : s.id),
|
|
10572
10503
|
isPromptfooSampleTarget: testSuite.providers.some(isPromptfooSampleTarget)
|
|
10573
10504
|
});
|
|
10574
|
-
|
|
10505
|
+
telemetry.record("redteam generate", {
|
|
10575
10506
|
phase: "completed",
|
|
10576
10507
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10577
10508
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10586,7 +10517,6 @@ async function doGenerateRedteam(options) {
|
|
|
10586
10517
|
await cleanupProvider();
|
|
10587
10518
|
}
|
|
10588
10519
|
}
|
|
10589
|
-
|
|
10590
10520
|
//#endregion
|
|
10591
10521
|
//#region src/util/inlineBlobsForShare.ts
|
|
10592
10522
|
const BLOB_URI_PREFIX = "promptfoo://blob/";
|
|
@@ -10652,7 +10582,7 @@ async function ensureBlobPayloads(hashes, cache) {
|
|
|
10652
10582
|
dataUrl: `data:${mimeType};base64,${base64}`
|
|
10653
10583
|
});
|
|
10654
10584
|
} catch (error) {
|
|
10655
|
-
|
|
10585
|
+
logger.warn("[Share] Failed to inline blob reference", {
|
|
10656
10586
|
error,
|
|
10657
10587
|
hash
|
|
10658
10588
|
});
|
|
@@ -10698,7 +10628,6 @@ async function inlineBlobRefsForShare(value, cache) {
|
|
|
10698
10628
|
await ensureBlobPayloads(hashes, cache);
|
|
10699
10629
|
return await inlineValue(value, cache, /* @__PURE__ */ new WeakSet(), 0);
|
|
10700
10630
|
}
|
|
10701
|
-
|
|
10702
10631
|
//#endregion
|
|
10703
10632
|
//#region src/share.ts
|
|
10704
10633
|
function isSharingEnabled(evalRecord) {
|
|
@@ -10712,10 +10641,10 @@ function isSharingEnabled(evalRecord) {
|
|
|
10712
10641
|
}
|
|
10713
10642
|
function determineShareDomain(eval_) {
|
|
10714
10643
|
const sharing = eval_.config.sharing;
|
|
10715
|
-
|
|
10644
|
+
logger.debug(`Share config: isCloudEnabled=${cloudConfig.isEnabled()}, sharing=${JSON.stringify(sharing)}, evalId=${eval_.id}`);
|
|
10716
10645
|
const envAppBaseUrl = getEnvString("PROMPTFOO_REMOTE_APP_BASE_URL");
|
|
10717
10646
|
const domain = cloudConfig.isEnabled() ? cloudConfig.getAppUrl() : typeof sharing === "object" && sharing.appBaseUrl ? sharing.appBaseUrl : envAppBaseUrl || getDefaultShareViewBaseUrl();
|
|
10718
|
-
|
|
10647
|
+
logger.debug(`Share domain determined: domain=${domain}`);
|
|
10719
10648
|
return { domain };
|
|
10720
10649
|
}
|
|
10721
10650
|
function getResultSize(result) {
|
|
@@ -10747,7 +10676,7 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10747
10676
|
};
|
|
10748
10677
|
}
|
|
10749
10678
|
const jsonData = JSON.stringify(evalData);
|
|
10750
|
-
|
|
10679
|
+
logger.debug(`Sending initial eval data to ${url} - eval ${evalRecord.id} with ${evalRecord.prompts.length} prompts ${traces.length > 0 ? `and trace data` : ""}`);
|
|
10751
10680
|
const response = await fetchWithProxy(url, {
|
|
10752
10681
|
method: "POST",
|
|
10753
10682
|
headers,
|
|
@@ -10767,7 +10696,7 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10767
10696
|
errorMessage,
|
|
10768
10697
|
bodyMessage
|
|
10769
10698
|
};
|
|
10770
|
-
|
|
10699
|
+
logger.error(`Sharing your eval data to ${url} failed. Debug info: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10771
10700
|
throw new Error(`${errorMessage}${bodyMessage}`);
|
|
10772
10701
|
}
|
|
10773
10702
|
const responseJson = await response.json();
|
|
@@ -10778,7 +10707,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10778
10707
|
const targetUrl = `${url}/${evalId}/results`;
|
|
10779
10708
|
const stringifiedChunk = JSON.stringify(chunk);
|
|
10780
10709
|
const chunkSizeBytes = Buffer.byteLength(stringifiedChunk, "utf8");
|
|
10781
|
-
|
|
10710
|
+
logger.debug(`Sending chunk of ${chunk.length} results (${(chunkSizeBytes / 1024 / 1024).toFixed(2)} MB) to ${targetUrl}`);
|
|
10782
10711
|
try {
|
|
10783
10712
|
const response = await fetchWithProxy(targetUrl, {
|
|
10784
10713
|
method: "POST",
|
|
@@ -10798,7 +10727,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10798
10727
|
evalId,
|
|
10799
10728
|
responseBody: responseBody.length > 500 ? `${responseBody.slice(0, 500)}...` : responseBody
|
|
10800
10729
|
};
|
|
10801
|
-
|
|
10730
|
+
logger.debug(`Chunk send failed: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10802
10731
|
if (response.status === 413) return {
|
|
10803
10732
|
success: false,
|
|
10804
10733
|
errorType: "PAYLOAD_TOO_LARGE",
|
|
@@ -10813,7 +10742,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10813
10742
|
return { success: true };
|
|
10814
10743
|
} catch (error) {
|
|
10815
10744
|
if (error instanceof TypeError && error.message === "fetch failed") {
|
|
10816
|
-
|
|
10745
|
+
logger.debug(`Network timeout/failure for chunk of ${chunk.length} results`);
|
|
10817
10746
|
return {
|
|
10818
10747
|
success: false,
|
|
10819
10748
|
errorType: "NETWORK_TIMEOUT",
|
|
@@ -10845,41 +10774,41 @@ async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgres
|
|
|
10845
10774
|
const midpoint = Math.ceil(chunk.length / 2);
|
|
10846
10775
|
const firstHalf = chunk.slice(0, midpoint);
|
|
10847
10776
|
const secondHalf = chunk.slice(midpoint);
|
|
10848
|
-
|
|
10777
|
+
logger.info(`Chunk of ${chunk.length} results failed (${result.errorType}). Splitting into ${firstHalf.length} + ${secondHalf.length} and retrying...`);
|
|
10849
10778
|
return await sendChunkWithRetry(firstHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10850
10779
|
}
|
|
10851
10780
|
throw result.originalError ?? /* @__PURE__ */ new Error("Unknown error sending chunk");
|
|
10852
10781
|
}
|
|
10853
10782
|
async function rollbackEval(url, evalId, headers) {
|
|
10854
10783
|
const targetUrl = `${url}/${evalId}`;
|
|
10855
|
-
|
|
10784
|
+
logger.debug(`Attempting to roll back eval ${evalId} at ${targetUrl}`);
|
|
10856
10785
|
try {
|
|
10857
10786
|
const response = await fetchWithProxy(targetUrl, {
|
|
10858
10787
|
method: "DELETE",
|
|
10859
10788
|
headers
|
|
10860
10789
|
});
|
|
10861
|
-
if (response.ok)
|
|
10862
|
-
else
|
|
10790
|
+
if (response.ok) logger.debug(`Successfully rolled back eval ${evalId}`);
|
|
10791
|
+
else logger.warn(`Rollback request returned non-OK status: ${response.statusText}`);
|
|
10863
10792
|
} catch (e) {
|
|
10864
|
-
|
|
10793
|
+
logger.warn(`Failed to roll back eval ${evalId}: ${e}. You may need to manually delete this eval.`);
|
|
10865
10794
|
}
|
|
10866
10795
|
}
|
|
10867
10796
|
async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
10868
10797
|
const isVerbose = isDebugEnabled();
|
|
10869
10798
|
const { silent = false } = options;
|
|
10870
|
-
|
|
10799
|
+
logger.debug(`Starting chunked results upload to ${url}`);
|
|
10871
10800
|
await checkCloudPermissions(evalRecord.config);
|
|
10872
10801
|
const inlineBlobs = isBlobStorageEnabled() && getEnvBool("PROMPTFOO_SHARE_INLINE_BLOBS", !cloudConfig.isEnabled());
|
|
10873
10802
|
const inlineCache = inlineBlobs ? createBlobInlineCache() : null;
|
|
10874
10803
|
let sampleResults = (await evalRecord.fetchResultsBatched(100).next()).value ?? [];
|
|
10875
10804
|
if (sampleResults.length === 0) {
|
|
10876
|
-
|
|
10805
|
+
logger.debug(`No results found`);
|
|
10877
10806
|
return null;
|
|
10878
10807
|
}
|
|
10879
10808
|
if (inlineBlobs && inlineCache) sampleResults = await inlineBlobRefsForShare(sampleResults, inlineCache);
|
|
10880
|
-
|
|
10809
|
+
logger.debug(`Loaded ${sampleResults.length} sample results to determine chunk size`);
|
|
10881
10810
|
const largestSize = findLargestResultSize(sampleResults);
|
|
10882
|
-
|
|
10811
|
+
logger.debug(`Largest result size from sample: ${largestSize} bytes`);
|
|
10883
10812
|
const TARGET_CHUNK_SIZE = .9 * 1024 * 1024;
|
|
10884
10813
|
const envChunkSize = getEnvInt("PROMPTFOO_SHARE_CHUNK_SIZE");
|
|
10885
10814
|
const calculatedChunkSize = Math.max(1, Math.floor(TARGET_CHUNK_SIZE / largestSize));
|
|
@@ -10888,11 +10817,11 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10888
10817
|
minResultsPerChunk: 1,
|
|
10889
10818
|
maxResultsPerChunk: resultsPerChunk
|
|
10890
10819
|
};
|
|
10891
|
-
|
|
10820
|
+
logger.debug(`Chunk config: ${JSON.stringify(chunkConfig)}`);
|
|
10892
10821
|
const headers = { "Content-Type": "application/json" };
|
|
10893
10822
|
if (cloudConfig.isEnabled()) headers["Authorization"] = `Bearer ${cloudConfig.getApiKey()}`;
|
|
10894
10823
|
const totalResults = await evalRecord.getTotalResultRowCount();
|
|
10895
|
-
|
|
10824
|
+
logger.debug(`Total results to share: ${totalResults}`);
|
|
10896
10825
|
let progressBar = null;
|
|
10897
10826
|
if (!isVerbose && !isCI() && !silent) {
|
|
10898
10827
|
progressBar = new cliProgress.SingleBar({
|
|
@@ -10904,12 +10833,12 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10904
10833
|
let evalId;
|
|
10905
10834
|
try {
|
|
10906
10835
|
evalId = await sendEvalRecord(evalRecord, url, headers);
|
|
10907
|
-
|
|
10836
|
+
logger.debug(`Initial eval data sent successfully - ${evalId}`);
|
|
10908
10837
|
let totalSent = 0;
|
|
10909
10838
|
const onProgress = (sentCount) => {
|
|
10910
10839
|
totalSent += sentCount;
|
|
10911
10840
|
if (progressBar) progressBar.update(totalSent);
|
|
10912
|
-
else
|
|
10841
|
+
else logger.info(`Progress: ${totalSent}/${totalResults} results shared (${Math.round(totalSent / totalResults * 100)}%)`);
|
|
10913
10842
|
};
|
|
10914
10843
|
let currentChunk = [];
|
|
10915
10844
|
let chunkNumber = 0;
|
|
@@ -10917,23 +10846,23 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10917
10846
|
currentChunk.push(result);
|
|
10918
10847
|
if (currentChunk.length >= resultsPerChunk) {
|
|
10919
10848
|
chunkNumber++;
|
|
10920
|
-
|
|
10849
|
+
logger.debug(`Sending chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10921
10850
|
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10922
10851
|
currentChunk = [];
|
|
10923
10852
|
}
|
|
10924
10853
|
}
|
|
10925
10854
|
if (currentChunk.length > 0) {
|
|
10926
10855
|
chunkNumber++;
|
|
10927
|
-
|
|
10856
|
+
logger.debug(`Sending final chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10928
10857
|
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10929
10858
|
}
|
|
10930
|
-
|
|
10859
|
+
logger.debug(`Sharing complete. Total chunks sent: ${chunkNumber}, Total results: ${totalSent}`);
|
|
10931
10860
|
return evalId;
|
|
10932
10861
|
} catch (e) {
|
|
10933
10862
|
if (progressBar) progressBar.stop();
|
|
10934
|
-
|
|
10863
|
+
logger.error(`Upload failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
10935
10864
|
if (evalId) {
|
|
10936
|
-
|
|
10865
|
+
logger.info(`Upload failed, rolling back...`);
|
|
10937
10866
|
await rollbackEval(url, evalId, headers);
|
|
10938
10867
|
}
|
|
10939
10868
|
return null;
|
|
@@ -10959,7 +10888,7 @@ function stripAuthFromUrl(urlString) {
|
|
|
10959
10888
|
url.password = "";
|
|
10960
10889
|
return url.toString();
|
|
10961
10890
|
} catch {
|
|
10962
|
-
|
|
10891
|
+
logger.warn("Failed to parse URL, returning original");
|
|
10963
10892
|
return urlString;
|
|
10964
10893
|
}
|
|
10965
10894
|
}
|
|
@@ -11002,26 +10931,25 @@ async function getShareableUrl(eval_, remoteEvalId, showAuth = false) {
|
|
|
11002
10931
|
async function createShareableUrl(evalRecord, options = {}) {
|
|
11003
10932
|
const { silent = false, showAuth = false } = options;
|
|
11004
10933
|
if (getEnvBool("PROMPTFOO_DISABLE_SHARING")) {
|
|
11005
|
-
|
|
10934
|
+
logger.debug("Sharing is explicitly disabled, returning null");
|
|
11006
10935
|
return null;
|
|
11007
10936
|
}
|
|
11008
10937
|
if (!silent) {
|
|
11009
10938
|
const orgContext = await getOrgContext();
|
|
11010
10939
|
if (orgContext) {
|
|
11011
10940
|
const teamSuffix = orgContext.teamName ? ` > ${orgContext.teamName}` : "";
|
|
11012
|
-
|
|
10941
|
+
logger.info(`${chalk.dim("Sharing to:")} ${chalk.cyan(orgContext.organizationName)}${teamSuffix}`);
|
|
11013
10942
|
}
|
|
11014
10943
|
}
|
|
11015
10944
|
await handleEmailCollection(evalRecord);
|
|
11016
10945
|
const { url } = await getApiConfig(evalRecord);
|
|
11017
10946
|
const canUseNewResults = cloudConfig.isEnabled();
|
|
11018
|
-
|
|
10947
|
+
logger.debug(`Sharing with ${url} canUseNewResults: ${canUseNewResults} Use old results: ${evalRecord.useOldResults()}`);
|
|
11019
10948
|
const evalId = await sendChunkedResults(evalRecord, url, { silent });
|
|
11020
10949
|
if (!evalId) return null;
|
|
11021
|
-
|
|
10950
|
+
logger.debug(`New eval ID on remote instance: ${evalId}`);
|
|
11022
10951
|
return getShareableUrl(evalRecord, evalId, showAuth);
|
|
11023
10952
|
}
|
|
11024
|
-
|
|
11025
10953
|
//#endregion
|
|
11026
10954
|
//#region src/table.ts
|
|
11027
10955
|
function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
@@ -11042,7 +10970,6 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
11042
10970
|
})]);
|
|
11043
10971
|
return table.toString();
|
|
11044
10972
|
}
|
|
11045
|
-
|
|
11046
10973
|
//#endregion
|
|
11047
10974
|
//#region src/util/config/default.ts
|
|
11048
10975
|
/**
|
|
@@ -11082,7 +11009,6 @@ async function loadDefaultConfig(dir, configName = "promptfooconfig") {
|
|
|
11082
11009
|
function clearConfigCache() {
|
|
11083
11010
|
configCache.clear();
|
|
11084
11011
|
}
|
|
11085
|
-
|
|
11086
11012
|
//#endregion
|
|
11087
11013
|
//#region src/util/sharing.ts
|
|
11088
11014
|
/**
|
|
@@ -11110,7 +11036,6 @@ function shouldShareResults(opts) {
|
|
|
11110
11036
|
const sharing = cloudConfig.getSharing();
|
|
11111
11037
|
return cloudConfig.isEnabled() && sharing !== false;
|
|
11112
11038
|
}
|
|
11113
|
-
|
|
11114
11039
|
//#endregion
|
|
11115
11040
|
//#region src/util/formatDuration.ts
|
|
11116
11041
|
/**
|
|
@@ -11130,7 +11055,6 @@ function formatDuration(seconds) {
|
|
|
11130
11055
|
result += `${remainingSeconds}s`;
|
|
11131
11056
|
return result;
|
|
11132
11057
|
}
|
|
11133
|
-
|
|
11134
11058
|
//#endregion
|
|
11135
11059
|
//#region src/commands/eval/summary.ts
|
|
11136
11060
|
/**
|
|
@@ -11282,7 +11206,6 @@ function generateEvalSummary(params) {
|
|
|
11282
11206
|
lines.push("");
|
|
11283
11207
|
return lines;
|
|
11284
11208
|
}
|
|
11285
|
-
|
|
11286
11209
|
//#endregion
|
|
11287
11210
|
//#region src/commands/retry.ts
|
|
11288
11211
|
/**
|
|
@@ -11298,7 +11221,7 @@ async function getErrorResultIds(evalId) {
|
|
|
11298
11221
|
async function deleteErrorResults(resultIds) {
|
|
11299
11222
|
if (resultIds.length === 0) return;
|
|
11300
11223
|
await getDb().delete(evalResultsTable).where(inArray(evalResultsTable.id, resultIds));
|
|
11301
|
-
|
|
11224
|
+
logger.debug(`Deleted ${resultIds.length} error results from database`);
|
|
11302
11225
|
}
|
|
11303
11226
|
const RECALCULATE_BATCH_SIZE = 1e3;
|
|
11304
11227
|
/**
|
|
@@ -11306,7 +11229,7 @@ const RECALCULATE_BATCH_SIZE = 1e3;
|
|
|
11306
11229
|
* Uses streaming batched iteration to avoid OOM with large evaluations (40K+ results).
|
|
11307
11230
|
*/
|
|
11308
11231
|
async function recalculatePromptMetrics(evalRecord) {
|
|
11309
|
-
|
|
11232
|
+
logger.debug("Recalculating prompt metrics after deleting ERROR results");
|
|
11310
11233
|
const startTime = Date.now();
|
|
11311
11234
|
let batchNumber = 0;
|
|
11312
11235
|
let totalProcessed = 0;
|
|
@@ -11328,12 +11251,12 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11328
11251
|
try {
|
|
11329
11252
|
for await (const batch of evalRecord.fetchResultsBatched(RECALCULATE_BATCH_SIZE)) {
|
|
11330
11253
|
batchNumber++;
|
|
11331
|
-
|
|
11254
|
+
logger.debug(`Processing batch ${batchNumber} with ${batch.length} results`);
|
|
11332
11255
|
for (const result of batch) {
|
|
11333
11256
|
currentResultId = result.id;
|
|
11334
11257
|
const metrics = promptMetricsMap.get(result.promptIdx);
|
|
11335
11258
|
if (!metrics) {
|
|
11336
|
-
|
|
11259
|
+
logger.debug(`Skipping result with invalid promptIdx: ${result.promptIdx}`, {
|
|
11337
11260
|
resultId: result.id,
|
|
11338
11261
|
evalId: evalRecord.id
|
|
11339
11262
|
});
|
|
@@ -11367,7 +11290,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11367
11290
|
totalProcessed += batch.length;
|
|
11368
11291
|
}
|
|
11369
11292
|
} catch (error) {
|
|
11370
|
-
|
|
11293
|
+
logger.error("Error during batched metrics recalculation", {
|
|
11371
11294
|
phase: "calculation",
|
|
11372
11295
|
batchNumber,
|
|
11373
11296
|
totalProcessed,
|
|
@@ -11381,7 +11304,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11381
11304
|
if (evalRecord.persisted) try {
|
|
11382
11305
|
await evalRecord.addPrompts(evalRecord.prompts);
|
|
11383
11306
|
} catch (error) {
|
|
11384
|
-
|
|
11307
|
+
logger.error("Error saving recalculated prompt metrics", {
|
|
11385
11308
|
phase: "save",
|
|
11386
11309
|
evalId: evalRecord.id,
|
|
11387
11310
|
promptCount: evalRecord.prompts.length,
|
|
@@ -11390,19 +11313,18 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11390
11313
|
throw error;
|
|
11391
11314
|
}
|
|
11392
11315
|
const durationMs = Date.now() - startTime;
|
|
11393
|
-
|
|
11316
|
+
logger.debug("Prompt metrics recalculation completed", {
|
|
11394
11317
|
totalBatches: batchNumber,
|
|
11395
11318
|
totalResults: totalProcessed,
|
|
11396
11319
|
durationMs
|
|
11397
11320
|
});
|
|
11398
11321
|
}
|
|
11399
|
-
|
|
11400
11322
|
//#endregion
|
|
11401
11323
|
//#region src/commands/share.ts
|
|
11402
11324
|
function notCloudEnabledShareInstructions() {
|
|
11403
11325
|
const cloudUrl = getDefaultShareViewBaseUrl();
|
|
11404
11326
|
const welcomeUrl = `${cloudUrl}/welcome`;
|
|
11405
|
-
|
|
11327
|
+
logger.info(dedent`
|
|
11406
11328
|
|
|
11407
11329
|
» You need to have a cloud account to securely share your results.
|
|
11408
11330
|
|
|
@@ -11411,10 +11333,7 @@ function notCloudEnabledShareInstructions() {
|
|
|
11411
11333
|
3. Run ${chalk.greenBright.bold("promptfoo share")}
|
|
11412
11334
|
`);
|
|
11413
11335
|
}
|
|
11414
|
-
|
|
11415
|
-
//#endregion
|
|
11416
|
-
//#region src/commands/eval.ts
|
|
11417
|
-
const EvalCommandSchema = CommandLineOptionsSchema.extend({
|
|
11336
|
+
CommandLineOptionsSchema.extend({
|
|
11418
11337
|
help: z.boolean().optional(),
|
|
11419
11338
|
interactiveProviders: z.boolean().optional(),
|
|
11420
11339
|
remote: z.boolean().optional(),
|
|
@@ -11424,7 +11343,7 @@ const EvalCommandSchema = CommandLineOptionsSchema.extend({
|
|
|
11424
11343
|
resume: z.union([z.string(), z.boolean()]).optional()
|
|
11425
11344
|
}).partial();
|
|
11426
11345
|
function showRedteamProviderLabelMissingWarning(testSuite) {
|
|
11427
|
-
if (testSuite.providers.some((p) => !p.label))
|
|
11346
|
+
if (testSuite.providers.some((p) => !p.label)) logger.warn(dedent`
|
|
11428
11347
|
${chalk.bold.yellow("Warning")}: Your target (provider) does not have a label specified.
|
|
11429
11348
|
|
|
11430
11349
|
Labels are used to uniquely identify redteam targets. Please set a meaningful and unique label (e.g., 'helpdesk-search-agent') for your targets/providers in your redteam config.
|
|
@@ -11455,7 +11374,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11455
11374
|
}
|
|
11456
11375
|
const runEvaluation = async (initialization) => {
|
|
11457
11376
|
const startTime = Date.now();
|
|
11458
|
-
|
|
11377
|
+
telemetry.record("command_used", {
|
|
11459
11378
|
name: "eval - started",
|
|
11460
11379
|
watch: Boolean(cmdObj.watch),
|
|
11461
11380
|
...Boolean(config?.redteam) && { isRedteam: true }
|
|
@@ -11476,13 +11395,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11476
11395
|
...defaultConfig,
|
|
11477
11396
|
...dirConfig
|
|
11478
11397
|
};
|
|
11479
|
-
} else
|
|
11398
|
+
} else logger.warn(`No configuration file found in directory: ${configPath}. Looked for promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}. Run "${promptfooCommand("init")}" or pass --config path/to/promptfooconfig.yaml.`);
|
|
11480
11399
|
}
|
|
11481
11400
|
}
|
|
11482
11401
|
const resumeRaw = cmdObj.resume;
|
|
11483
11402
|
const retryErrors = cmdObj.retryErrors;
|
|
11484
11403
|
if (resumeRaw && retryErrors) {
|
|
11485
|
-
|
|
11404
|
+
logger.error(chalk.red("Cannot use --resume and --retry-errors together. Please use one or the other."));
|
|
11486
11405
|
process.exitCode = 1;
|
|
11487
11406
|
return new Eval({}, { persisted: false });
|
|
11488
11407
|
}
|
|
@@ -11490,45 +11409,45 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11490
11409
|
const resumeId = resumeRaw === true || resumeRaw === void 0 ? "latest" : resumeRaw;
|
|
11491
11410
|
if (resumeRaw) {
|
|
11492
11411
|
if (cmdObj.write === false) {
|
|
11493
|
-
|
|
11412
|
+
logger.error(chalk.red("Cannot use --resume with --no-write. Resume functionality requires database persistence."));
|
|
11494
11413
|
process.exitCode = 1;
|
|
11495
11414
|
return new Eval({}, { persisted: false });
|
|
11496
11415
|
}
|
|
11497
11416
|
resumeEval = resumeId === "latest" ? await Eval.latest() : await Eval.findById(resumeId);
|
|
11498
11417
|
if (!resumeEval) {
|
|
11499
|
-
|
|
11418
|
+
logger.error(`Could not find evaluation to resume: ${resumeId}`);
|
|
11500
11419
|
process.exitCode = 1;
|
|
11501
11420
|
return new Eval({}, { persisted: false });
|
|
11502
11421
|
}
|
|
11503
|
-
|
|
11422
|
+
logger.info(chalk.cyan(`Resuming evaluation ${resumeEval.id}...`));
|
|
11504
11423
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11505
11424
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
11506
11425
|
raw: p.raw,
|
|
11507
11426
|
label: p.label,
|
|
11508
11427
|
config: p.config
|
|
11509
11428
|
}));
|
|
11510
|
-
|
|
11429
|
+
state.resume = true;
|
|
11511
11430
|
} else if (retryErrors) {
|
|
11512
11431
|
if (cmdObj.write === false) {
|
|
11513
|
-
|
|
11432
|
+
logger.error(chalk.red("Cannot use --retry-errors with --no-write. Retry functionality requires database persistence."));
|
|
11514
11433
|
process.exitCode = 1;
|
|
11515
11434
|
return new Eval({}, { persisted: false });
|
|
11516
11435
|
}
|
|
11517
|
-
|
|
11436
|
+
logger.info("🔄 Retrying ERROR results from latest evaluation...");
|
|
11518
11437
|
const latestEval = await Eval.latest();
|
|
11519
11438
|
if (!latestEval) {
|
|
11520
|
-
|
|
11439
|
+
logger.error("No previous evaluation found to retry errors from");
|
|
11521
11440
|
process.exitCode = 1;
|
|
11522
11441
|
return new Eval({}, { persisted: false });
|
|
11523
11442
|
}
|
|
11524
11443
|
const errorResultIds = await getErrorResultIds(latestEval.id);
|
|
11525
11444
|
if (errorResultIds.length === 0) {
|
|
11526
|
-
|
|
11445
|
+
logger.info("✅ No ERROR results found in the latest evaluation");
|
|
11527
11446
|
return latestEval;
|
|
11528
11447
|
}
|
|
11529
|
-
|
|
11530
|
-
|
|
11531
|
-
|
|
11448
|
+
logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
|
|
11449
|
+
state._retryErrorResultIds = errorResultIds;
|
|
11450
|
+
logger.info(`🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`);
|
|
11532
11451
|
resumeEval = latestEval;
|
|
11533
11452
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11534
11453
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
@@ -11536,20 +11455,20 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11536
11455
|
label: p.label,
|
|
11537
11456
|
config: p.config
|
|
11538
11457
|
}));
|
|
11539
|
-
|
|
11540
|
-
|
|
11458
|
+
state.resume = true;
|
|
11459
|
+
state.retryMode = true;
|
|
11541
11460
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
11542
11461
|
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
11543
|
-
|
|
11462
|
+
logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
11544
11463
|
setupEnv(commandLineOptions.envPath);
|
|
11545
11464
|
}
|
|
11546
|
-
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0))
|
|
11465
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) logger.warn(chalk.yellow(dedent`
|
|
11547
11466
|
Warning: Config file has a redteam section but no test cases.
|
|
11548
11467
|
Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
|
|
11549
11468
|
`));
|
|
11550
11469
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
11551
11470
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
11552
|
-
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app"))
|
|
11471
|
+
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
11553
11472
|
}
|
|
11554
11473
|
if (config.evaluateOptions) evaluateOptions = {
|
|
11555
11474
|
...evaluateOptions,
|
|
@@ -11563,25 +11482,25 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11563
11482
|
const persisted = resumeEval?.runtimeOptions || config.evaluateOptions || {};
|
|
11564
11483
|
repeat = Number.isSafeInteger(persisted.repeat || 0) && persisted.repeat > 0 ? persisted.repeat : 1;
|
|
11565
11484
|
cache = persisted.cache ?? true;
|
|
11566
|
-
maxConcurrency = persisted.maxConcurrency ??
|
|
11485
|
+
maxConcurrency = persisted.maxConcurrency ?? 4;
|
|
11567
11486
|
delay = persisted.delay ?? 0;
|
|
11568
11487
|
} else {
|
|
11569
11488
|
const iterations = cmdObj.repeat ?? commandLineOptions?.repeat ?? evaluateOptions.repeat ?? NaN;
|
|
11570
11489
|
repeat = Number.isSafeInteger(iterations) && iterations > 0 ? iterations : 1;
|
|
11571
11490
|
cache = cmdObj.cache ?? commandLineOptions?.cache ?? evaluateOptions.cache ?? true;
|
|
11572
|
-
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ??
|
|
11491
|
+
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ?? 4;
|
|
11573
11492
|
delay = cmdObj.delay ?? commandLineOptions?.delay ?? evaluateOptions.delay ?? 0;
|
|
11574
11493
|
}
|
|
11575
11494
|
if (cache === false || repeat > 1) {
|
|
11576
|
-
|
|
11495
|
+
logger.info("Cache is disabled.");
|
|
11577
11496
|
disableCache();
|
|
11578
11497
|
}
|
|
11579
11498
|
const explicitMaxConcurrency = resumeRaw ? (resumeEval?.runtimeOptions)?.maxConcurrency ?? cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency : cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency;
|
|
11580
11499
|
if (delay > 0) {
|
|
11581
11500
|
maxConcurrency = 1;
|
|
11582
|
-
|
|
11583
|
-
|
|
11584
|
-
} else if (explicitMaxConcurrency !== void 0)
|
|
11501
|
+
state.maxConcurrency = 1;
|
|
11502
|
+
logger.info(`Running at concurrency=1 because ${delay}ms delay was requested between API calls`);
|
|
11503
|
+
} else if (explicitMaxConcurrency !== void 0) state.maxConcurrency = explicitMaxConcurrency;
|
|
11585
11504
|
if (!resumeEval) {
|
|
11586
11505
|
const filterOptions = {
|
|
11587
11506
|
failing: cmdObj.filterFailing,
|
|
@@ -11598,17 +11517,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11598
11517
|
let hasValidEmail = false;
|
|
11599
11518
|
while (!hasValidEmail) {
|
|
11600
11519
|
const { emailNeedsValidation } = await promptForEmailUnverified();
|
|
11601
|
-
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
11520
|
+
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
11602
11521
|
}
|
|
11603
11522
|
}
|
|
11604
11523
|
if (!resumeEval) testSuite.providers = filterProviders(testSuite.providers, cmdObj.filterProviders || cmdObj.filterTargets);
|
|
11605
11524
|
const missingApiKeys = checkProviderApiKeys(testSuite.providers);
|
|
11606
11525
|
if (missingApiKeys.size > 0) {
|
|
11607
|
-
for (const [envVar, providerIds] of missingApiKeys)
|
|
11608
|
-
|
|
11609
|
-
|
|
11610
|
-
for (const envVar of missingApiKeys.keys())
|
|
11611
|
-
|
|
11526
|
+
for (const [envVar, providerIds] of missingApiKeys) logger.error(chalk.red(` ✗ Missing ${envVar} (${providerIds.join(", ")})`));
|
|
11527
|
+
logger.error("");
|
|
11528
|
+
logger.error(`To fix, set the environment variable or use ${chalk.bold("--env-file")}:`);
|
|
11529
|
+
for (const envVar of missingApiKeys.keys()) logger.error(` export ${envVar}=your-api-key-here`);
|
|
11530
|
+
logger.error("");
|
|
11612
11531
|
process.exitCode = 1;
|
|
11613
11532
|
return new Eval({}, { persisted: false });
|
|
11614
11533
|
}
|
|
@@ -11625,12 +11544,12 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11625
11544
|
if (typeof testSuite.defaultTest === "string") testSuite.defaultTest = {};
|
|
11626
11545
|
testSuite.defaultTest = testSuite.defaultTest || {};
|
|
11627
11546
|
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
|
|
11628
|
-
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader, { basePath:
|
|
11629
|
-
if (
|
|
11630
|
-
if (typeof
|
|
11631
|
-
|
|
11632
|
-
|
|
11633
|
-
|
|
11547
|
+
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader, { basePath: state.basePath });
|
|
11548
|
+
if (state.config) {
|
|
11549
|
+
if (typeof state.config.defaultTest === "string") state.config.defaultTest = {};
|
|
11550
|
+
state.config.defaultTest = state.config.defaultTest || {};
|
|
11551
|
+
state.config.defaultTest.options = state.config.defaultTest.options || {};
|
|
11552
|
+
state.config.defaultTest.options.provider = testSuite.defaultTest.options.provider;
|
|
11634
11553
|
}
|
|
11635
11554
|
}
|
|
11636
11555
|
if (!resumeEval && cmdObj.var) {
|
|
@@ -11648,7 +11567,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11648
11567
|
}
|
|
11649
11568
|
for (const scenario of testSuite.scenarios || []) if (scenario.tests) scenario.tests = await maybeLoadFromExternalFile(scenario.tests);
|
|
11650
11569
|
const testSuiteSchema = TestSuiteSchema.safeParse(testSuite);
|
|
11651
|
-
if (!testSuiteSchema.success)
|
|
11570
|
+
if (!testSuiteSchema.success) logger.warn(chalk.yellow(dedent`
|
|
11652
11571
|
TestSuite Schema Validation Error:
|
|
11653
11572
|
|
|
11654
11573
|
${z.prettifyError(testSuiteSchema.error)}
|
|
@@ -11681,13 +11600,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11681
11600
|
clearTimeout(forceExitTimeout);
|
|
11682
11601
|
forceExitTimeout = void 0;
|
|
11683
11602
|
}
|
|
11684
|
-
|
|
11603
|
+
logger.warn("Force exiting...");
|
|
11685
11604
|
process.exit(130);
|
|
11686
11605
|
}
|
|
11687
|
-
|
|
11606
|
+
logger.info(chalk.yellow("Pausing evaluation... Press Ctrl+C again to force exit."));
|
|
11688
11607
|
abortController.abort();
|
|
11689
11608
|
forceExitTimeout = setTimeout(() => {
|
|
11690
|
-
|
|
11609
|
+
logger.warn("Evaluation shutdown timed out, force exiting...");
|
|
11691
11610
|
process.exit(130);
|
|
11692
11611
|
}, 1e4).unref();
|
|
11693
11612
|
};
|
|
@@ -11701,27 +11620,27 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11701
11620
|
abortSignal: evaluateOptions.abortSignal,
|
|
11702
11621
|
isRedteam: Boolean(config.redteam)
|
|
11703
11622
|
});
|
|
11704
|
-
if (retryErrors &&
|
|
11705
|
-
const errorResultIds =
|
|
11623
|
+
if (retryErrors && state._retryErrorResultIds && !paused) {
|
|
11624
|
+
const errorResultIds = state._retryErrorResultIds;
|
|
11706
11625
|
try {
|
|
11707
11626
|
await deleteErrorResults(errorResultIds);
|
|
11708
11627
|
await recalculatePromptMetrics(ret);
|
|
11709
|
-
|
|
11628
|
+
logger.debug(`Cleaned up ${errorResultIds.length} old ERROR results after successful retry`);
|
|
11710
11629
|
} catch (cleanupError) {
|
|
11711
|
-
|
|
11630
|
+
logger.warn("Post-retry cleanup had issues. Retry results are saved.", { error: cleanupError });
|
|
11712
11631
|
} finally {
|
|
11713
|
-
delete
|
|
11714
|
-
|
|
11632
|
+
delete state._retryErrorResultIds;
|
|
11633
|
+
state.retryMode = false;
|
|
11715
11634
|
}
|
|
11716
11635
|
}
|
|
11717
11636
|
} finally {
|
|
11718
11637
|
cleanupHandler();
|
|
11719
11638
|
}
|
|
11720
|
-
|
|
11639
|
+
state.resume = false;
|
|
11721
11640
|
if (paused && cmdObj.write !== false) {
|
|
11722
11641
|
printBorder();
|
|
11723
|
-
|
|
11724
|
-
|
|
11642
|
+
logger.info(`${chalk.yellow("⏸")} Evaluation paused. ID: ${chalk.cyan(evalRecord.id)}`);
|
|
11643
|
+
logger.info(`» Resume with: ${chalk.green.bold("promptfoo eval --resume " + evalRecord.id)}`);
|
|
11725
11644
|
printBorder();
|
|
11726
11645
|
return ret;
|
|
11727
11646
|
}
|
|
@@ -11734,8 +11653,8 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11734
11653
|
});
|
|
11735
11654
|
const hasExplicitDisable = cmdObj.share === false || cmdObj.noShare === true || getEnvBool("PROMPTFOO_DISABLE_SHARING");
|
|
11736
11655
|
const canShareEval = isSharingEnabled(evalRecord);
|
|
11737
|
-
|
|
11738
|
-
|
|
11656
|
+
logger.debug(`Wants to share: ${wantsToShare}`);
|
|
11657
|
+
logger.debug(`Can share eval: ${canShareEval}`);
|
|
11739
11658
|
const willShare = wantsToShare && canShareEval;
|
|
11740
11659
|
let sharePromise = null;
|
|
11741
11660
|
if (willShare) sharePromise = createShareableUrl(evalRecord, { silent: true });
|
|
@@ -11754,13 +11673,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11754
11673
|
if (cmdObj.table && getLogLevel() !== "debug" && totalTests < 500) {
|
|
11755
11674
|
const table = await evalRecord.getTable();
|
|
11756
11675
|
const outputTable = generateTable(table);
|
|
11757
|
-
|
|
11676
|
+
logger.info("\n" + outputTable.toString());
|
|
11758
11677
|
if (table.body.length > 25) {
|
|
11759
11678
|
const rowsLeft = table.body.length - 25;
|
|
11760
|
-
|
|
11679
|
+
logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? "" : "s"} not shown ...\n`);
|
|
11761
11680
|
}
|
|
11762
|
-
} else if (failures !== 0)
|
|
11763
|
-
if (totalTests >= 500)
|
|
11681
|
+
} else if (failures !== 0) logger.debug(`At least one evaluation failure occurred. This might be caused by the underlying call to the provider, or a test failure. Context: \n${JSON.stringify(evalRecord.prompts)}`);
|
|
11682
|
+
if (totalTests >= 500) logger.info("Skipping table output because there are more than 500 tests.");
|
|
11764
11683
|
const { outputPath } = config;
|
|
11765
11684
|
const paths = (Array.isArray(outputPath) ? outputPath : [outputPath]).filter((p) => typeof p === "string" && p.length > 0 && !p.endsWith(".jsonl"));
|
|
11766
11685
|
const isRedteam = Boolean(config.redteam);
|
|
@@ -11786,13 +11705,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11786
11705
|
targetErrorStatus
|
|
11787
11706
|
});
|
|
11788
11707
|
if (cmdObj.write && wantsToShare && !canShareEval) {
|
|
11789
|
-
|
|
11708
|
+
logger.info(summaryLines[0]);
|
|
11790
11709
|
notCloudEnabledShareInstructions();
|
|
11791
11710
|
for (let i = 1; i < summaryLines.length; i++) if (summaryLines[i].includes("View results:")) {
|
|
11792
11711
|
while (i < summaryLines.length && !summaryLines[i].includes("Total Tokens:")) i++;
|
|
11793
11712
|
i--;
|
|
11794
|
-
} else
|
|
11795
|
-
} else for (const line of summaryLines)
|
|
11713
|
+
} else logger.info(summaryLines[i]);
|
|
11714
|
+
} else for (const line of summaryLines) logger.info(line);
|
|
11796
11715
|
let shareableUrl = null;
|
|
11797
11716
|
if (sharePromise != null) {
|
|
11798
11717
|
const orgContext = await getOrgContext();
|
|
@@ -11811,24 +11730,24 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11811
11730
|
} else spinner.fail(chalk.red("Share failed"));
|
|
11812
11731
|
} catch (error) {
|
|
11813
11732
|
spinner.fail(chalk.red("Share failed"));
|
|
11814
|
-
|
|
11733
|
+
logger.debug(`Share error: ${error}`);
|
|
11815
11734
|
}
|
|
11816
11735
|
} else try {
|
|
11817
11736
|
shareableUrl = await sharePromise;
|
|
11818
11737
|
if (shareableUrl) {
|
|
11819
11738
|
evalRecord.shared = true;
|
|
11820
|
-
|
|
11739
|
+
logger.info(`${chalk.dim("»")} ${chalk.green("✓")} ${shareableUrl}`);
|
|
11821
11740
|
}
|
|
11822
11741
|
} catch (error) {
|
|
11823
|
-
|
|
11742
|
+
logger.debug(`Share error: ${error}`);
|
|
11824
11743
|
}
|
|
11825
11744
|
}
|
|
11826
|
-
|
|
11745
|
+
logger.debug(`Shareable URL: ${shareableUrl}`);
|
|
11827
11746
|
if (paths.length) {
|
|
11828
11747
|
await writeMultipleOutputs(paths, evalRecord, shareableUrl);
|
|
11829
|
-
|
|
11748
|
+
logger.info(chalk.yellow(`Writing output to ${paths.join(", ")}`));
|
|
11830
11749
|
}
|
|
11831
|
-
|
|
11750
|
+
telemetry.record("command_used", {
|
|
11832
11751
|
name: "eval",
|
|
11833
11752
|
watch: Boolean(cmdObj.watch),
|
|
11834
11753
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
@@ -11838,7 +11757,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11838
11757
|
if (initialization) {
|
|
11839
11758
|
const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean);
|
|
11840
11759
|
if (!configPaths.length) {
|
|
11841
|
-
|
|
11760
|
+
logger.error(`Could not locate config file(s) to watch. Pass --config path/to/promptfooconfig.yaml or run from a directory containing promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}.`);
|
|
11842
11761
|
process.exitCode = 1;
|
|
11843
11762
|
return ret;
|
|
11844
11763
|
}
|
|
@@ -11868,17 +11787,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11868
11787
|
persistent: true
|
|
11869
11788
|
}).on("change", async (path) => {
|
|
11870
11789
|
printBorder();
|
|
11871
|
-
|
|
11790
|
+
logger.info(`File change detected: ${path}`);
|
|
11872
11791
|
printBorder();
|
|
11873
11792
|
clearConfigCache();
|
|
11874
11793
|
await runEvaluation();
|
|
11875
|
-
}).on("error", (error) =>
|
|
11794
|
+
}).on("error", (error) => logger.error(`Watcher error: ${error}`)).on("ready", () => watchPaths.forEach((watchPath) => logger.info(`Watching for file changes on ${watchPath} ...`)));
|
|
11876
11795
|
}
|
|
11877
11796
|
} else {
|
|
11878
11797
|
const passRateThreshold = getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD", 100);
|
|
11879
11798
|
const failedTestExitCode = getEnvInt("PROMPTFOO_FAILED_TEST_EXIT_CODE", 100);
|
|
11880
11799
|
if (passRate < (Number.isFinite(passRateThreshold) ? passRateThreshold : 100)) {
|
|
11881
|
-
if (getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0)
|
|
11800
|
+
if (getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) logger.info(chalk.white(`Pass rate ${chalk.red.bold(passRate.toFixed(2))}${chalk.red("%")} is below the threshold of ${chalk.red.bold(passRateThreshold)}${chalk.red("%")}`));
|
|
11882
11801
|
process.exitCode = Number.isSafeInteger(failedTestExitCode) ? failedTestExitCode : 100;
|
|
11883
11802
|
return ret;
|
|
11884
11803
|
}
|
|
@@ -11894,7 +11813,6 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11894
11813
|
};
|
|
11895
11814
|
return await runEvaluation(true);
|
|
11896
11815
|
}
|
|
11897
|
-
|
|
11898
11816
|
//#endregion
|
|
11899
11817
|
//#region src/util/verboseToggle.ts
|
|
11900
11818
|
let isVerboseToggleEnabled = false;
|
|
@@ -11957,7 +11875,6 @@ function initVerboseToggle() {
|
|
|
11957
11875
|
function disableVerboseToggle() {
|
|
11958
11876
|
if (cleanupFn) cleanupFn();
|
|
11959
11877
|
}
|
|
11960
|
-
|
|
11961
11878
|
//#endregion
|
|
11962
11879
|
//#region src/redteam/shared.ts
|
|
11963
11880
|
async function doRedteamRun(options) {
|
|
@@ -11974,13 +11891,13 @@ async function doRedteamRun(options) {
|
|
|
11974
11891
|
try {
|
|
11975
11892
|
const healthUrl = getRemoteHealthUrl();
|
|
11976
11893
|
if (healthUrl) {
|
|
11977
|
-
|
|
11894
|
+
logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
11978
11895
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
11979
11896
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with redteam: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
11980
|
-
|
|
11897
|
+
logger.debug("API health check passed");
|
|
11981
11898
|
}
|
|
11982
11899
|
} catch (error) {
|
|
11983
|
-
|
|
11900
|
+
logger.warn(`API health check failed with error: ${error}.\nPlease check your API configuration or try again later.`);
|
|
11984
11901
|
}
|
|
11985
11902
|
if (options.liveRedteamConfig) {
|
|
11986
11903
|
const filename = `redteam-${Date.now()}.yaml`;
|
|
@@ -11990,10 +11907,10 @@ async function doRedteamRun(options) {
|
|
|
11990
11907
|
fs$1.writeFileSync(tmpFile, yaml.dump(options.liveRedteamConfig));
|
|
11991
11908
|
redteamPath = tmpFile;
|
|
11992
11909
|
configPath = tmpFile;
|
|
11993
|
-
|
|
11994
|
-
|
|
11910
|
+
logger.debug(`Using live config from ${tmpFile}`);
|
|
11911
|
+
logger.debug(`Live config: ${JSON.stringify(options.liveRedteamConfig, null, 2)}`);
|
|
11995
11912
|
}
|
|
11996
|
-
|
|
11913
|
+
logger.info("Generating test cases...");
|
|
11997
11914
|
const { maxConcurrency, ...passThroughOptions } = options;
|
|
11998
11915
|
let redteamConfig;
|
|
11999
11916
|
const generationStartTime = Date.now();
|
|
@@ -12013,7 +11930,7 @@ async function doRedteamRun(options) {
|
|
|
12013
11930
|
});
|
|
12014
11931
|
} catch (error) {
|
|
12015
11932
|
if (error instanceof PartialGenerationError) {
|
|
12016
|
-
|
|
11933
|
+
logger.error(chalk.red("\n" + error.message));
|
|
12017
11934
|
setLogCallback(null);
|
|
12018
11935
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12019
11936
|
throw error;
|
|
@@ -12022,11 +11939,11 @@ async function doRedteamRun(options) {
|
|
|
12022
11939
|
}
|
|
12023
11940
|
const generationDurationMs = Date.now() - generationStartTime;
|
|
12024
11941
|
if (!redteamConfig || !fs$1.existsSync(redteamPath)) {
|
|
12025
|
-
|
|
11942
|
+
logger.info("No test cases generated. Skipping scan.");
|
|
12026
11943
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12027
11944
|
return;
|
|
12028
11945
|
}
|
|
12029
|
-
|
|
11946
|
+
logger.info("Running scan...");
|
|
12030
11947
|
const { defaultConfig } = await loadDefaultConfig();
|
|
12031
11948
|
const { description: _description, ...evalOptions } = options;
|
|
12032
11949
|
const evalResult = await doEval({
|
|
@@ -12048,16 +11965,15 @@ async function doRedteamRun(options) {
|
|
|
12048
11965
|
if (evalResult.persisted) await evalResult.save();
|
|
12049
11966
|
const totalMs = evalResult.durationMs ?? 0;
|
|
12050
11967
|
const evalMs = evalResult.evaluationDurationMs ?? 0;
|
|
12051
|
-
|
|
11968
|
+
logger.info(chalk.gray(`Total scan time: ${formatDuration(totalMs / 1e3)} (generation: ${formatDuration(generationDurationMs / 1e3)}, evaluation: ${formatDuration(evalMs / 1e3)})`));
|
|
12052
11969
|
}
|
|
12053
|
-
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else
|
|
12054
|
-
if (!evalResult?.shared) if (options.liveRedteamConfig)
|
|
12055
|
-
else
|
|
11970
|
+
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else logger.info(chalk.green("\nRed team scan complete!"));
|
|
11971
|
+
if (!evalResult?.shared) if (options.liveRedteamConfig) logger.info(chalk.blue(`To view the results, click the ${chalk.bold("View Report")} button or run ${chalk.bold(promptfooCommand("redteam report"))} on the command line.`));
|
|
11972
|
+
else logger.info(chalk.blue(`To view the results, run ${chalk.bold(promptfooCommand("redteam report"))}`));
|
|
12056
11973
|
setLogCallback(null);
|
|
12057
11974
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12058
11975
|
return evalResult;
|
|
12059
11976
|
}
|
|
12060
|
-
|
|
12061
11977
|
//#endregion
|
|
12062
11978
|
//#region src/index.ts
|
|
12063
11979
|
async function evaluate(testSuite, options = {}) {
|
|
@@ -12082,23 +11998,23 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12082
11998
|
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
12083
11999
|
if (constructedTestSuite.defaultTest?.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
12084
12000
|
env: testSuite.env,
|
|
12085
|
-
basePath:
|
|
12001
|
+
basePath: state.basePath
|
|
12086
12002
|
});
|
|
12087
12003
|
if (constructedTestSuite.defaultTest?.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
12088
12004
|
env: testSuite.env,
|
|
12089
|
-
basePath:
|
|
12005
|
+
basePath: state.basePath
|
|
12090
12006
|
});
|
|
12091
12007
|
}
|
|
12092
12008
|
for (const test of constructedTestSuite.tests || []) {
|
|
12093
12009
|
if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
|
|
12094
12010
|
env: testSuite.env,
|
|
12095
|
-
basePath:
|
|
12011
|
+
basePath: state.basePath
|
|
12096
12012
|
});
|
|
12097
12013
|
if (test.assert) for (const assertion of test.assert) {
|
|
12098
12014
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
12099
12015
|
if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
|
|
12100
12016
|
env: testSuite.env,
|
|
12101
|
-
basePath:
|
|
12017
|
+
basePath: state.basePath
|
|
12102
12018
|
});
|
|
12103
12019
|
}
|
|
12104
12020
|
}
|
|
@@ -12122,12 +12038,12 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12122
12038
|
if (shareableUrl) {
|
|
12123
12039
|
ret.shareableUrl = shareableUrl;
|
|
12124
12040
|
ret.shared = true;
|
|
12125
|
-
|
|
12041
|
+
logger.debug(`Eval shared successfully: ${shareableUrl}`);
|
|
12126
12042
|
}
|
|
12127
12043
|
} catch (error) {
|
|
12128
|
-
|
|
12044
|
+
logger.warn(`Failed to create shareable URL: ${error}`);
|
|
12129
12045
|
}
|
|
12130
|
-
else
|
|
12046
|
+
else logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
12131
12047
|
if (testSuite.outputPath) {
|
|
12132
12048
|
if (typeof testSuite.outputPath === "string") await writeOutput(testSuite.outputPath, evalRecord, null);
|
|
12133
12049
|
else if (Array.isArray(testSuite.outputPath)) await writeMultipleOutputs(testSuite.outputPath, evalRecord, null);
|
|
@@ -12154,11 +12070,11 @@ var src_default = {
|
|
|
12154
12070
|
assertions: assertions_default,
|
|
12155
12071
|
cache: cache_exports,
|
|
12156
12072
|
evaluate,
|
|
12157
|
-
guardrails
|
|
12073
|
+
guardrails,
|
|
12158
12074
|
loadApiProvider,
|
|
12159
12075
|
redteam
|
|
12160
12076
|
};
|
|
12161
|
-
|
|
12162
12077
|
//#endregion
|
|
12163
|
-
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable,
|
|
12078
|
+
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, loadApiProvider, redteam };
|
|
12079
|
+
|
|
12164
12080
|
//# sourceMappingURL=index.js.map
|