promptfoo 0.120.27 → 0.121.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/src/{ListApp-8WOe2nT6.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-B0pgC1oV.js +206 -0
- package/dist/src/{accounts-DVINui-2.js → accounts-Bm2D8Db9.js} +39 -34
- package/dist/src/{accounts-CPDRAMND.js → accounts-CiBLOnA7.js} +38 -33
- package/dist/src/{accounts-Fl2J3_Fu.cjs → accounts-gtkH-5KX.cjs} +77 -78
- package/dist/src/{agentic-utils-D922n6mm.js → agentic-utils-DS1g3GLF.js} +9 -10
- package/dist/src/{agents-BcsN_BgB.js → agents-9qiOy0ho.js} +16 -12
- package/dist/src/{agents-BXLmVsxR.js → agents-CBr9A01V.js} +37 -37
- package/dist/src/{agents-pMfppv9Z.js → agents-CmvBq8LV.js} +16 -18
- package/dist/src/{agents-hqgSV-3o.js → agents-D__IdAlg.js} +39 -40
- package/dist/src/{agents-BO2n8Z0d.cjs → agents-DbRtpYxR.cjs} +37 -40
- package/dist/src/{agents-BdUTAwi-.js → agents-DgF2zDag.js} +37 -42
- package/dist/src/{agents-DgJf2-ez.cjs → agents-Di9DKPzn.cjs} +16 -17
- package/dist/src/{agents-DNvSH78i.js → agents-cLXA8a_8.js} +17 -19
- package/dist/src/{aimlapi-DtgPI0nE.js → aimlapi-B4rcnZgv.js} +15 -17
- package/dist/src/{aimlapi-BE_Tg9Fl.cjs → aimlapi-BvlNH0gr.cjs} +15 -16
- package/dist/src/{aimlapi-DOib86oE.js → aimlapi-CnkC2HqE.js} +16 -18
- package/dist/src/{aimlapi-DTPACCB1.js → aimlapi-DHJU_kcV.js} +15 -4
- package/dist/src/app/assets/index-4LKxG2CG.js +439 -0
- package/dist/src/app/assets/{index-NCn4eVBv.css → index-C3zcsZFQ.css} +1 -1
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/index.html +3 -3
- package/dist/src/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/{audio-BnRUGAm_.js → audio-Bkv46et0.js} +6 -5
- package/dist/src/{audio-Cwo68yZS.cjs → audio-CGMyULza.cjs} +6 -7
- package/dist/src/{audio-MSRki4JU.js → audio-ClI_AFre.js} +6 -8
- package/dist/src/{audio-BRYU0BFo.js → audio-Dz3z7s3J.js} +7 -9
- package/dist/src/{base-pGVmXNl4.cjs → base-CGrhspbK.cjs} +36 -38
- package/dist/src/{base-h961VXYk.js → base-CpjcHe4e.js} +11 -13
- package/dist/src/base-DLKtKMFh.js +193 -0
- package/dist/src/{base-XB2tDJrB.js → base-Dy1V8--Z.js} +11 -13
- package/dist/src/blobs-BDbfYdrJ.js +236 -0
- package/dist/src/{blobs-CR5C4Ihh.js → blobs-CBO20krR.js} +9 -12
- package/dist/src/{blobs-BM_e6hCa.js → blobs-CMHN0Qcz.js} +9 -12
- package/dist/src/{blobs-B-KQAFhX.cjs → blobs-D23XLin-.cjs} +34 -37
- package/dist/src/{cache-jsiwsAJv.js → cache-BVeDlD87.js} +132 -117
- package/dist/src/{cache-CIpsoBZR.js → cache-C4Nxf52C.js} +132 -118
- package/dist/src/cache-CeUpFm3M.cjs +5 -0
- package/dist/src/{cache-BTVYfbka.cjs → cache-Dh5WtQps.cjs} +182 -168
- package/dist/src/cache-i1P6crbO.js +756 -0
- package/dist/src/cache-n-RCJ-hL.js +6 -0
- package/dist/src/{chat-BcPjZXIp.js → chat-BiKyneZl.js} +45 -46
- package/dist/src/{chat-D31K7C4u.cjs → chat-C1Qst7jL.cjs} +20 -21
- package/dist/src/{chat-B84t99NW.js → chat-C2jrdPMx.js} +20 -9
- package/dist/src/{chat-BE44YOc6.cjs → chat-CgF-J-Jj.cjs} +65 -66
- package/dist/src/{chat-DwWifjxi.js → chat-CzkrVDfz.js} +20 -22
- package/dist/src/chat-DJIw17u0.js +766 -0
- package/dist/src/{chat-CcUCysjU.js → chat-DqxYYtWA.js} +45 -46
- package/dist/src/{chat-DZM2GUHO.js → chat-qmatte1u.js} +21 -23
- package/dist/src/{chatkit-D67HS_0b.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DAB_qfzI.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-Biqb_wsD.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-PGG4ZYIn.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-SVM6AdBu.js → claude-agent-sdk-Apiy0iaz.js} +31 -31
- package/dist/src/{claude-agent-sdk-C-IOTPfo.js → claude-agent-sdk-D2bJee9S.js} +31 -29
- package/dist/src/{claude-agent-sdk-C9SiaQub.cjs → claude-agent-sdk-D9Z5Pr9X.cjs} +31 -28
- package/dist/src/{claude-agent-sdk-CiluSyW1.js → claude-agent-sdk-DfCoW0E6.js} +33 -20
- package/dist/src/cloud-BBh91EUK.js +4 -0
- package/dist/src/{cloud-CZ-q9Ier.js → cloud-C0dlstV_.js} +7 -9
- package/dist/src/{cloudflare-ai-BahKHyhh.js → cloudflare-ai-8TDxHR0x.js} +16 -18
- package/dist/src/{cloudflare-ai-v_qZD6_q.js → cloudflare-ai-BxAGvfju.js} +17 -19
- package/dist/src/{cloudflare-ai-Dfahv5SY.cjs → cloudflare-ai-CknbZ5LJ.cjs} +16 -17
- package/dist/src/{cloudflare-ai-Dxyt50Nl.js → cloudflare-ai-g7PB6VHR.js} +16 -4
- package/dist/src/{cloudflare-gateway-Bi_FpOFy.js → cloudflare-gateway-B9HWA5wf.js} +23 -23
- package/dist/src/{cloudflare-gateway-BPWoZIzJ.cjs → cloudflare-gateway-BSnDmHYo.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-C0guUNwk.js → cloudflare-gateway-CKDb4dJ8.js} +26 -14
- package/dist/src/{cloudflare-gateway-btS7h1OZ.js → cloudflare-gateway-CP9QEWYS.js} +21 -25
- package/dist/src/{codex-sdk-DSxAnbfT.js → codex-sdk-C6UMlxwV.js} +28 -29
- package/dist/src/{codex-sdk-IYVi9fuM.js → codex-sdk-DUwKWezN.js} +28 -27
- package/dist/src/{codex-sdk-DulY0ZRq.js → codex-sdk-GGAw0qbD.js} +28 -29
- package/dist/src/{codex-sdk-DFKMtAyf.cjs → codex-sdk-fAO0c3yA.cjs} +28 -29
- package/dist/src/{cometapi-DzrR3SR_.js → cometapi-BL9yvj_f.js} +16 -4
- package/dist/src/{cometapi-DIO64tf4.cjs → cometapi-C4xSqeID.cjs} +21 -22
- package/dist/src/{cometapi-C9EEpJzT.js → cometapi-CUQq3H_a.js} +21 -24
- package/dist/src/{cometapi-DkNBMk0G.js → cometapi-DFNiKmSz.js} +17 -19
- package/dist/src/{completion-CG29bfKX.js → completion-5MzrpJxT.js} +11 -13
- package/dist/src/{completion-CCRT4kX1.cjs → completion-CM6oK8PS.cjs} +21 -23
- package/dist/src/{completion-Bgf1VJoq.js → completion-DZ083F31.js} +11 -13
- package/dist/src/completion-qRoZAYRB.js +120 -0
- package/dist/src/{createHash-Dw_iLu31.js → createHash-CTQmL3G2.js} +2 -3
- package/dist/src/{createHash-CYQy4YeL.cjs → createHash-CfZSc0b4.cjs} +13 -14
- package/dist/src/{createHash-CJcfskIZ.js → createHash-Da8fMwqB.js} +2 -3
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-D-ayp2FW.js → docker-Bb5dcxr8.js} +18 -20
- package/dist/src/{docker-B81N0t4e.js → docker-BvfL2BrW.js} +19 -21
- package/dist/src/{docker-DNcLR4Ig.cjs → docker-DcF2pRrj.cjs} +18 -19
- package/dist/src/{docker-egERKxCF.js → docker-ExVyLp0S.js} +18 -7
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-B9dPm_BF.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-D2pZ87fL.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-Ct-Joyue.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-B3r2CVXr.js +15 -0
- package/dist/src/{eval-C-Nr6wX_.js → eval-Dg2nG4v2.js} +47 -54
- package/dist/src/evalResult-5xwYnECe.js +12 -0
- package/dist/src/evalResult-71lY93Kj.cjs +10 -0
- package/dist/src/{evalResult-DXMWJ3sx.js → evalResult-BBRNtX4I.js} +10 -11
- package/dist/src/{evalResult-4BzI2tmj.js → evalResult-BDMqrapS.js} +16 -12
- package/dist/src/evalResult-Dx5P5cIv.js +10 -0
- package/dist/src/{evalResult-CX8wQecI.cjs → evalResult-fuaI8HkH.cjs} +20 -21
- package/dist/src/{evaluator-8aGyV12L.js → evaluator-BhoWwp5b.js} +211 -235
- package/dist/src/evaluator-Jx6bRZV6.js +36 -0
- package/dist/src/{extractor-V5x_m1i0.js → extractor-C0EVHewb.js} +22 -24
- package/dist/src/extractor-D25qpmGX.js +374 -0
- package/dist/src/{extractor-CD5yKL-G.js → extractor-DReVID0K.js} +22 -24
- package/dist/src/{extractor-C031XmTA.cjs → extractor-pYLLi3wS.cjs} +37 -39
- package/dist/src/{fetch-BmbD-v1L.cjs → fetch-BPkYtG8K.cjs} +244 -277
- package/dist/src/fetch-BxNb_Lp3.js +5 -0
- package/dist/src/{fetch-D3OHf-lV.js → fetch-Cwxnd8zz.js} +36 -44
- package/dist/src/{fetch-CXZI9RRr.js → fetch-Dxpd4_sr.js} +23 -35
- package/dist/src/fetch-HaqdX7U1.js +780 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-CdevI3An.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-Dres3qrN.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/{graders--1y2u9HO.js → graders-BTeBGqjJ.js} +349 -397
- package/dist/src/graders-B_pgMLS2.js +34 -0
- package/dist/src/{graders-DTeBrzWp.js → graders-Bj_Odv7c.js} +349 -397
- package/dist/src/graders-DErokPDO.cjs +32 -0
- package/dist/src/graders-DP7KFFo-.js +13466 -0
- package/dist/src/graders-DR_uNe54.js +32 -0
- package/dist/src/{graders-DohM2dir.cjs → graders-DU49_J8Y.cjs} +684 -732
- package/dist/src/graders-w3176Wz-.js +32 -0
- package/dist/src/{image-B0U4Hqll.js → image-B02ogr_b.js} +7 -9
- package/dist/src/{image-DmE-niFE.js → image-B0h9VEMc.js} +6 -5
- package/dist/src/{image-CuKHuccK.cjs → image-BLmROtN3.cjs} +29 -30
- package/dist/src/{image-DNEIf_aI.js → image-Bb4vWQLM.js} +6 -8
- package/dist/src/{image-DpKl2F15.cjs → image-C1madmKh.cjs} +6 -7
- package/dist/src/{image-C3wHC9_h.js → image-CHfWvljl.js} +9 -10
- package/dist/src/{image-O1u4bCFg.js → image-DS-o-0ph.js} +9 -10
- package/dist/src/image-Dpxa1Jt6.js +257 -0
- package/dist/src/index.cjs +615 -695
- package/dist/src/index.d.cts +271 -7
- package/dist/src/index.d.ts +271 -3
- package/dist/src/index.js +580 -664
- package/dist/src/{interactiveCheck-Bxj1Swex.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-CEzQobWX.js → knowledgeBase-B3OoKIej.js} +14 -9
- package/dist/src/{knowledgeBase-Be_zyW4L.js → knowledgeBase-CYTLHOt1.js} +16 -16
- package/dist/src/{knowledgeBase-BZ41IFwq.js → knowledgeBase-D33Ty2l6.js} +14 -18
- package/dist/src/{knowledgeBase-D-5BMXlr.cjs → knowledgeBase-DOO_BM9b.cjs} +14 -15
- package/dist/src/{litellm-DnbRJ2if.js → litellm-AaeZcZQF.js} +18 -19
- package/dist/src/{litellm-hUSNM_M2.cjs → litellm-I_hbp_dc.cjs} +17 -17
- package/dist/src/{litellm-CRDqPhNI.js → litellm-NbjknEh6.js} +17 -18
- package/dist/src/{litellm-9vR8zpfU.js → litellm-TrljxD9G.js} +17 -5
- package/dist/src/{logger-CG1uZPbQ.js → logger-CT3IKMKA.js} +10 -29
- package/dist/src/{logger-B7sBeGa0.cjs → logger-Cp1GPUjj.cjs} +152 -180
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-LSBxlt7a.js → logger-KkObSCzq.js} +13 -31
- package/dist/src/{luma-ray-4blv9iZ2.js → luma-ray-BS2_tY8L.js} +22 -21
- package/dist/src/{luma-ray-drvgdpP9.js → luma-ray-DDsjcgZZ.js} +20 -13
- package/dist/src/{luma-ray-Hm3d6VJE.cjs → luma-ray-Due0n7di.cjs} +20 -21
- package/dist/src/{luma-ray-B2__8lYH.js → luma-ray-f6I2fft-.js} +20 -23
- package/dist/src/main.js +1170 -1321
- package/dist/src/{messages-Uee41Mj5.js → messages-BS17jdMx.js} +22 -24
- package/dist/src/{messages-XhiwCbi4.cjs → messages-Bs1kC7P4.cjs} +32 -34
- package/dist/src/{messages-CGPPidQr.js → messages-D0lx5qK7.js} +22 -24
- package/dist/src/messages-ZJk778GH.js +240 -0
- package/dist/src/{meteor-BYykdXrV.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-CsopaHrH.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-e-E-2vVl.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-C8lGP6P4.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/{modelslab-yKz-ZNB4.js → modelslab-Bmni6skY.js} +17 -10
- package/dist/src/{modelslab-E9gO-bYd.js → modelslab-Bx9IrZfS.js} +18 -20
- package/dist/src/{modelslab-lUVW0cmB.cjs → modelslab-CoUX6Jc_.cjs} +17 -18
- package/dist/src/{modelslab-ClBkr8_9.js → modelslab-DRb74SP4.js} +17 -19
- package/dist/src/{nova-reel-Dk8jNpId.js → nova-reel-BfPq-0Yk.js} +20 -13
- package/dist/src/{nova-reel-D8CuO6QH.cjs → nova-reel-C_QM18Xn.cjs} +20 -21
- package/dist/src/{nova-reel-u2eF2Cxm.js → nova-reel-D_W1tjMH.js} +22 -21
- package/dist/src/{nova-reel-P9bwvtYX.js → nova-reel-bgjxilYW.js} +20 -23
- package/dist/src/{nova-sonic-CK2rAiKi.js → nova-sonic-CFb5GYhg.js} +30 -26
- package/dist/src/{nova-sonic-BaqWlkds.js → nova-sonic-DIGQNR07.js} +30 -31
- package/dist/src/{nova-sonic-yZapPLv7.js → nova-sonic-De1HW5fD.js} +31 -32
- package/dist/src/{nova-sonic-Ds1C-dpm.cjs → nova-sonic-zfcljeRp.cjs} +30 -31
- package/dist/src/{openai-DUFopMrH.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-PblZ3jUE.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CcN1B8Sb.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-B6qqDr_u.cjs → openclaw-CSugPYAr.cjs} +188 -130
- package/dist/src/{openclaw-A-3_loM7.js → openclaw-DiSz3I5L.js} +180 -109
- package/dist/src/{openclaw-a3lylB-V.js → openclaw-DuvJKEW5.js} +178 -124
- package/dist/src/{openclaw-COn6QzDi.js → openclaw-tiVYRtr-.js} +178 -122
- package/dist/src/opencode-sdk-0j6rTWNb.js +562 -0
- package/dist/src/opencode-sdk-B3CWY9h_.js +560 -0
- package/dist/src/opencode-sdk-BL764Jdi.cjs +564 -0
- package/dist/src/opencode-sdk-C2y6UkP2.js +560 -0
- package/dist/src/{otlpReceiver-oyf5wLGC.js → otlpReceiver-C99PPb48.js} +53 -51
- package/dist/src/{otlpReceiver-lXsYVbpj.cjs → otlpReceiver-CGq6LspY.cjs} +53 -55
- package/dist/src/{otlpReceiver-94URx7UW.js → otlpReceiver-CdNBdbsk.js} +53 -55
- package/dist/src/{otlpReceiver-BmmTiMjA.js → otlpReceiver-D89fR-rC.js} +53 -55
- package/dist/src/{providerRegistry-Cq_JK_CJ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-DSSHjMKf.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-CvHEVJad.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B7V0njNs.js +32 -0
- package/dist/src/providers-BEwbhv0X.js +30 -0
- package/dist/src/{providers-Iil64vk9.js → providers-BlqUifFg.js} +1543 -1676
- package/dist/src/providers-CH3C7zf7.js +30 -0
- package/dist/src/{providers-DHbjzW2e.cjs → providers-CgKOSgTR.cjs} +1896 -2029
- package/dist/src/providers-D8lF1sqW.js +33246 -0
- package/dist/src/{providers-BnFpbY_s.js → providers-Dk_6ocUX.js} +1536 -1669
- package/dist/src/providers-zyB6k_38.cjs +31 -0
- package/dist/src/{pythonUtils-CcT5LH1M.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DBbuI3QJ.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-hZ8LeQLv.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/{quiverai-BuI0tE39.js → quiverai-BbOUOn2L.js} +8 -7
- package/dist/src/{quiverai-DCGSZt4U.js → quiverai-CIaELU_m.js} +8 -10
- package/dist/src/{quiverai-DiMVJQDz.cjs → quiverai-PdShCPox.cjs} +8 -9
- package/dist/src/{quiverai-fQNkExW4.js → quiverai-uH-dcTIr.js} +9 -11
- package/dist/src/{render-Dj1smHEb.js → render-Drod8m7K.js} +4 -5
- package/dist/src/responses-CB2jwoAr.js +660 -0
- package/dist/src/{responses-ghR3IOfy.cjs → responses-D8SBTL64.cjs} +39 -42
- package/dist/src/{responses-DOAFFENS.js → responses-DIR9Ud3j.js} +24 -27
- package/dist/src/{responses-CxzoQoBe.js → responses-WNGNYe3K.js} +24 -27
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-CwbGmgYN.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-DudlFZed.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-C8MhKGHb.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-gmskuyre.js → sagemaker-CNBxx5CJ.js} +75 -70
- package/dist/src/{sagemaker-CcxhlOAR.js → sagemaker-CemTFp2h.js} +75 -79
- package/dist/src/{sagemaker-77zbJ2Q2.cjs → sagemaker-Cl28mZU2.cjs} +75 -76
- package/dist/src/{sagemaker-DuM71dVU.js → sagemaker-YSyBXQQh.js} +77 -77
- package/dist/src/{scanner-DJYiSXQj.js → scanner-BsBlNXNn.js} +100 -121
- package/dist/src/server/index.js +5520 -67427
- package/dist/src/{server-B5v33lvE.cjs → server-C_7Ax-hA.cjs} +57 -67
- package/dist/src/{server-BJ4m4f1D.js → server-CqzrVGpF.js} +26 -29
- package/dist/src/server-CuxBbeSY.js +229 -0
- package/dist/src/server-DA4Cyrrq.js +7 -0
- package/dist/src/server-Dulb-4-K.cjs +5 -0
- package/dist/src/{server-RV_i_YX5.js → server-VWgWb00X.js} +19 -24
- package/dist/src/{signal-BW33JuId.js → signal-4U3mfRvL.js} +9 -11
- package/dist/src/{slack-DEURelTy.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-BQYeW9L3.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-BB6yuZzp.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-2pRrhhgJ.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-CXGFv4aR.js +228 -0
- package/dist/src/store-CXS-Q_91.js +6 -0
- package/dist/src/{store-D7CgQzAR.cjs → store-DLlFCC4h.cjs} +44 -45
- package/dist/src/{store-DJNsD1iC.js → store-DXilxTl-.js} +40 -36
- package/dist/src/{store-s3SftUwF.js → store-Dim__MDd.js} +34 -35
- package/dist/src/store-eYkaKMwq.cjs +5 -0
- package/dist/src/{tables-DfTsNN7X.js → tables-6YKwjN9-.js} +19 -21
- package/dist/src/tables-DLJPUdUE.js +288 -0
- package/dist/src/{tables-BKTmd6u7.cjs → tables-DPi7wKeM.cjs} +89 -91
- package/dist/src/{tables-DMegD0Xf.js → tables-gftXzE9I.js} +21 -23
- package/dist/src/telemetry-BpMfhthR.cjs +5 -0
- package/dist/src/{telemetry--WAdAfVi.js → telemetry-CMrFgtPB.js} +11 -13
- package/dist/src/telemetry-Cps3mIU-.js +171 -0
- package/dist/src/{telemetry-DQgVBCAb.cjs → telemetry-DaX14Chu.cjs} +21 -24
- package/dist/src/{telemetry-BedSm-bZ.js → telemetry-Dthj_BbD.js} +17 -14
- package/dist/src/telemetry-Dw38hanS.js +7 -0
- package/dist/src/{text-oiSbwSOI.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-oKzCBnK6.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-B_IrO4GZ.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-mYS9vd5v.js → transcription-BNYURcXg.js} +14 -7
- package/dist/src/{transcription-X2-B4vkX.js → transcription-B_OdaHp7.js} +14 -16
- package/dist/src/{transcription-BO1AHegO.cjs → transcription-NLVG9MT1.cjs} +14 -15
- package/dist/src/{transcription-lzBLiTFJ.js → transcription-s6A-bNrZ.js} +15 -17
- package/dist/src/{transform-B1Hi5lWS.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-DeGlxb0D.js → transform-D5HsjduX.js} +39 -47
- package/dist/src/{transform-CYDILYDe.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-BEgStbHK.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/{transform-D5PjiWiZ.cjs → transform-DuHvhZpj.cjs} +179 -187
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/transform-uAytVuyX.js +1506 -0
- package/dist/src/{transform-Dfl89yi4.js → transform-vNucnNr0.js} +39 -47
- package/dist/src/{transformersAvailability-SZnTS3pJ.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-D-glmEy7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-CjeFXhuJ.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-DWNf48sT.cjs → types-C_7nyzr1.cjs} +538 -574
- package/dist/src/{types-CXQduE9o.js → types-Cbd8uOMq.js} +68 -100
- package/dist/src/types-CzW2QFyi.js +3288 -0
- package/dist/src/{types-C5hEkb-x.js → types-DmyIJ-sR.js} +63 -99
- package/dist/src/{util-CoQjmE3u.js → util-B3xGByQh.js} +4 -5
- package/dist/src/{util-aLhtl3fe.cjs → util-B9vlHIIh.cjs} +208 -223
- package/dist/src/{util-Du96oyYS.js → util-BHGHw5G1.js} +4 -5
- package/dist/src/{util-DQ984syk.js → util-BRYkYPTd.js} +36 -51
- package/dist/src/{util-D9eLdGfa.js → util-BV4XUC0n.js} +5 -6
- package/dist/src/util-Bv6uGDfH.js +293 -0
- package/dist/src/{util-1wWM599Z.cjs → util-BzMcevZc.cjs} +50 -51
- package/dist/src/{util-_h4pVqrz.js → util-C1CeHl-P.js} +36 -51
- package/dist/src/{util-Bm_-UMD_.js → util-CMy69ZgQ.js} +5 -6
- package/dist/src/{util-CyUdMzV0.cjs → util-DGNOS1db.cjs} +34 -35
- package/dist/src/util-Dnmk2mBQ.js +599 -0
- package/dist/src/util-ZzmqNPlg.js +1426 -0
- package/dist/src/{utils-BjLy-Q72.cjs → utils-Cz9qXqII.cjs} +29 -32
- package/dist/src/{utils-CFMn2yHW.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-DvWMzuMx.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-B2D0bCSI.js +0 -439
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/cache-ChPcurj7.js +0 -6
- package/dist/src/cache-VVu_W-yg.js +0 -8
- package/dist/src/cache-YLNCFEM2.cjs +0 -6
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-BndfXy4H.js +0 -5
- package/dist/src/eval-BhHvMY82.js +0 -17
- package/dist/src/evalResult-Dq2gFNQY.js +0 -12
- package/dist/src/evalResult-nmcP5VKH.cjs +0 -12
- package/dist/src/evalResult-trqZjVYh.js +0 -14
- package/dist/src/evaluator-CnfPstzT.js +0 -39
- package/dist/src/fetch-IDPDue6F.cjs +0 -4
- package/dist/src/fetch-hKJ-It8q.js +0 -6
- package/dist/src/fetch-ouKnrWK-.js +0 -4
- package/dist/src/graders-CQn7WUsd.cjs +0 -34
- package/dist/src/graders-DC6QAbpW.js +0 -35
- package/dist/src/graders-DUWz3Y7j.js +0 -37
- package/dist/src/opencode-sdk-4bL9n-Gk.js +0 -382
- package/dist/src/opencode-sdk-BfC2zWcR.js +0 -376
- package/dist/src/opencode-sdk-DMJyuwMg.js +0 -380
- package/dist/src/opencode-sdk-Da-9adza.cjs +0 -383
- package/dist/src/providers-CsXB2Ix-.js +0 -35
- package/dist/src/providers-DO8ltjLC.js +0 -33
- package/dist/src/providers-Dtq-xnXd.cjs +0 -33
- package/dist/src/rubyUtils-BUbcND2f.js +0 -6
- package/dist/src/rubyUtils-Cr55X_KE.js +0 -5
- package/dist/src/rubyUtils-DlIiqoYo.cjs +0 -5
- package/dist/src/server-C2eQH4Gu.js +0 -6
- package/dist/src/server-CXWycu7H.cjs +0 -6
- package/dist/src/server-Q6OGlxxT.js +0 -8
- package/dist/src/store-B3EDO9Q3.js +0 -7
- package/dist/src/store-Dl9F8aw5.js +0 -6
- package/dist/src/store-SnrGrlt9.cjs +0 -6
- package/dist/src/telemetry-BGhiPZtl.js +0 -8
- package/dist/src/telemetry-CFfiYan6.cjs +0 -6
- package/dist/src/telemetry-DHzEduxX.js +0 -6
- package/dist/src/transform-C1x1ZlMQ.cjs +0 -6
- package/dist/src/transform-DYHjFmQu.js +0 -8
- package/dist/src/transform-rmwJT5JQ.js +0 -7
- package/dist/src/transformersAvailability-eJooj0gX.js +0 -35
package/dist/src/index.cjs
CHANGED
|
@@ -1,41 +1,44 @@
|
|
|
1
|
-
Object.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
const
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const
|
|
16
|
-
require(
|
|
17
|
-
const
|
|
18
|
-
const
|
|
19
|
-
|
|
20
|
-
require(
|
|
21
|
-
require(
|
|
22
|
-
require(
|
|
23
|
-
require(
|
|
24
|
-
|
|
25
|
-
require(
|
|
26
|
-
|
|
27
|
-
const
|
|
28
|
-
|
|
29
|
-
const
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
const
|
|
33
|
-
const
|
|
34
|
-
require(
|
|
35
|
-
require(
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
1
|
+
Object.defineProperties(exports, {
|
|
2
|
+
__esModule: { value: true },
|
|
3
|
+
[Symbol.toStringTag]: { value: "Module" }
|
|
4
|
+
});
|
|
5
|
+
const require_logger = require("./logger-Cp1GPUjj.cjs");
|
|
6
|
+
const require_invariant = require("./invariant-kfQ8Bu82.cjs");
|
|
7
|
+
const require_esm = require("./esm-CnNt7sI4.cjs");
|
|
8
|
+
const require_pythonUtils = require("./pythonUtils-CTU3Y3lw.cjs");
|
|
9
|
+
const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
|
|
10
|
+
const require_transform = require("./transform-CzK1Q0zl.cjs");
|
|
11
|
+
const require_graders = require("./graders-DU49_J8Y.cjs");
|
|
12
|
+
const require_types = require("./types-C_7nyzr1.cjs");
|
|
13
|
+
const require_util = require("./util-B9vlHIIh.cjs");
|
|
14
|
+
const require_fetch = require("./fetch-BPkYtG8K.cjs");
|
|
15
|
+
const require_cache = require("./cache-Dh5WtQps.cjs");
|
|
16
|
+
const require_providers = require("./providers-CgKOSgTR.cjs");
|
|
17
|
+
const require_utils = require("./utils-Cz9qXqII.cjs");
|
|
18
|
+
const require_createHash = require("./createHash-CfZSc0b4.cjs");
|
|
19
|
+
require("./genaiTracer-DN4dQywX.cjs");
|
|
20
|
+
const require_chat = require("./chat-CgF-J-Jj.cjs");
|
|
21
|
+
const require_tokenUsageUtils = require("./tokenUsageUtils-bVa1ga6f.cjs");
|
|
22
|
+
const require_transform$1 = require("./transform-DuHvhZpj.cjs");
|
|
23
|
+
require("./messages-Bs1kC7P4.cjs");
|
|
24
|
+
require("./util-DGNOS1db.cjs");
|
|
25
|
+
require("./responses-D8SBTL64.cjs");
|
|
26
|
+
require("./openai-Cuif0GEt.cjs");
|
|
27
|
+
const require_util$2 = require("./util-BzMcevZc.cjs");
|
|
28
|
+
require("./completion-CM6oK8PS.cjs");
|
|
29
|
+
const require_accounts = require("./accounts-gtkH-5KX.cjs");
|
|
30
|
+
const require_server = require("./server-C_7Ax-hA.cjs");
|
|
31
|
+
const require_blobs = require("./blobs-D23XLin-.cjs");
|
|
32
|
+
const require_tables = require("./tables-DPi7wKeM.cjs");
|
|
33
|
+
const require_extractor = require("./extractor-pYLLi3wS.cjs");
|
|
34
|
+
const require_telemetry = require("./telemetry-DaX14Chu.cjs");
|
|
35
|
+
const require_text = require("./text-CW1cyrwj.cjs");
|
|
36
|
+
const require_store = require("./store-DLlFCC4h.cjs");
|
|
37
|
+
require("./base-CGrhspbK.cjs");
|
|
38
|
+
require("./image-BLmROtN3.cjs");
|
|
39
|
+
const require_providerRegistry = require("./providerRegistry-Civky8Ar.cjs");
|
|
40
|
+
const require_rubyUtils = require("./rubyUtils-DhCAlxZr.cjs");
|
|
41
|
+
const require_evalResult = require("./evalResult-fuaI8HkH.cjs");
|
|
39
42
|
let fs = require("fs");
|
|
40
43
|
fs = require_logger.__toESM(fs);
|
|
41
44
|
let path = require("path");
|
|
@@ -44,8 +47,8 @@ let async = require("async");
|
|
|
44
47
|
async = require_logger.__toESM(async);
|
|
45
48
|
let js_yaml = require("js-yaml");
|
|
46
49
|
js_yaml = require_logger.__toESM(js_yaml);
|
|
47
|
-
|
|
48
|
-
|
|
50
|
+
require("node:path");
|
|
51
|
+
require("node:url");
|
|
49
52
|
let chalk = require("chalk");
|
|
50
53
|
chalk = require_logger.__toESM(chalk);
|
|
51
54
|
let os = require("os");
|
|
@@ -91,7 +94,6 @@ let ora = require("ora");
|
|
|
91
94
|
ora = require_logger.__toESM(ora);
|
|
92
95
|
let url = require("url");
|
|
93
96
|
require("@inquirer/confirm");
|
|
94
|
-
|
|
95
97
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
96
98
|
var ConversationRelevancyTemplate = class {
|
|
97
99
|
static generateVerdicts(slidingWindow) {
|
|
@@ -163,7 +165,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
163
165
|
JSON:`;
|
|
164
166
|
}
|
|
165
167
|
};
|
|
166
|
-
|
|
167
168
|
//#endregion
|
|
168
169
|
//#region src/external/matchers/deepeval.ts
|
|
169
170
|
const nunjucks$1 = require_util.getNunjucksEngine(void 0, false, true);
|
|
@@ -213,7 +214,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
213
214
|
return require_graders.fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
214
215
|
}
|
|
215
216
|
}
|
|
216
|
-
|
|
217
217
|
//#endregion
|
|
218
218
|
//#region src/external/assertions/deepeval.ts
|
|
219
219
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -268,7 +268,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
268
268
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
269
269
|
};
|
|
270
270
|
};
|
|
271
|
-
|
|
272
271
|
//#endregion
|
|
273
272
|
//#region src/tracing/evaluatorTracing.ts
|
|
274
273
|
let otlpReceiverStarted = false;
|
|
@@ -301,28 +300,28 @@ function isOtlpReceiverStarted() {
|
|
|
301
300
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
302
301
|
*/
|
|
303
302
|
async function startOtlpReceiverIfNeeded(testSuite) {
|
|
304
|
-
require_logger.
|
|
305
|
-
require_logger.
|
|
306
|
-
require_logger.
|
|
303
|
+
require_logger.logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(testSuite.tracing)}`);
|
|
304
|
+
require_logger.logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
|
|
305
|
+
require_logger.logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(testSuite.tracing, null, 2)}`);
|
|
307
306
|
if (testSuite.tracing?.enabled && testSuite.tracing?.otlp?.http?.enabled && !otlpReceiverStarted) {
|
|
308
|
-
require_telemetry.
|
|
307
|
+
require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
|
|
309
308
|
try {
|
|
310
|
-
require_logger.
|
|
311
|
-
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
309
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
310
|
+
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CGq6LspY.cjs"));
|
|
312
311
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
313
312
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
314
|
-
require_logger.
|
|
313
|
+
require_logger.logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
315
314
|
await startOTLPReceiver(port, host);
|
|
316
315
|
otlpReceiverStarted = true;
|
|
317
|
-
require_logger.
|
|
316
|
+
require_logger.logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
|
|
318
317
|
} catch (error) {
|
|
319
|
-
require_logger.
|
|
318
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
|
|
320
319
|
}
|
|
321
|
-
} else if (otlpReceiverStarted) require_logger.
|
|
320
|
+
} else if (otlpReceiverStarted) require_logger.logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
|
|
322
321
|
else {
|
|
323
|
-
require_logger.
|
|
324
|
-
require_logger.
|
|
325
|
-
require_logger.
|
|
322
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
|
|
323
|
+
require_logger.logger.debug(`[EvaluatorTracing] tracing.enabled: ${testSuite.tracing?.enabled}`);
|
|
324
|
+
require_logger.logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${testSuite.tracing?.otlp?.http?.enabled}`);
|
|
326
325
|
}
|
|
327
326
|
}
|
|
328
327
|
/**
|
|
@@ -330,13 +329,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
330
329
|
*/
|
|
331
330
|
async function stopOtlpReceiverIfNeeded() {
|
|
332
331
|
if (otlpReceiverStarted) try {
|
|
333
|
-
require_logger.
|
|
334
|
-
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
332
|
+
require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
333
|
+
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CGq6LspY.cjs"));
|
|
335
334
|
await stopOTLPReceiver();
|
|
336
335
|
otlpReceiverStarted = false;
|
|
337
|
-
require_logger.
|
|
336
|
+
require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
338
337
|
} catch (error) {
|
|
339
|
-
require_logger.
|
|
338
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
|
|
340
339
|
}
|
|
341
340
|
}
|
|
342
341
|
/**
|
|
@@ -352,7 +351,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
352
351
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
353
352
|
const envEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
354
353
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
355
|
-
require_logger.
|
|
354
|
+
require_logger.logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
356
355
|
return result;
|
|
357
356
|
}
|
|
358
357
|
/**
|
|
@@ -361,25 +360,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
361
360
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
362
361
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
363
362
|
if (tracingEnabled) {
|
|
364
|
-
require_logger.
|
|
365
|
-
require_logger.
|
|
363
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
364
|
+
require_logger.logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
366
365
|
}
|
|
367
366
|
if (!tracingEnabled) return null;
|
|
368
|
-
require_logger.
|
|
369
|
-
const { getTraceStore } = await Promise.resolve().then(() => require("./store-
|
|
367
|
+
require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
|
|
368
|
+
const { getTraceStore } = await Promise.resolve().then(() => require("./store-eYkaKMwq.cjs"));
|
|
370
369
|
const traceStore = getTraceStore();
|
|
371
370
|
const traceId = generateTraceId();
|
|
372
371
|
const spanId = generateSpanId();
|
|
373
372
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
374
|
-
require_logger.
|
|
373
|
+
require_logger.logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
375
374
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
376
375
|
if (!evaluationId) {
|
|
377
|
-
require_logger.
|
|
376
|
+
require_logger.logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
378
377
|
evaluationId = `eval-${Date.now()}`;
|
|
379
378
|
}
|
|
380
379
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
381
380
|
try {
|
|
382
|
-
require_logger.
|
|
381
|
+
require_logger.logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
383
382
|
await traceStore.createTrace({
|
|
384
383
|
traceId,
|
|
385
384
|
evaluationId: evaluationId || "",
|
|
@@ -390,18 +389,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
390
389
|
vars: test.vars
|
|
391
390
|
}
|
|
392
391
|
});
|
|
393
|
-
require_logger.
|
|
392
|
+
require_logger.logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
394
393
|
} catch (error) {
|
|
395
|
-
require_logger.
|
|
394
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
396
395
|
}
|
|
397
|
-
require_logger.
|
|
396
|
+
require_logger.logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
398
397
|
return {
|
|
399
398
|
traceparent,
|
|
400
399
|
evaluationId,
|
|
401
400
|
testCaseId
|
|
402
401
|
};
|
|
403
402
|
}
|
|
404
|
-
|
|
405
403
|
//#endregion
|
|
406
404
|
//#region src/assertions/answerRelevance.ts
|
|
407
405
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -412,7 +410,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
412
410
|
...await require_graders.matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
413
411
|
};
|
|
414
412
|
};
|
|
415
|
-
|
|
416
413
|
//#endregion
|
|
417
414
|
//#region src/assertions/assertionsResult.ts
|
|
418
415
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -518,7 +515,6 @@ var AssertionsResult = class {
|
|
|
518
515
|
return this.result;
|
|
519
516
|
}
|
|
520
517
|
};
|
|
521
|
-
|
|
522
518
|
//#endregion
|
|
523
519
|
//#region src/assertions/ngrams.ts
|
|
524
520
|
/**
|
|
@@ -534,7 +530,6 @@ function getNGrams(words, n) {
|
|
|
534
530
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
535
531
|
return ngrams;
|
|
536
532
|
}
|
|
537
|
-
|
|
538
533
|
//#endregion
|
|
539
534
|
//#region src/assertions/bleu.ts
|
|
540
535
|
/**
|
|
@@ -630,7 +625,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
630
625
|
assertion
|
|
631
626
|
};
|
|
632
627
|
}
|
|
633
|
-
|
|
634
628
|
//#endregion
|
|
635
629
|
//#region src/assertions/classifier.ts
|
|
636
630
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -645,9 +639,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
645
639
|
...classificationResult
|
|
646
640
|
};
|
|
647
641
|
}
|
|
648
|
-
|
|
649
642
|
//#endregion
|
|
650
643
|
//#region src/assertions/contains.ts
|
|
644
|
+
function parseCommaSeparatedValues(value) {
|
|
645
|
+
const results = [];
|
|
646
|
+
let i = 0;
|
|
647
|
+
while (i < value.length) {
|
|
648
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
649
|
+
if (i >= value.length) break;
|
|
650
|
+
if (value[i] === ",") {
|
|
651
|
+
i++;
|
|
652
|
+
continue;
|
|
653
|
+
}
|
|
654
|
+
if (value[i] === "\"") {
|
|
655
|
+
i++;
|
|
656
|
+
let field = "";
|
|
657
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
658
|
+
field += value[i + 1];
|
|
659
|
+
i += 2;
|
|
660
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
661
|
+
field += "\"";
|
|
662
|
+
i += 2;
|
|
663
|
+
} else if (value[i] === "\"") {
|
|
664
|
+
i++;
|
|
665
|
+
break;
|
|
666
|
+
} else {
|
|
667
|
+
field += value[i];
|
|
668
|
+
i++;
|
|
669
|
+
}
|
|
670
|
+
results.push(field);
|
|
671
|
+
} else {
|
|
672
|
+
const start = i;
|
|
673
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
674
|
+
results.push(value.substring(start, i).trim());
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
return results;
|
|
678
|
+
}
|
|
651
679
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
652
680
|
const value = valueFromScript ?? renderedValue;
|
|
653
681
|
require_invariant.invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -675,7 +703,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
675
703
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
676
704
|
let value = valueFromScript ?? renderedValue;
|
|
677
705
|
require_invariant.invariant(value, "\"contains-any\" assertion type must have a value");
|
|
678
|
-
if (typeof value === "string") value = value
|
|
706
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
679
707
|
require_invariant.invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
680
708
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
681
709
|
return {
|
|
@@ -688,7 +716,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
688
716
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
689
717
|
let value = valueFromScript ?? renderedValue;
|
|
690
718
|
require_invariant.invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
691
|
-
if (typeof value === "string") value = value
|
|
719
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
692
720
|
require_invariant.invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
693
721
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
694
722
|
return {
|
|
@@ -701,7 +729,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
701
729
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
702
730
|
let value = valueFromScript ?? renderedValue;
|
|
703
731
|
require_invariant.invariant(value, "\"contains-all\" assertion type must have a value");
|
|
704
|
-
if (typeof value === "string") value = value
|
|
732
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
705
733
|
require_invariant.invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
706
734
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
707
735
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -715,7 +743,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
715
743
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
716
744
|
let value = valueFromScript ?? renderedValue;
|
|
717
745
|
require_invariant.invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
718
|
-
if (typeof value === "string") value = value
|
|
746
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
719
747
|
require_invariant.invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
720
748
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
721
749
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -726,7 +754,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
726
754
|
assertion
|
|
727
755
|
};
|
|
728
756
|
};
|
|
729
|
-
|
|
730
757
|
//#endregion
|
|
731
758
|
//#region src/assertions/contextFaithfulness.ts
|
|
732
759
|
/**
|
|
@@ -750,7 +777,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
750
777
|
metadata: { context }
|
|
751
778
|
};
|
|
752
779
|
}
|
|
753
|
-
|
|
754
780
|
//#endregion
|
|
755
781
|
//#region src/assertions/contextRecall.ts
|
|
756
782
|
/**
|
|
@@ -777,7 +803,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
777
803
|
}
|
|
778
804
|
};
|
|
779
805
|
};
|
|
780
|
-
|
|
781
806
|
//#endregion
|
|
782
807
|
//#region src/assertions/contextRelevance.ts
|
|
783
808
|
/**
|
|
@@ -804,7 +829,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
804
829
|
}
|
|
805
830
|
};
|
|
806
831
|
};
|
|
807
|
-
|
|
808
832
|
//#endregion
|
|
809
833
|
//#region src/assertions/cost.ts
|
|
810
834
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -818,7 +842,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
818
842
|
assertion
|
|
819
843
|
};
|
|
820
844
|
};
|
|
821
|
-
|
|
822
845
|
//#endregion
|
|
823
846
|
//#region src/assertions/equals.ts
|
|
824
847
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -838,7 +861,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
838
861
|
assertion
|
|
839
862
|
};
|
|
840
863
|
};
|
|
841
|
-
|
|
842
864
|
//#endregion
|
|
843
865
|
//#region src/assertions/factuality.ts
|
|
844
866
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -849,7 +871,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
849
871
|
...await require_graders.matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
850
872
|
};
|
|
851
873
|
};
|
|
852
|
-
|
|
853
874
|
//#endregion
|
|
854
875
|
//#region src/assertions/finishReason.ts
|
|
855
876
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -869,7 +890,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
869
890
|
assertion
|
|
870
891
|
};
|
|
871
892
|
}
|
|
872
|
-
|
|
873
893
|
//#endregion
|
|
874
894
|
//#region src/assertions/functionToolCall.ts
|
|
875
895
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -892,7 +912,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
892
912
|
};
|
|
893
913
|
}
|
|
894
914
|
};
|
|
895
|
-
|
|
896
915
|
//#endregion
|
|
897
916
|
//#region src/assertions/geval.ts
|
|
898
917
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -918,7 +937,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
918
937
|
...await require_graders.matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
919
938
|
};
|
|
920
939
|
};
|
|
921
|
-
|
|
922
940
|
//#endregion
|
|
923
941
|
//#region src/assertions/gleu.ts
|
|
924
942
|
/**
|
|
@@ -1017,7 +1035,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
1017
1035
|
assertion
|
|
1018
1036
|
};
|
|
1019
1037
|
}
|
|
1020
|
-
|
|
1021
1038
|
//#endregion
|
|
1022
1039
|
//#region src/assertions/guardrails.ts
|
|
1023
1040
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -1026,7 +1043,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1026
1043
|
flaggedInput: false,
|
|
1027
1044
|
flaggedOutput: false
|
|
1028
1045
|
};
|
|
1029
|
-
require_logger.
|
|
1046
|
+
require_logger.logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
1030
1047
|
if (guardrails) if (guardrails.flagged) {
|
|
1031
1048
|
let reason = "Content failed safety checks";
|
|
1032
1049
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1055,7 +1072,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1055
1072
|
assertion
|
|
1056
1073
|
};
|
|
1057
1074
|
};
|
|
1058
|
-
|
|
1059
1075
|
//#endregion
|
|
1060
1076
|
//#region src/assertions/html.ts
|
|
1061
1077
|
const HTML_PATTERNS = {
|
|
@@ -1264,7 +1280,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1264
1280
|
assertion
|
|
1265
1281
|
};
|
|
1266
1282
|
};
|
|
1267
|
-
|
|
1268
1283
|
//#endregion
|
|
1269
1284
|
//#region src/assertions/javascript.ts
|
|
1270
1285
|
/**
|
|
@@ -1405,7 +1420,6 @@ ${renderedValue}`,
|
|
|
1405
1420
|
assertion
|
|
1406
1421
|
};
|
|
1407
1422
|
};
|
|
1408
|
-
|
|
1409
1423
|
//#endregion
|
|
1410
1424
|
//#region src/assertions/json.ts
|
|
1411
1425
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1417,7 +1431,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1417
1431
|
} catch {
|
|
1418
1432
|
pass = inverse;
|
|
1419
1433
|
}
|
|
1420
|
-
if (
|
|
1434
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1421
1435
|
let validate;
|
|
1422
1436
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1423
1437
|
const schema = valueFromScript;
|
|
@@ -1429,11 +1443,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1429
1443
|
}
|
|
1430
1444
|
else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
|
|
1431
1445
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1432
|
-
|
|
1446
|
+
const valid = validate(parsedJson);
|
|
1447
|
+
pass = inverse ? !valid : valid;
|
|
1433
1448
|
if (!pass) return {
|
|
1434
1449
|
pass,
|
|
1435
1450
|
score: 0,
|
|
1436
|
-
reason: `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
|
|
1451
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
|
|
1437
1452
|
assertion
|
|
1438
1453
|
};
|
|
1439
1454
|
}
|
|
@@ -1460,9 +1475,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1460
1475
|
}
|
|
1461
1476
|
else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
|
|
1462
1477
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1478
|
+
const valid = validate(jsonObject);
|
|
1479
|
+
pass = inverse ? !valid : valid;
|
|
1480
|
+
if (valid) {
|
|
1481
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1482
|
+
break;
|
|
1483
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`;
|
|
1466
1484
|
}
|
|
1467
1485
|
return {
|
|
1468
1486
|
pass,
|
|
@@ -1471,7 +1489,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1471
1489
|
assertion
|
|
1472
1490
|
};
|
|
1473
1491
|
}
|
|
1474
|
-
|
|
1475
1492
|
//#endregion
|
|
1476
1493
|
//#region src/assertions/latency.ts
|
|
1477
1494
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1485,7 +1502,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1485
1502
|
assertion
|
|
1486
1503
|
};
|
|
1487
1504
|
};
|
|
1488
|
-
|
|
1489
1505
|
//#endregion
|
|
1490
1506
|
//#region src/assertions/levenshtein.ts
|
|
1491
1507
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1500,7 +1516,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1500
1516
|
assertion
|
|
1501
1517
|
};
|
|
1502
1518
|
}
|
|
1503
|
-
|
|
1504
1519
|
//#endregion
|
|
1505
1520
|
//#region src/assertions/llmRubric.ts
|
|
1506
1521
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1509,7 +1524,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1509
1524
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1510
1525
|
return require_graders.matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1511
1526
|
};
|
|
1512
|
-
|
|
1513
1527
|
//#endregion
|
|
1514
1528
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1515
1529
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1520,7 +1534,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1520
1534
|
...await require_graders.matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1521
1535
|
};
|
|
1522
1536
|
};
|
|
1523
|
-
|
|
1524
1537
|
//#endregion
|
|
1525
1538
|
//#region src/util/providerResponse.ts
|
|
1526
1539
|
/**
|
|
@@ -1563,7 +1576,6 @@ function getActualPrompt(response, options = {}) {
|
|
|
1563
1576
|
function getActualPromptWithFallback(response, originalPrompt, options = {}) {
|
|
1564
1577
|
return getActualPrompt(response, options) || originalPrompt;
|
|
1565
1578
|
}
|
|
1566
|
-
|
|
1567
1579
|
//#endregion
|
|
1568
1580
|
//#region src/assertions/moderation.ts
|
|
1569
1581
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1586,7 +1598,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1586
1598
|
assertion
|
|
1587
1599
|
};
|
|
1588
1600
|
};
|
|
1589
|
-
|
|
1590
1601
|
//#endregion
|
|
1591
1602
|
//#region src/assertions/openai.ts
|
|
1592
1603
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1647,7 +1658,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1647
1658
|
};
|
|
1648
1659
|
}
|
|
1649
1660
|
};
|
|
1650
|
-
|
|
1651
1661
|
//#endregion
|
|
1652
1662
|
//#region src/assertions/perplexity.ts
|
|
1653
1663
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1674,7 +1684,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1674
1684
|
assertion
|
|
1675
1685
|
};
|
|
1676
1686
|
}
|
|
1677
|
-
|
|
1678
1687
|
//#endregion
|
|
1679
1688
|
//#region src/assertions/pi.ts
|
|
1680
1689
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1682,7 +1691,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1682
1691
|
require_invariant.invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1683
1692
|
return require_graders.matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1684
1693
|
};
|
|
1685
|
-
|
|
1686
1694
|
//#endregion
|
|
1687
1695
|
//#region src/python/wrapper.ts
|
|
1688
1696
|
/**
|
|
@@ -1698,17 +1706,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1698
1706
|
fs.default.writeFileSync(tempFilePath, code);
|
|
1699
1707
|
return await require_pythonUtils.runPython(tempFilePath, method, args);
|
|
1700
1708
|
} catch (error) {
|
|
1701
|
-
require_logger.
|
|
1709
|
+
require_logger.logger.error(`Error executing Python code: ${error}`);
|
|
1702
1710
|
throw error;
|
|
1703
1711
|
} finally {
|
|
1704
1712
|
try {
|
|
1705
1713
|
fs.default.unlinkSync(tempFilePath);
|
|
1706
1714
|
} catch (error) {
|
|
1707
|
-
require_logger.
|
|
1715
|
+
require_logger.logger.error(`Error removing temporary file: ${error}`);
|
|
1708
1716
|
}
|
|
1709
1717
|
}
|
|
1710
1718
|
}
|
|
1711
|
-
|
|
1712
1719
|
//#endregion
|
|
1713
1720
|
//#region src/util/caseMapping.ts
|
|
1714
1721
|
/**
|
|
@@ -1732,7 +1739,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1732
1739
|
});
|
|
1733
1740
|
return result;
|
|
1734
1741
|
}
|
|
1735
|
-
|
|
1736
1742
|
//#endregion
|
|
1737
1743
|
//#region src/assertions/python.ts
|
|
1738
1744
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1802,7 +1808,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1802
1808
|
assertion
|
|
1803
1809
|
};
|
|
1804
1810
|
};
|
|
1805
|
-
|
|
1806
1811
|
//#endregion
|
|
1807
1812
|
//#region src/assertions/redteam.ts
|
|
1808
1813
|
/**
|
|
@@ -1883,7 +1888,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1883
1888
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1884
1889
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1885
1890
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1886
|
-
require_logger.
|
|
1891
|
+
require_logger.logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1887
1892
|
error: errorMessage,
|
|
1888
1893
|
strategyId: test.metadata.strategyId,
|
|
1889
1894
|
pluginId: test.metadata.pluginId
|
|
@@ -1903,7 +1908,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1903
1908
|
throw error;
|
|
1904
1909
|
}
|
|
1905
1910
|
};
|
|
1906
|
-
|
|
1907
1911
|
//#endregion
|
|
1908
1912
|
//#region src/assertions/refusal.ts
|
|
1909
1913
|
function handleIsRefusal(params) {
|
|
@@ -1931,7 +1935,6 @@ function handleIsRefusal(params) {
|
|
|
1931
1935
|
assertion
|
|
1932
1936
|
};
|
|
1933
1937
|
}
|
|
1934
|
-
|
|
1935
1938
|
//#endregion
|
|
1936
1939
|
//#region src/assertions/regex.ts
|
|
1937
1940
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1956,7 +1959,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1956
1959
|
assertion
|
|
1957
1960
|
};
|
|
1958
1961
|
};
|
|
1959
|
-
|
|
1960
1962
|
//#endregion
|
|
1961
1963
|
//#region src/assertions/rouge.ts
|
|
1962
1964
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1972,7 +1974,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1972
1974
|
assertion
|
|
1973
1975
|
};
|
|
1974
1976
|
}
|
|
1975
|
-
|
|
1976
1977
|
//#endregion
|
|
1977
1978
|
//#region src/ruby/wrapper.ts
|
|
1978
1979
|
/**
|
|
@@ -1988,17 +1989,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1988
1989
|
fs.default.writeFileSync(tempFilePath, code);
|
|
1989
1990
|
return await require_rubyUtils.runRuby(tempFilePath, method, args);
|
|
1990
1991
|
} catch (error) {
|
|
1991
|
-
require_logger.
|
|
1992
|
+
require_logger.logger.error(`Error executing Ruby code: ${error}`);
|
|
1992
1993
|
throw error;
|
|
1993
1994
|
} finally {
|
|
1994
1995
|
try {
|
|
1995
1996
|
fs.default.unlinkSync(tempFilePath);
|
|
1996
1997
|
} catch (error) {
|
|
1997
|
-
require_logger.
|
|
1998
|
+
require_logger.logger.error(`Error removing temporary file: ${error}`);
|
|
1998
1999
|
}
|
|
1999
2000
|
}
|
|
2000
2001
|
}
|
|
2001
|
-
|
|
2002
2002
|
//#endregion
|
|
2003
2003
|
//#region src/assertions/ruby.ts
|
|
2004
2004
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -2069,7 +2069,6 @@ end
|
|
|
2069
2069
|
assertion
|
|
2070
2070
|
};
|
|
2071
2071
|
};
|
|
2072
|
-
|
|
2073
2072
|
//#endregion
|
|
2074
2073
|
//#region src/assertions/searchRubric.ts
|
|
2075
2074
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -2081,7 +2080,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
2081
2080
|
}
|
|
2082
2081
|
return result;
|
|
2083
2082
|
}
|
|
2084
|
-
|
|
2085
2083
|
//#endregion
|
|
2086
2084
|
//#region src/assertions/similar.ts
|
|
2087
2085
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2124,7 +2122,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2124
2122
|
...await require_graders.matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2125
2123
|
};
|
|
2126
2124
|
};
|
|
2127
|
-
|
|
2128
2125
|
//#endregion
|
|
2129
2126
|
//#region src/assertions/sql.ts
|
|
2130
2127
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2216,7 +2213,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2216
2213
|
}
|
|
2217
2214
|
return handleIsSql(assertionParams);
|
|
2218
2215
|
};
|
|
2219
|
-
|
|
2220
2216
|
//#endregion
|
|
2221
2217
|
//#region src/assertions/startsWith.ts
|
|
2222
2218
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2230,7 +2226,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2230
2226
|
assertion
|
|
2231
2227
|
};
|
|
2232
2228
|
};
|
|
2233
|
-
|
|
2234
2229
|
//#endregion
|
|
2235
2230
|
//#region src/assertions/toolCallF1.ts
|
|
2236
2231
|
/**
|
|
@@ -2359,7 +2354,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2359
2354
|
assertion
|
|
2360
2355
|
};
|
|
2361
2356
|
};
|
|
2362
|
-
|
|
2363
2357
|
//#endregion
|
|
2364
2358
|
//#region src/assertions/traceUtils.ts
|
|
2365
2359
|
/**
|
|
@@ -2377,7 +2371,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2377
2371
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2378
2372
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2379
2373
|
}
|
|
2380
|
-
|
|
2381
2374
|
//#endregion
|
|
2382
2375
|
//#region src/assertions/traceErrorSpans.ts
|
|
2383
2376
|
function isErrorSpan(span) {
|
|
@@ -2455,7 +2448,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2455
2448
|
assertion
|
|
2456
2449
|
};
|
|
2457
2450
|
};
|
|
2458
|
-
|
|
2459
2451
|
//#endregion
|
|
2460
2452
|
//#region src/assertions/traceSpanCount.ts
|
|
2461
2453
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2490,7 +2482,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2490
2482
|
assertion
|
|
2491
2483
|
};
|
|
2492
2484
|
};
|
|
2493
|
-
|
|
2494
2485
|
//#endregion
|
|
2495
2486
|
//#region src/assertions/traceSpanDuration.ts
|
|
2496
2487
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2548,7 +2539,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2548
2539
|
assertion
|
|
2549
2540
|
};
|
|
2550
2541
|
};
|
|
2551
|
-
|
|
2552
2542
|
//#endregion
|
|
2553
2543
|
//#region src/assertions/webhook.ts
|
|
2554
2544
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2585,7 +2575,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2585
2575
|
};
|
|
2586
2576
|
}
|
|
2587
2577
|
}
|
|
2588
|
-
|
|
2589
2578
|
//#endregion
|
|
2590
2579
|
//#region src/assertions/wordCount.ts
|
|
2591
2580
|
/**
|
|
@@ -2648,7 +2637,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2648
2637
|
assertion
|
|
2649
2638
|
};
|
|
2650
2639
|
};
|
|
2651
|
-
|
|
2652
2640
|
//#endregion
|
|
2653
2641
|
//#region src/assertions/xml.ts
|
|
2654
2642
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2723,7 +2711,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2723
2711
|
assertion
|
|
2724
2712
|
};
|
|
2725
2713
|
};
|
|
2726
|
-
|
|
2727
2714
|
//#endregion
|
|
2728
2715
|
//#region src/assertions/index.ts
|
|
2729
2716
|
const ASSERTIONS_MAX_CONCURRENCY = require_logger.getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2777,7 +2764,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2777
2764
|
"llm-rubric": handleLlmRubric,
|
|
2778
2765
|
meteor: async (params) => {
|
|
2779
2766
|
try {
|
|
2780
|
-
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-
|
|
2767
|
+
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-DLZZ3osF.cjs"));
|
|
2781
2768
|
return handleMeteorAssertion(params);
|
|
2782
2769
|
} catch (error) {
|
|
2783
2770
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2823,10 +2810,10 @@ function renderMetricName(metric, vars) {
|
|
|
2823
2810
|
if (!metric) return metric;
|
|
2824
2811
|
try {
|
|
2825
2812
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2826
|
-
if (rendered === "" && metric !== "") require_logger.
|
|
2813
|
+
if (rendered === "" && metric !== "") require_logger.logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2827
2814
|
return rendered;
|
|
2828
2815
|
} catch (error) {
|
|
2829
|
-
require_logger.
|
|
2816
|
+
require_logger.logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2830
2817
|
return metric;
|
|
2831
2818
|
}
|
|
2832
2819
|
}
|
|
@@ -2877,12 +2864,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2877
2864
|
spans: traceData.spans || []
|
|
2878
2865
|
};
|
|
2879
2866
|
} catch (error) {
|
|
2880
|
-
require_logger.
|
|
2867
|
+
require_logger.logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2881
2868
|
}
|
|
2882
2869
|
let renderedValue = assertion.value;
|
|
2883
2870
|
let valueFromScript;
|
|
2884
2871
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2885
|
-
const basePath = require_logger.
|
|
2872
|
+
const basePath = require_logger.state.basePath || "";
|
|
2886
2873
|
const fileRef = renderedValue.slice(7);
|
|
2887
2874
|
let filePath = fileRef;
|
|
2888
2875
|
let functionName;
|
|
@@ -2894,10 +2881,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2894
2881
|
filePath = path.default.resolve(basePath, filePath);
|
|
2895
2882
|
if (require_fileExtensions.isJavascriptFile(filePath)) {
|
|
2896
2883
|
valueFromScript = await require_graders.loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2897
|
-
require_logger.
|
|
2884
|
+
require_logger.logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2898
2885
|
} else if (filePath.endsWith(".py")) try {
|
|
2899
2886
|
valueFromScript = await require_pythonUtils.runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2900
|
-
require_logger.
|
|
2887
|
+
require_logger.logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2901
2888
|
} catch (error) {
|
|
2902
2889
|
return {
|
|
2903
2890
|
pass: false,
|
|
@@ -2907,9 +2894,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2907
2894
|
};
|
|
2908
2895
|
}
|
|
2909
2896
|
else if (filePath.endsWith(".rb")) try {
|
|
2910
|
-
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-
|
|
2897
|
+
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CP42kMvq.cjs"));
|
|
2911
2898
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2912
|
-
require_logger.
|
|
2899
|
+
require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2913
2900
|
} catch (error) {
|
|
2914
2901
|
return {
|
|
2915
2902
|
pass: false,
|
|
@@ -2920,7 +2907,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2920
2907
|
}
|
|
2921
2908
|
else renderedValue = require_graders.processFileReference(renderedValue);
|
|
2922
2909
|
} else if (require_providers.isPackagePath(renderedValue)) {
|
|
2923
|
-
const basePath = require_logger.
|
|
2910
|
+
const basePath = require_logger.state.basePath || "";
|
|
2924
2911
|
const requiredModule = await require_providers.loadFromPackage(renderedValue, basePath);
|
|
2925
2912
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2926
2913
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -3081,7 +3068,6 @@ var assertions_default = {
|
|
|
3081
3068
|
matchesModeration: require_graders.matchesModeration,
|
|
3082
3069
|
matchesConversationRelevance
|
|
3083
3070
|
};
|
|
3084
|
-
|
|
3085
3071
|
//#endregion
|
|
3086
3072
|
//#region src/database/signal.ts
|
|
3087
3073
|
/**
|
|
@@ -3096,10 +3082,9 @@ function updateSignalFile(evalId) {
|
|
|
3096
3082
|
const content = evalId ? `${evalId}:${now.toISOString()}` : now.toISOString();
|
|
3097
3083
|
fs.default.writeFileSync(filePath, content);
|
|
3098
3084
|
} catch (err) {
|
|
3099
|
-
require_logger.
|
|
3085
|
+
require_logger.logger.warn(`Failed to write database signal file: ${err}`);
|
|
3100
3086
|
}
|
|
3101
3087
|
}
|
|
3102
|
-
|
|
3103
3088
|
//#endregion
|
|
3104
3089
|
//#region src/progress/ciProgressReporter.ts
|
|
3105
3090
|
var CIProgressReporter = class {
|
|
@@ -3121,7 +3106,7 @@ var CIProgressReporter = class {
|
|
|
3121
3106
|
}
|
|
3122
3107
|
start() {
|
|
3123
3108
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3124
|
-
require_logger.
|
|
3109
|
+
require_logger.logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3125
3110
|
this.intervalId = setInterval(() => {
|
|
3126
3111
|
this.logPeriodicUpdate();
|
|
3127
3112
|
}, this.updateIntervalMs);
|
|
@@ -3152,14 +3137,14 @@ var CIProgressReporter = class {
|
|
|
3152
3137
|
this.intervalId = null;
|
|
3153
3138
|
}
|
|
3154
3139
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3155
|
-
require_logger.
|
|
3140
|
+
require_logger.logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3156
3141
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3157
3142
|
}
|
|
3158
3143
|
error(message) {
|
|
3159
3144
|
const now = Date.now();
|
|
3160
3145
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3161
3146
|
this.lastErrorTime = now;
|
|
3162
|
-
require_logger.
|
|
3147
|
+
require_logger.logger.error(`[Evaluation Error] ${message}`);
|
|
3163
3148
|
if (process.env.GITHUB_ACTIONS) {
|
|
3164
3149
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3165
3150
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3178,12 +3163,12 @@ var CIProgressReporter = class {
|
|
|
3178
3163
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3179
3164
|
}
|
|
3180
3165
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3181
|
-
require_logger.
|
|
3182
|
-
require_logger.
|
|
3166
|
+
require_logger.logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3167
|
+
require_logger.logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3183
3168
|
}
|
|
3184
3169
|
logMilestone(percentage) {
|
|
3185
3170
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3186
|
-
require_logger.
|
|
3171
|
+
require_logger.logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3187
3172
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3188
3173
|
}
|
|
3189
3174
|
formatElapsedTime(ms) {
|
|
@@ -3194,7 +3179,6 @@ var CIProgressReporter = class {
|
|
|
3194
3179
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3195
3180
|
}
|
|
3196
3181
|
};
|
|
3197
|
-
|
|
3198
3182
|
//#endregion
|
|
3199
3183
|
//#region src/providers/azure/warnings.ts
|
|
3200
3184
|
/**
|
|
@@ -3208,13 +3192,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3208
3192
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3209
3193
|
if (modelGradedAsserts.length > 0) {
|
|
3210
3194
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3211
|
-
require_logger.
|
|
3195
|
+
require_logger.logger.warn(chalk.default.yellow(`You are using model-graded assertions of types ${chalk.default.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.default.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3212
3196
|
return true;
|
|
3213
3197
|
}
|
|
3214
3198
|
}
|
|
3215
3199
|
return false;
|
|
3216
3200
|
}
|
|
3217
|
-
|
|
3218
3201
|
//#endregion
|
|
3219
3202
|
//#region src/suggestions.ts
|
|
3220
3203
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3245,7 +3228,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3245
3228
|
};
|
|
3246
3229
|
}
|
|
3247
3230
|
}
|
|
3248
|
-
|
|
3249
3231
|
//#endregion
|
|
3250
3232
|
//#region src/tracing/otelConfig.ts
|
|
3251
3233
|
/**
|
|
@@ -3271,7 +3253,6 @@ function getDefaultOtelConfig() {
|
|
|
3271
3253
|
enabled: true
|
|
3272
3254
|
};
|
|
3273
3255
|
}
|
|
3274
|
-
|
|
3275
3256
|
//#endregion
|
|
3276
3257
|
//#region src/tracing/localSpanExporter.ts
|
|
3277
3258
|
/**
|
|
@@ -3291,7 +3272,7 @@ var LocalSpanExporter = class {
|
|
|
3291
3272
|
});
|
|
3292
3273
|
else resultCallback({ code: _opentelemetry_core.ExportResultCode.SUCCESS });
|
|
3293
3274
|
}).catch((error) => {
|
|
3294
|
-
require_logger.
|
|
3275
|
+
require_logger.logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3295
3276
|
resultCallback({
|
|
3296
3277
|
code: _opentelemetry_core.ExportResultCode.FAILED,
|
|
3297
3278
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3305,7 +3286,7 @@ var LocalSpanExporter = class {
|
|
|
3305
3286
|
async exportAsync(spans) {
|
|
3306
3287
|
if (spans.length === 0) return;
|
|
3307
3288
|
const traceStore = require_store.getTraceStore();
|
|
3308
|
-
require_logger.
|
|
3289
|
+
require_logger.logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3309
3290
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3310
3291
|
for (const span of spans) {
|
|
3311
3292
|
const traceId = span.spanContext().traceId;
|
|
@@ -3316,12 +3297,12 @@ var LocalSpanExporter = class {
|
|
|
3316
3297
|
let firstError;
|
|
3317
3298
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3318
3299
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3319
|
-
if (result.stored) require_logger.
|
|
3320
|
-
else require_logger.
|
|
3300
|
+
if (result.stored) require_logger.logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3301
|
+
else require_logger.logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3321
3302
|
} catch (error) {
|
|
3322
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) require_logger.
|
|
3303
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) require_logger.logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3323
3304
|
else {
|
|
3324
|
-
require_logger.
|
|
3305
|
+
require_logger.logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3325
3306
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3326
3307
|
}
|
|
3327
3308
|
}
|
|
@@ -3358,7 +3339,7 @@ var LocalSpanExporter = class {
|
|
|
3358
3339
|
* Shutdown the exporter. No-op for local storage.
|
|
3359
3340
|
*/
|
|
3360
3341
|
shutdown() {
|
|
3361
|
-
require_logger.
|
|
3342
|
+
require_logger.logger.debug("[LocalSpanExporter] Shutting down");
|
|
3362
3343
|
return Promise.resolve();
|
|
3363
3344
|
}
|
|
3364
3345
|
/**
|
|
@@ -3368,7 +3349,6 @@ var LocalSpanExporter = class {
|
|
|
3368
3349
|
return Promise.resolve();
|
|
3369
3350
|
}
|
|
3370
3351
|
};
|
|
3371
|
-
|
|
3372
3352
|
//#endregion
|
|
3373
3353
|
//#region src/tracing/otelSdk.ts
|
|
3374
3354
|
let provider = null;
|
|
@@ -3396,21 +3376,21 @@ function getHandlers() {
|
|
|
3396
3376
|
*/
|
|
3397
3377
|
function initializeOtel(config) {
|
|
3398
3378
|
if (initialized) {
|
|
3399
|
-
require_logger.
|
|
3379
|
+
require_logger.logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3400
3380
|
return;
|
|
3401
3381
|
}
|
|
3402
3382
|
if (!config.enabled) {
|
|
3403
|
-
require_logger.
|
|
3383
|
+
require_logger.logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3404
3384
|
return;
|
|
3405
3385
|
}
|
|
3406
|
-
require_logger.
|
|
3386
|
+
require_logger.logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3407
3387
|
serviceName: config.serviceName,
|
|
3408
3388
|
endpoint: config.endpoint,
|
|
3409
3389
|
localExport: config.localExport
|
|
3410
3390
|
});
|
|
3411
3391
|
if (config.debug) _opentelemetry_api.diag.setLogger(new _opentelemetry_api.DiagConsoleLogger(), _opentelemetry_api.DiagLogLevel.DEBUG);
|
|
3412
3392
|
_opentelemetry_api.propagation.setGlobalPropagator(new _opentelemetry_core.W3CTraceContextPropagator());
|
|
3413
|
-
require_logger.
|
|
3393
|
+
require_logger.logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3414
3394
|
const resource = (0, _opentelemetry_resources.resourceFromAttributes)({
|
|
3415
3395
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_NAME]: config.serviceName,
|
|
3416
3396
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_fetch.VERSION
|
|
@@ -3419,12 +3399,12 @@ function initializeOtel(config) {
|
|
|
3419
3399
|
if (config.localExport) {
|
|
3420
3400
|
const localExporter = new LocalSpanExporter();
|
|
3421
3401
|
spanProcessors.push(new _opentelemetry_sdk_trace_node.BatchSpanProcessor(localExporter));
|
|
3422
|
-
require_logger.
|
|
3402
|
+
require_logger.logger.debug("[OtelSdk] Added local span exporter");
|
|
3423
3403
|
}
|
|
3424
3404
|
if (config.endpoint) {
|
|
3425
3405
|
const otlpExporter = new _opentelemetry_exporter_trace_otlp_http.OTLPTraceExporter({ url: config.endpoint });
|
|
3426
3406
|
spanProcessors.push(new _opentelemetry_sdk_trace_node.BatchSpanProcessor(otlpExporter));
|
|
3427
|
-
require_logger.
|
|
3407
|
+
require_logger.logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3428
3408
|
}
|
|
3429
3409
|
provider = new _opentelemetry_sdk_trace_node.NodeTracerProvider({
|
|
3430
3410
|
resource,
|
|
@@ -3432,7 +3412,7 @@ function initializeOtel(config) {
|
|
|
3432
3412
|
});
|
|
3433
3413
|
provider.register();
|
|
3434
3414
|
initialized = true;
|
|
3435
|
-
require_logger.
|
|
3415
|
+
require_logger.logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3436
3416
|
setupShutdownHandlers();
|
|
3437
3417
|
}
|
|
3438
3418
|
/**
|
|
@@ -3441,12 +3421,12 @@ function initializeOtel(config) {
|
|
|
3441
3421
|
*/
|
|
3442
3422
|
async function shutdownOtel() {
|
|
3443
3423
|
if (!initialized || !provider) return;
|
|
3444
|
-
require_logger.
|
|
3424
|
+
require_logger.logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3445
3425
|
try {
|
|
3446
3426
|
await provider.shutdown();
|
|
3447
|
-
require_logger.
|
|
3427
|
+
require_logger.logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3448
3428
|
} catch (error) {
|
|
3449
|
-
require_logger.
|
|
3429
|
+
require_logger.logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3450
3430
|
} finally {
|
|
3451
3431
|
provider = null;
|
|
3452
3432
|
initialized = false;
|
|
@@ -3459,12 +3439,12 @@ async function shutdownOtel() {
|
|
|
3459
3439
|
*/
|
|
3460
3440
|
async function flushOtel() {
|
|
3461
3441
|
if (!initialized || !provider) return;
|
|
3462
|
-
require_logger.
|
|
3442
|
+
require_logger.logger.debug("[OtelSdk] Flushing pending spans");
|
|
3463
3443
|
try {
|
|
3464
3444
|
await provider.forceFlush();
|
|
3465
|
-
require_logger.
|
|
3445
|
+
require_logger.logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3466
3446
|
} catch (error) {
|
|
3467
|
-
require_logger.
|
|
3447
|
+
require_logger.logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3468
3448
|
}
|
|
3469
3449
|
}
|
|
3470
3450
|
/**
|
|
@@ -3476,7 +3456,7 @@ function setupShutdownHandlers() {
|
|
|
3476
3456
|
const handlers = getHandlers();
|
|
3477
3457
|
if (handlers.registered) return;
|
|
3478
3458
|
const shutdown = async (signal) => {
|
|
3479
|
-
require_logger.
|
|
3459
|
+
require_logger.logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3480
3460
|
await shutdownOtel();
|
|
3481
3461
|
};
|
|
3482
3462
|
handlers.sigTermHandler = () => {
|
|
@@ -3513,7 +3493,6 @@ function cleanupShutdownHandlers() {
|
|
|
3513
3493
|
}
|
|
3514
3494
|
handlers.registered = false;
|
|
3515
3495
|
}
|
|
3516
|
-
|
|
3517
3496
|
//#endregion
|
|
3518
3497
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3519
3498
|
var JsonlFileWriter = class {
|
|
@@ -3537,7 +3516,6 @@ var JsonlFileWriter = class {
|
|
|
3537
3516
|
});
|
|
3538
3517
|
}
|
|
3539
3518
|
};
|
|
3540
|
-
|
|
3541
3519
|
//#endregion
|
|
3542
3520
|
//#region src/util/promptMatching.ts
|
|
3543
3521
|
/**
|
|
@@ -3575,7 +3553,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3575
3553
|
if (allowedPrompts.length === 0) return false;
|
|
3576
3554
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3577
3555
|
}
|
|
3578
|
-
|
|
3579
3556
|
//#endregion
|
|
3580
3557
|
//#region src/evaluator.ts
|
|
3581
3558
|
/**
|
|
@@ -3767,7 +3744,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3767
3744
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3768
3745
|
else {
|
|
3769
3746
|
const activeProvider = require_types.isApiProvider(test.provider) ? test.provider : provider;
|
|
3770
|
-
require_logger.
|
|
3747
|
+
require_logger.logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3771
3748
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3772
3749
|
const callApiContext = {
|
|
3773
3750
|
vars,
|
|
@@ -3778,7 +3755,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3778
3755
|
filters,
|
|
3779
3756
|
originalProvider: provider,
|
|
3780
3757
|
test,
|
|
3781
|
-
logger: require_logger.
|
|
3758
|
+
logger: require_logger.logger,
|
|
3782
3759
|
getCache: require_cache.getCache,
|
|
3783
3760
|
repeatIndex
|
|
3784
3761
|
};
|
|
@@ -3795,8 +3772,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3795
3772
|
const sanitizedMetadata = require_logger.safeJsonStringify(response.metadata);
|
|
3796
3773
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3797
3774
|
}
|
|
3798
|
-
require_logger.
|
|
3799
|
-
require_logger.
|
|
3775
|
+
require_logger.logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3776
|
+
require_logger.logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3800
3777
|
}
|
|
3801
3778
|
latencyMs = Date.now() - startTime;
|
|
3802
3779
|
let conversationLastInput = void 0;
|
|
@@ -3813,12 +3790,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3813
3790
|
metadata: response.metadata
|
|
3814
3791
|
});
|
|
3815
3792
|
}
|
|
3816
|
-
require_logger.
|
|
3817
|
-
require_logger.
|
|
3793
|
+
require_logger.logger.debug("Evaluator response", { responsePreview: (require_logger.safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3794
|
+
require_logger.logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3818
3795
|
if (!response.cached && provider.delay > 0) {
|
|
3819
|
-
require_logger.
|
|
3796
|
+
require_logger.logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3820
3797
|
await require_fetch.sleep(provider.delay);
|
|
3821
|
-
} else if (response.cached) require_logger.
|
|
3798
|
+
} else if (response.cached) require_logger.logger.debug(`Skipping delay because response is cached`);
|
|
3822
3799
|
const ret = {
|
|
3823
3800
|
...setup,
|
|
3824
3801
|
response,
|
|
@@ -3921,7 +3898,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3921
3898
|
promptIdx,
|
|
3922
3899
|
testIdx
|
|
3923
3900
|
});
|
|
3924
|
-
if (!(err instanceof Error && err.name === "AbortError")) require_logger.
|
|
3901
|
+
if (!(err instanceof Error && err.name === "AbortError")) require_logger.logger.error("Provider call failed during eval", logContext);
|
|
3925
3902
|
return [{
|
|
3926
3903
|
...setup,
|
|
3927
3904
|
error: errorWithStack,
|
|
@@ -4004,7 +3981,7 @@ function generateVarCombinations(vars) {
|
|
|
4004
3981
|
let values = [];
|
|
4005
3982
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
4006
3983
|
const filePath = vars[key].slice(7);
|
|
4007
|
-
const basePath = require_logger.
|
|
3984
|
+
const basePath = require_logger.state.basePath || "";
|
|
4008
3985
|
values = ((0, glob.globSync)(filePath, {
|
|
4009
3986
|
cwd: basePath || process.cwd(),
|
|
4010
3987
|
windowsPathsNoEscape: true
|
|
@@ -4044,28 +4021,28 @@ var Evaluator = class {
|
|
|
4044
4021
|
this.conversations = {};
|
|
4045
4022
|
this.registers = {};
|
|
4046
4023
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
4047
|
-
this.rateLimitRegistry = require_providers.createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
4024
|
+
this.rateLimitRegistry = require_providers.createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
4048
4025
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
4049
|
-
require_logger.
|
|
4026
|
+
require_logger.logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
4050
4027
|
retryAfterMs: data.retryAfterMs,
|
|
4051
4028
|
resetAt: data.resetAt,
|
|
4052
4029
|
concurrencyChange: data.concurrencyChange
|
|
4053
4030
|
});
|
|
4054
4031
|
});
|
|
4055
4032
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
4056
|
-
require_logger.
|
|
4033
|
+
require_logger.logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
4057
4034
|
requestLimit: data.requestLimit,
|
|
4058
4035
|
tokenLimit: data.tokenLimit
|
|
4059
4036
|
});
|
|
4060
4037
|
});
|
|
4061
4038
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
4062
|
-
require_logger.
|
|
4039
|
+
require_logger.logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
4063
4040
|
previous: data.previous,
|
|
4064
4041
|
current: data.current
|
|
4065
4042
|
});
|
|
4066
4043
|
});
|
|
4067
4044
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
4068
|
-
require_logger.
|
|
4045
|
+
require_logger.logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
4069
4046
|
previous: data.previous,
|
|
4070
4047
|
current: data.current
|
|
4071
4048
|
});
|
|
@@ -4122,7 +4099,7 @@ var Evaluator = class {
|
|
|
4122
4099
|
const checkAbort = () => {
|
|
4123
4100
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4124
4101
|
};
|
|
4125
|
-
if (!options.silent) require_logger.
|
|
4102
|
+
if (!options.silent) require_logger.logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4126
4103
|
checkAbort();
|
|
4127
4104
|
const prompts = [];
|
|
4128
4105
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4134,32 +4111,32 @@ var Evaluator = class {
|
|
|
4134
4111
|
}
|
|
4135
4112
|
testSuite = (await require_providers.runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4136
4113
|
if (options.generateSuggestions) {
|
|
4137
|
-
require_logger.
|
|
4114
|
+
require_logger.logger.info(`Generating prompt variations...`);
|
|
4138
4115
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4139
4116
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4140
|
-
require_logger.
|
|
4117
|
+
require_logger.logger.info(chalk.default.blue("Generated prompts:"));
|
|
4141
4118
|
let numAdded = 0;
|
|
4142
4119
|
for (const prompt of newPrompts) {
|
|
4143
|
-
require_logger.
|
|
4144
|
-
require_logger.
|
|
4145
|
-
require_logger.
|
|
4120
|
+
require_logger.logger.info("--------------------------------------------------------");
|
|
4121
|
+
require_logger.logger.info(`${prompt}`);
|
|
4122
|
+
require_logger.logger.info("--------------------------------------------------------");
|
|
4146
4123
|
if (await require_server.promptYesNo("Do you want to test this prompt?", false)) {
|
|
4147
4124
|
testSuite.prompts.push({
|
|
4148
4125
|
raw: prompt,
|
|
4149
4126
|
label: prompt
|
|
4150
4127
|
});
|
|
4151
4128
|
numAdded++;
|
|
4152
|
-
} else require_logger.
|
|
4129
|
+
} else require_logger.logger.info("Skipping this prompt.");
|
|
4153
4130
|
}
|
|
4154
4131
|
if (numAdded < 1) {
|
|
4155
|
-
require_logger.
|
|
4132
|
+
require_logger.logger.info(chalk.default.red("No prompts selected. Aborting."));
|
|
4156
4133
|
process.exitCode = 1;
|
|
4157
4134
|
return this.evalRecord;
|
|
4158
4135
|
}
|
|
4159
4136
|
}
|
|
4160
4137
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4161
|
-
if (require_logger.
|
|
4162
|
-
require_logger.
|
|
4138
|
+
if (require_logger.state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4139
|
+
require_logger.logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4163
4140
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4164
4141
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4165
4142
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4197,7 +4174,7 @@ var Evaluator = class {
|
|
|
4197
4174
|
await this.evalRecord.addPrompts(prompts);
|
|
4198
4175
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4199
4176
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4200
|
-
require_telemetry.
|
|
4177
|
+
require_telemetry.telemetry.record("feature_used", { feature: "scenarios" });
|
|
4201
4178
|
let scenarioIndex = 0;
|
|
4202
4179
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4203
4180
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4261,7 +4238,7 @@ var Evaluator = class {
|
|
|
4261
4238
|
}
|
|
4262
4239
|
const runEvalOptions = [];
|
|
4263
4240
|
let testIdx = 0;
|
|
4264
|
-
let concurrency = options.maxConcurrency ||
|
|
4241
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4265
4242
|
for (let index = 0; index < tests.length; index++) {
|
|
4266
4243
|
const testCase = tests[index];
|
|
4267
4244
|
require_invariant.invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4281,7 +4258,7 @@ var Evaluator = class {
|
|
|
4281
4258
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4282
4259
|
if (require_types.isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4283
4260
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4284
|
-
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-
|
|
4261
|
+
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-zyB6k_38.cjs"));
|
|
4285
4262
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4286
4263
|
} else testCase.provider = defaultProvider;
|
|
4287
4264
|
}
|
|
@@ -4308,7 +4285,7 @@ var Evaluator = class {
|
|
|
4308
4285
|
const promptId = require_utils.generateIdFromPrompt(prompt);
|
|
4309
4286
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4310
4287
|
if (promptIdx === void 0) {
|
|
4311
|
-
require_logger.
|
|
4288
|
+
require_logger.logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4312
4289
|
continue;
|
|
4313
4290
|
}
|
|
4314
4291
|
runEvalOptions.push({
|
|
@@ -4331,7 +4308,7 @@ var Evaluator = class {
|
|
|
4331
4308
|
options: testOptions
|
|
4332
4309
|
};
|
|
4333
4310
|
const tracingEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4334
|
-
require_logger.
|
|
4311
|
+
require_logger.logger.debug(`[Evaluator] Tracing check: env=${require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4335
4312
|
if (tracingEnabled) return {
|
|
4336
4313
|
...baseTest,
|
|
4337
4314
|
metadata: {
|
|
@@ -4364,27 +4341,27 @@ var Evaluator = class {
|
|
|
4364
4341
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4365
4342
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4366
4343
|
}
|
|
4367
|
-
if (require_logger.
|
|
4368
|
-
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-
|
|
4369
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.
|
|
4344
|
+
if (require_logger.state.resume && this.evalRecord.persisted) try {
|
|
4345
|
+
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-71lY93Kj.cjs"));
|
|
4346
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.state.retryMode });
|
|
4370
4347
|
const originalCount = runEvalOptions.length;
|
|
4371
4348
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4372
4349
|
const step = runEvalOptions[i];
|
|
4373
4350
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4374
4351
|
}
|
|
4375
4352
|
const skipped = originalCount - runEvalOptions.length;
|
|
4376
|
-
if (skipped > 0) require_logger.
|
|
4353
|
+
if (skipped > 0) require_logger.logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4377
4354
|
} catch (err) {
|
|
4378
|
-
require_logger.
|
|
4355
|
+
require_logger.logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4379
4356
|
}
|
|
4380
4357
|
if (concurrency > 1) {
|
|
4381
4358
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4382
4359
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4383
4360
|
if (usesConversation) {
|
|
4384
|
-
require_logger.
|
|
4361
|
+
require_logger.logger.info(`Setting concurrency to 1 because the ${chalk.default.cyan("_conversation")} variable is used.`);
|
|
4385
4362
|
concurrency = 1;
|
|
4386
4363
|
} else if (usesStoreOutputAs) {
|
|
4387
|
-
require_logger.
|
|
4364
|
+
require_logger.logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4388
4365
|
concurrency = 1;
|
|
4389
4366
|
}
|
|
4390
4367
|
}
|
|
@@ -4415,14 +4392,14 @@ var Evaluator = class {
|
|
|
4415
4392
|
await this.evalRecord.addResult(row);
|
|
4416
4393
|
} catch (error) {
|
|
4417
4394
|
const resultSummary = require_logger.summarizeEvaluateResultForLogging(row);
|
|
4418
|
-
require_logger.
|
|
4395
|
+
require_logger.logger.error(`Error saving result: ${error} ${require_logger.safeJsonStringify(resultSummary)}`);
|
|
4419
4396
|
}
|
|
4420
4397
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4421
4398
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4422
4399
|
if (typeof httpStatus === "number" && require_cache.isNonTransientHttpStatus(httpStatus)) {
|
|
4423
4400
|
targetUnavailable = true;
|
|
4424
4401
|
targetErrorStatus = httpStatus;
|
|
4425
|
-
require_logger.
|
|
4402
|
+
require_logger.logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4426
4403
|
targetErrorAbortController.abort();
|
|
4427
4404
|
break;
|
|
4428
4405
|
}
|
|
@@ -4442,7 +4419,7 @@ var Evaluator = class {
|
|
|
4442
4419
|
if (testSuite.derivedMetrics) {
|
|
4443
4420
|
const math = await import("mathjs");
|
|
4444
4421
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4445
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) require_logger.
|
|
4422
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) require_logger.logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4446
4423
|
const evalContext = {
|
|
4447
4424
|
...metrics.namedScores,
|
|
4448
4425
|
__count: promptEvalCount
|
|
@@ -4457,7 +4434,7 @@ var Evaluator = class {
|
|
|
4457
4434
|
}
|
|
4458
4435
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4459
4436
|
} catch (error) {
|
|
4460
|
-
require_logger.
|
|
4437
|
+
require_logger.logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4461
4438
|
}
|
|
4462
4439
|
}
|
|
4463
4440
|
}
|
|
@@ -4496,7 +4473,7 @@ var Evaluator = class {
|
|
|
4496
4473
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4497
4474
|
evalStep.provider.cleanup();
|
|
4498
4475
|
} catch (cleanupErr) {
|
|
4499
|
-
require_logger.
|
|
4476
|
+
require_logger.logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4500
4477
|
}
|
|
4501
4478
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4502
4479
|
}, timeoutMs);
|
|
@@ -4560,8 +4537,8 @@ var Evaluator = class {
|
|
|
4560
4537
|
}
|
|
4561
4538
|
};
|
|
4562
4539
|
const originalProgressCallback = this.options.progressCallback;
|
|
4563
|
-
const isWebUI = Boolean(require_logger.
|
|
4564
|
-
require_logger.
|
|
4540
|
+
const isWebUI = Boolean(require_logger.state.webUI);
|
|
4541
|
+
require_logger.logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4565
4542
|
if (require_logger.isCI() && !isWebUI) {
|
|
4566
4543
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4567
4544
|
ciProgressReporter.start();
|
|
@@ -4571,20 +4548,20 @@ var Evaluator = class {
|
|
|
4571
4548
|
if (isWebUI) {
|
|
4572
4549
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4573
4550
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4574
|
-
require_logger.
|
|
4551
|
+
require_logger.logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4575
4552
|
} else if (progressBarManager) {
|
|
4576
4553
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4577
4554
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4578
4555
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4579
|
-
else require_logger.
|
|
4556
|
+
else require_logger.logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4580
4557
|
};
|
|
4581
4558
|
const serialRunEvalOptions = [];
|
|
4582
4559
|
const concurrentRunEvalOptions = [];
|
|
4583
4560
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4584
4561
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4585
4562
|
if (!this.options.silent) {
|
|
4586
|
-
if (serialRunEvalOptions.length > 0) require_logger.
|
|
4587
|
-
if (concurrentRunEvalOptions.length > 0) require_logger.
|
|
4563
|
+
if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4564
|
+
if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4588
4565
|
}
|
|
4589
4566
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4590
4567
|
try {
|
|
@@ -4593,7 +4570,7 @@ var Evaluator = class {
|
|
|
4593
4570
|
if (isWebUI) {
|
|
4594
4571
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4595
4572
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4596
|
-
require_logger.
|
|
4573
|
+
require_logger.logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4597
4574
|
}
|
|
4598
4575
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4599
4576
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4608,9 +4585,9 @@ var Evaluator = class {
|
|
|
4608
4585
|
});
|
|
4609
4586
|
} catch (err) {
|
|
4610
4587
|
if (combinedAbortSignal.aborted) {
|
|
4611
|
-
if (evalTimedOut) require_logger.
|
|
4588
|
+
if (evalTimedOut) require_logger.logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4612
4589
|
else if (!targetUnavailable) {
|
|
4613
|
-
require_logger.
|
|
4590
|
+
require_logger.logger.info("Evaluation interrupted, saving progress...");
|
|
4614
4591
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4615
4592
|
if (progressBarManager) progressBarManager.stop();
|
|
4616
4593
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4640,10 +4617,10 @@ var Evaluator = class {
|
|
|
4640
4617
|
let compareCount = 0;
|
|
4641
4618
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4642
4619
|
compareCount++;
|
|
4643
|
-
if (isWebUI) require_logger.
|
|
4620
|
+
if (isWebUI) require_logger.logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4644
4621
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4645
4622
|
if (resultsToCompare.length === 0) {
|
|
4646
|
-
require_logger.
|
|
4623
|
+
require_logger.logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4647
4624
|
continue;
|
|
4648
4625
|
}
|
|
4649
4626
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4705,16 +4682,16 @@ var Evaluator = class {
|
|
|
4705
4682
|
}
|
|
4706
4683
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4707
4684
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4708
|
-
else if (!isWebUI) require_logger.
|
|
4685
|
+
else if (!isWebUI) require_logger.logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4709
4686
|
}
|
|
4710
4687
|
}
|
|
4711
4688
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4712
4689
|
if (maxScoreRowsCount > 0) {
|
|
4713
|
-
require_logger.
|
|
4690
|
+
require_logger.logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4714
4691
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4715
4692
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4716
4693
|
if (resultsToCompare.length === 0) {
|
|
4717
|
-
require_logger.
|
|
4694
|
+
require_logger.logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4718
4695
|
continue;
|
|
4719
4696
|
}
|
|
4720
4697
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4722,7 +4699,7 @@ var Evaluator = class {
|
|
|
4722
4699
|
const maxScoreGradingResults = await require_graders.selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4723
4700
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4724
4701
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4725
|
-
else if (!isWebUI) require_logger.
|
|
4702
|
+
else if (!isWebUI) require_logger.logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4726
4703
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4727
4704
|
const result = resultsToCompare[index];
|
|
4728
4705
|
const maxScoreGradingResult = {
|
|
@@ -4766,7 +4743,7 @@ var Evaluator = class {
|
|
|
4766
4743
|
progressBarManager.stop();
|
|
4767
4744
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4768
4745
|
} catch (cleanupErr) {
|
|
4769
|
-
require_logger.
|
|
4746
|
+
require_logger.logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4770
4747
|
}
|
|
4771
4748
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4772
4749
|
if (evalTimedOut) {
|
|
@@ -4839,7 +4816,7 @@ var Evaluator = class {
|
|
|
4839
4816
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4840
4817
|
})));
|
|
4841
4818
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === require_types.ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4842
|
-
require_telemetry.
|
|
4819
|
+
require_telemetry.telemetry.record("eval_ran", {
|
|
4843
4820
|
numPrompts: prompts.length,
|
|
4844
4821
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4845
4822
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4887,26 +4864,26 @@ var Evaluator = class {
|
|
|
4887
4864
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4888
4865
|
const tracingEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4889
4866
|
if (tracingEnabled) {
|
|
4890
|
-
require_logger.
|
|
4867
|
+
require_logger.logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4891
4868
|
initializeOtel(getDefaultOtelConfig());
|
|
4892
4869
|
}
|
|
4893
4870
|
try {
|
|
4894
4871
|
return await this._runEvaluation();
|
|
4895
4872
|
} finally {
|
|
4896
4873
|
if (tracingEnabled) {
|
|
4897
|
-
require_logger.
|
|
4874
|
+
require_logger.logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4898
4875
|
await flushOtel();
|
|
4899
4876
|
await shutdownOtel();
|
|
4900
4877
|
}
|
|
4901
4878
|
if (isOtlpReceiverStarted()) {
|
|
4902
|
-
require_logger.
|
|
4879
|
+
require_logger.logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4903
4880
|
await require_fetch.sleep(3e3);
|
|
4904
4881
|
}
|
|
4905
4882
|
await stopOtlpReceiverIfNeeded();
|
|
4906
4883
|
await require_providerRegistry.providerRegistry.shutdownAll();
|
|
4907
4884
|
if (this.rateLimitRegistry) {
|
|
4908
4885
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4909
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) require_logger.
|
|
4886
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) require_logger.logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4910
4887
|
totalRequests: m.totalRequests,
|
|
4911
4888
|
completedRequests: m.completedRequests,
|
|
4912
4889
|
failedRequests: m.failedRequests,
|
|
@@ -4919,14 +4896,13 @@ var Evaluator = class {
|
|
|
4919
4896
|
}
|
|
4920
4897
|
this.rateLimitRegistry?.dispose();
|
|
4921
4898
|
require_providers.redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4922
|
-
require_logger.
|
|
4899
|
+
require_logger.state.maxConcurrency = void 0;
|
|
4923
4900
|
}
|
|
4924
4901
|
}
|
|
4925
4902
|
};
|
|
4926
4903
|
function evaluate$1(testSuite, evalRecord, options) {
|
|
4927
4904
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4928
4905
|
}
|
|
4929
|
-
|
|
4930
4906
|
//#endregion
|
|
4931
4907
|
//#region src/guardrails.ts
|
|
4932
4908
|
const API_BASE_URL = `${require_fetch.getShareApiBaseUrl()}/v1`;
|
|
@@ -4940,7 +4916,7 @@ async function makeRequest(endpoint, input) {
|
|
|
4940
4916
|
if (!response.data) throw new Error("No data returned from API");
|
|
4941
4917
|
return response.data;
|
|
4942
4918
|
} catch (error) {
|
|
4943
|
-
require_logger.
|
|
4919
|
+
require_logger.logger.error(`Guardrails API error: ${error}`);
|
|
4944
4920
|
throw error;
|
|
4945
4921
|
}
|
|
4946
4922
|
}
|
|
@@ -4957,7 +4933,7 @@ async function makeAdaptiveRequest(request) {
|
|
|
4957
4933
|
if (!response.data) throw new Error("No data returned from API");
|
|
4958
4934
|
return response.data;
|
|
4959
4935
|
} catch (error) {
|
|
4960
|
-
require_logger.
|
|
4936
|
+
require_logger.logger.error(`Guardrails API error: ${error}`);
|
|
4961
4937
|
throw error;
|
|
4962
4938
|
}
|
|
4963
4939
|
}
|
|
@@ -4975,8 +4951,6 @@ const guardrails = {
|
|
|
4975
4951
|
return makeAdaptiveRequest(request);
|
|
4976
4952
|
}
|
|
4977
4953
|
};
|
|
4978
|
-
var guardrails_default = guardrails;
|
|
4979
|
-
|
|
4980
4954
|
//#endregion
|
|
4981
4955
|
//#region src/migrate.ts
|
|
4982
4956
|
/**
|
|
@@ -5011,18 +4985,17 @@ async function runDbMigrations() {
|
|
|
5011
4985
|
const projectRoot = dir.split("dist/server/src")[0];
|
|
5012
4986
|
migrationsFolder = path.join(projectRoot, "dist", "promptfoo", "drizzle");
|
|
5013
4987
|
} else migrationsFolder = path.join(dir, "..", "drizzle");
|
|
5014
|
-
require_logger.
|
|
4988
|
+
require_logger.logger.debug(`Running database migrations from: ${migrationsFolder}`);
|
|
5015
4989
|
(0, drizzle_orm_better_sqlite3_migrator.migrate)(db, { migrationsFolder });
|
|
5016
|
-
require_logger.
|
|
4990
|
+
require_logger.logger.debug("Database migrations completed");
|
|
5017
4991
|
resolve();
|
|
5018
4992
|
} catch (error) {
|
|
5019
|
-
require_logger.
|
|
4993
|
+
require_logger.logger.error(`Database migration failed: ${error}`);
|
|
5020
4994
|
reject(error);
|
|
5021
4995
|
}
|
|
5022
4996
|
});
|
|
5023
4997
|
});
|
|
5024
4998
|
}
|
|
5025
|
-
|
|
5026
4999
|
//#endregion
|
|
5027
5000
|
//#region src/redteam/sharedFrontend.ts
|
|
5028
5001
|
function getRiskCategorySeverityMap(plugins) {
|
|
@@ -5039,7 +5012,6 @@ function getRiskCategorySeverityMap(plugins) {
|
|
|
5039
5012
|
...overrides
|
|
5040
5013
|
};
|
|
5041
5014
|
}
|
|
5042
|
-
|
|
5043
5015
|
//#endregion
|
|
5044
5016
|
//#region src/util/calculateFilteredMetrics.ts
|
|
5045
5017
|
/**
|
|
@@ -5093,12 +5065,12 @@ async function calculateFilteredMetrics(opts) {
|
|
|
5093
5065
|
try {
|
|
5094
5066
|
const countResult = await getResultCount(whereSql);
|
|
5095
5067
|
if (countResult > MAX_RESULTS_FOR_METRICS) {
|
|
5096
|
-
require_logger.
|
|
5068
|
+
require_logger.logger.warn(`Filtered result count ${countResult} exceeds limit ${MAX_RESULTS_FOR_METRICS}`, { evalId: opts.evalId });
|
|
5097
5069
|
throw new Error(`Result count ${countResult} exceeds maximum ${MAX_RESULTS_FOR_METRICS}`);
|
|
5098
5070
|
}
|
|
5099
5071
|
return await calculateWithOptimizedQuery(opts);
|
|
5100
5072
|
} catch (error) {
|
|
5101
|
-
require_logger.
|
|
5073
|
+
require_logger.logger.error("Failed to calculate filtered metrics with optimized query", { error });
|
|
5102
5074
|
return createEmptyMetricsArray(numPrompts);
|
|
5103
5075
|
}
|
|
5104
5076
|
}
|
|
@@ -5151,7 +5123,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5151
5123
|
for (const row of basicResults) {
|
|
5152
5124
|
const idx = row.prompt_idx;
|
|
5153
5125
|
if (idx < 0 || idx >= numPrompts) {
|
|
5154
|
-
require_logger.
|
|
5126
|
+
require_logger.logger.warn(`Invalid prompt_idx ${idx}, expected 0-${numPrompts - 1}`);
|
|
5155
5127
|
continue;
|
|
5156
5128
|
}
|
|
5157
5129
|
metrics[idx] = {
|
|
@@ -5176,7 +5148,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5176
5148
|
}
|
|
5177
5149
|
await aggregateNamedScores(metrics, whereSql);
|
|
5178
5150
|
await aggregateAssertions(metrics, whereSql);
|
|
5179
|
-
require_logger.
|
|
5151
|
+
require_logger.logger.debug("Filtered metrics calculated", {
|
|
5180
5152
|
numPrompts,
|
|
5181
5153
|
metricsCount: basicResults.length
|
|
5182
5154
|
});
|
|
@@ -5297,7 +5269,6 @@ function createEmptyMetricsArray(numPrompts) {
|
|
|
5297
5269
|
cost: 0
|
|
5298
5270
|
}));
|
|
5299
5271
|
}
|
|
5300
|
-
|
|
5301
5272
|
//#endregion
|
|
5302
5273
|
//#region src/util/convertEvalResultsToTable.ts
|
|
5303
5274
|
/**
|
|
@@ -5430,7 +5401,6 @@ function convertResultsToTable(eval_) {
|
|
|
5430
5401
|
body: rows
|
|
5431
5402
|
};
|
|
5432
5403
|
}
|
|
5433
|
-
|
|
5434
5404
|
//#endregion
|
|
5435
5405
|
//#region src/util/exportToFile/index.ts
|
|
5436
5406
|
function convertEvalResultToTableCell(result) {
|
|
@@ -5508,7 +5478,6 @@ function convertTestResultsToTableRow(results, varsForHeader) {
|
|
|
5508
5478
|
for (const result of results) row.outputs[result.promptIdx] = convertEvalResultToTableCell(result);
|
|
5509
5479
|
return row;
|
|
5510
5480
|
}
|
|
5511
|
-
|
|
5512
5481
|
//#endregion
|
|
5513
5482
|
//#region src/models/evalPerformance.ts
|
|
5514
5483
|
const distinctCountCache = /* @__PURE__ */ new Map();
|
|
@@ -5525,7 +5494,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5525
5494
|
const cacheKey = `distinct:${evalId}`;
|
|
5526
5495
|
const cached = distinctCountCache.get(cacheKey);
|
|
5527
5496
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5528
|
-
require_logger.
|
|
5497
|
+
require_logger.logger.debug(`Using cached distinct count for eval ${evalId}: ${cached.count}`);
|
|
5529
5498
|
return cached.count;
|
|
5530
5499
|
}
|
|
5531
5500
|
const db = require_tables.getDb();
|
|
@@ -5533,7 +5502,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5533
5502
|
const result = db.select({ count: drizzle_orm.sql`COUNT(DISTINCT test_idx)` }).from(require_tables.evalResultsTable).where(drizzle_orm.sql`eval_id = ${evalId}`).all();
|
|
5534
5503
|
const count = Number(result[0]?.count ?? 0);
|
|
5535
5504
|
const duration = Date.now() - start;
|
|
5536
|
-
require_logger.
|
|
5505
|
+
require_logger.logger.debug(`Distinct count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5537
5506
|
distinctCountCache.set(cacheKey, {
|
|
5538
5507
|
count,
|
|
5539
5508
|
timestamp: Date.now()
|
|
@@ -5551,7 +5520,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5551
5520
|
const cacheKey = `total:${evalId}`;
|
|
5552
5521
|
const cached = totalRowCountCache.get(cacheKey);
|
|
5553
5522
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5554
|
-
require_logger.
|
|
5523
|
+
require_logger.logger.debug(`Using cached total row count for eval ${evalId}: ${cached.count}`);
|
|
5555
5524
|
return cached.count;
|
|
5556
5525
|
}
|
|
5557
5526
|
const db = require_tables.getDb();
|
|
@@ -5559,7 +5528,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5559
5528
|
const result = db.select({ count: drizzle_orm.sql`COUNT(*)` }).from(require_tables.evalResultsTable).where(drizzle_orm.sql`eval_id = ${evalId}`).all();
|
|
5560
5529
|
const count = Number(result[0]?.count ?? 0);
|
|
5561
5530
|
const duration = Date.now() - start;
|
|
5562
|
-
require_logger.
|
|
5531
|
+
require_logger.logger.debug(`Total row count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5563
5532
|
totalRowCountCache.set(cacheKey, {
|
|
5564
5533
|
count,
|
|
5565
5534
|
timestamp: Date.now()
|
|
@@ -5592,7 +5561,7 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5592
5561
|
`;
|
|
5593
5562
|
const countResult = db.all(countQuery);
|
|
5594
5563
|
const filteredCount = Number(countResult[0]?.count ?? 0);
|
|
5595
|
-
require_logger.
|
|
5564
|
+
require_logger.logger.debug(`Optimized count query took ${Date.now() - countStart}ms`);
|
|
5596
5565
|
const idxStart = Date.now();
|
|
5597
5566
|
const idxQuery = drizzle_orm.sql`
|
|
5598
5567
|
SELECT DISTINCT test_idx
|
|
@@ -5603,13 +5572,12 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5603
5572
|
OFFSET ${offset}
|
|
5604
5573
|
`;
|
|
5605
5574
|
const testIndices = db.all(idxQuery).map((row) => row.test_idx);
|
|
5606
|
-
require_logger.
|
|
5575
|
+
require_logger.logger.debug(`Optimized index query took ${Date.now() - idxStart}ms`);
|
|
5607
5576
|
return {
|
|
5608
5577
|
testIndices,
|
|
5609
5578
|
filteredCount
|
|
5610
5579
|
};
|
|
5611
5580
|
}
|
|
5612
|
-
|
|
5613
5581
|
//#endregion
|
|
5614
5582
|
//#region src/models/eval.ts
|
|
5615
5583
|
/**
|
|
@@ -5704,7 +5672,7 @@ var EvalQueries = class {
|
|
|
5704
5672
|
try {
|
|
5705
5673
|
db.update(require_tables.evalsTable).set({ vars }).where((0, drizzle_orm.eq)(require_tables.evalsTable.id, evalId)).run();
|
|
5706
5674
|
} catch (e) {
|
|
5707
|
-
require_logger.
|
|
5675
|
+
require_logger.logger.error(`Error setting vars: ${vars} for eval ${evalId}: ${e}`);
|
|
5708
5676
|
}
|
|
5709
5677
|
}
|
|
5710
5678
|
static async getMetadataKeysFromEval(evalId, comparisonEvalIds = []) {
|
|
@@ -5725,7 +5693,7 @@ var EvalQueries = class {
|
|
|
5725
5693
|
`;
|
|
5726
5694
|
return (await db.all(query)).map((r) => r.key);
|
|
5727
5695
|
} catch (error) {
|
|
5728
|
-
require_logger.
|
|
5696
|
+
require_logger.logger.error(`Error fetching metadata keys for eval ${evalId} and comparisons [${comparisonEvalIds.join(", ")}]: ${error}`);
|
|
5729
5697
|
return [];
|
|
5730
5698
|
}
|
|
5731
5699
|
}
|
|
@@ -5756,7 +5724,7 @@ var EvalQueries = class {
|
|
|
5756
5724
|
const values = db.all(query).map(({ value }) => String(value).trim()).filter((value) => value.length > 0);
|
|
5757
5725
|
return Array.from(new Set(values));
|
|
5758
5726
|
} catch (error) {
|
|
5759
|
-
require_logger.
|
|
5727
|
+
require_logger.logger.error(`Error fetching metadata values for eval ${evalId} and key ${trimmedKey}: ${error instanceof Error ? error.message : String(error)}`);
|
|
5760
5728
|
return [];
|
|
5761
5729
|
}
|
|
5762
5730
|
}
|
|
@@ -5828,7 +5796,7 @@ var Eval = class Eval {
|
|
|
5828
5796
|
}
|
|
5829
5797
|
return evalInstance;
|
|
5830
5798
|
}
|
|
5831
|
-
static async getMany(limit =
|
|
5799
|
+
static async getMany(limit = 100) {
|
|
5832
5800
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).limit(limit).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
5833
5801
|
id: e.id,
|
|
5834
5802
|
createdAt: new Date(e.createdAt),
|
|
@@ -5843,7 +5811,7 @@ var Eval = class Eval {
|
|
|
5843
5811
|
* @param offset - Number of evals to skip
|
|
5844
5812
|
* @param limit - Maximum number of evals to return
|
|
5845
5813
|
*/
|
|
5846
|
-
static async getPaginated(offset = 0, limit =
|
|
5814
|
+
static async getPaginated(offset = 0, limit = 100) {
|
|
5847
5815
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
5848
5816
|
id: e.id,
|
|
5849
5817
|
createdAt: new Date(e.createdAt),
|
|
@@ -5889,7 +5857,7 @@ var Eval = class Eval {
|
|
|
5889
5857
|
evalId,
|
|
5890
5858
|
promptId
|
|
5891
5859
|
}).onConflictDoNothing().run();
|
|
5892
|
-
require_logger.
|
|
5860
|
+
require_logger.logger.debug(`Inserting prompt ${promptId}`);
|
|
5893
5861
|
}
|
|
5894
5862
|
if (opts?.results && opts.results.length > 0) {
|
|
5895
5863
|
const res = db.insert(require_tables.evalResultsTable).values(opts.results?.map((r) => ({
|
|
@@ -5897,7 +5865,7 @@ var Eval = class Eval {
|
|
|
5897
5865
|
evalId,
|
|
5898
5866
|
id: crypto.randomUUID()
|
|
5899
5867
|
}))).run();
|
|
5900
|
-
require_logger.
|
|
5868
|
+
require_logger.logger.debug(`Inserted ${res.changes} eval results`);
|
|
5901
5869
|
}
|
|
5902
5870
|
db.insert(require_tables.datasetsTable).values({
|
|
5903
5871
|
id: datasetId,
|
|
@@ -5907,7 +5875,7 @@ var Eval = class Eval {
|
|
|
5907
5875
|
evalId,
|
|
5908
5876
|
datasetId
|
|
5909
5877
|
}).onConflictDoNothing().run();
|
|
5910
|
-
require_logger.
|
|
5878
|
+
require_logger.logger.debug(`Inserting dataset ${datasetId}`);
|
|
5911
5879
|
if (config.tags) for (const [tagKey, tagValue] of Object.entries(config.tags)) {
|
|
5912
5880
|
const tagId = require_createHash.sha256(`${tagKey}:${tagValue}`);
|
|
5913
5881
|
db.insert(require_tables.tagsTable).values({
|
|
@@ -5919,7 +5887,7 @@ var Eval = class Eval {
|
|
|
5919
5887
|
evalId,
|
|
5920
5888
|
tagId
|
|
5921
5889
|
}).onConflictDoNothing().run();
|
|
5922
|
-
require_logger.
|
|
5890
|
+
require_logger.logger.debug(`Inserting tag ${tagId}`);
|
|
5923
5891
|
}
|
|
5924
5892
|
});
|
|
5925
5893
|
return new Eval(config, {
|
|
@@ -6100,7 +6068,7 @@ var Eval = class Eval {
|
|
|
6100
6068
|
if (type === "metric") {
|
|
6101
6069
|
const metricKey = field || value;
|
|
6102
6070
|
if (!metricKey) {
|
|
6103
|
-
require_logger.
|
|
6071
|
+
require_logger.logger.warn("Invalid metric filter: missing field and value", { filter });
|
|
6104
6072
|
return;
|
|
6105
6073
|
}
|
|
6106
6074
|
const jsonPath = buildSafeJsonPath(metricKey);
|
|
@@ -6114,7 +6082,7 @@ var Eval = class Eval {
|
|
|
6114
6082
|
else if (operator === "lt") condition = drizzle_orm.sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) < ${numericValue}`;
|
|
6115
6083
|
else if (operator === "lte") condition = drizzle_orm.sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) <= ${numericValue}`;
|
|
6116
6084
|
} else {
|
|
6117
|
-
require_logger.
|
|
6085
|
+
require_logger.logger.warn("Invalid numeric value in metric filter", {
|
|
6118
6086
|
metricKey,
|
|
6119
6087
|
value,
|
|
6120
6088
|
numericValue,
|
|
@@ -6192,7 +6160,7 @@ var Eval = class Eval {
|
|
|
6192
6160
|
const countStart = Date.now();
|
|
6193
6161
|
const countResult = await db.get(filteredCountQuery);
|
|
6194
6162
|
const countEnd = Date.now();
|
|
6195
|
-
require_logger.
|
|
6163
|
+
require_logger.logger.debug(`Count query took ${countEnd - countStart}ms`);
|
|
6196
6164
|
const filteredCount = countResult?.count || 0;
|
|
6197
6165
|
const idxQuery = drizzle_orm.sql`
|
|
6198
6166
|
SELECT DISTINCT test_idx
|
|
@@ -6205,7 +6173,7 @@ var Eval = class Eval {
|
|
|
6205
6173
|
const idxStart = Date.now();
|
|
6206
6174
|
const rows = await db.all(idxQuery);
|
|
6207
6175
|
const idxEnd = Date.now();
|
|
6208
|
-
require_logger.
|
|
6176
|
+
require_logger.logger.debug(`Index query took ${idxEnd - idxStart}ms`);
|
|
6209
6177
|
return {
|
|
6210
6178
|
testIndices: rows.map((row) => row.test_idx),
|
|
6211
6179
|
filteredCount
|
|
@@ -6241,7 +6209,7 @@ var Eval = class Eval {
|
|
|
6241
6209
|
const hasComplexFilters = opts.filters && opts.filters.length > 0;
|
|
6242
6210
|
let queryResult;
|
|
6243
6211
|
if (hasComplexFilters) {
|
|
6244
|
-
require_logger.
|
|
6212
|
+
require_logger.logger.debug("Using original query for complex filters");
|
|
6245
6213
|
queryResult = await this.queryTestIndices({
|
|
6246
6214
|
offset: opts.offset,
|
|
6247
6215
|
limit: opts.limit,
|
|
@@ -6250,7 +6218,7 @@ var Eval = class Eval {
|
|
|
6250
6218
|
filters: opts.filters
|
|
6251
6219
|
});
|
|
6252
6220
|
} else {
|
|
6253
|
-
require_logger.
|
|
6221
|
+
require_logger.logger.debug("Using optimized query for table page");
|
|
6254
6222
|
queryResult = await queryTestIndicesOptimized(this.id, {
|
|
6255
6223
|
offset: opts.offset,
|
|
6256
6224
|
limit: opts.limit,
|
|
@@ -6265,12 +6233,12 @@ var Eval = class Eval {
|
|
|
6265
6233
|
const varsStart = Date.now();
|
|
6266
6234
|
const vars = Array.from(this.vars);
|
|
6267
6235
|
const varsEnd = Date.now();
|
|
6268
|
-
require_logger.
|
|
6236
|
+
require_logger.logger.debug(`Vars query took ${varsEnd - varsStart}ms`);
|
|
6269
6237
|
const body = [];
|
|
6270
6238
|
const bodyStart = Date.now();
|
|
6271
6239
|
if (testIndices.length === 0) {
|
|
6272
6240
|
const bodyEnd = Date.now();
|
|
6273
|
-
require_logger.
|
|
6241
|
+
require_logger.logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6274
6242
|
return {
|
|
6275
6243
|
head: {
|
|
6276
6244
|
prompts: this.prompts,
|
|
@@ -6302,7 +6270,7 @@ var Eval = class Eval {
|
|
|
6302
6270
|
if (results.length > 0) body.push(convertTestResultsToTableRow(results, vars));
|
|
6303
6271
|
}
|
|
6304
6272
|
const bodyEnd = Date.now();
|
|
6305
|
-
require_logger.
|
|
6273
|
+
require_logger.logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6306
6274
|
return {
|
|
6307
6275
|
head: {
|
|
6308
6276
|
prompts: this.prompts,
|
|
@@ -6415,7 +6383,7 @@ var Eval = class Eval {
|
|
|
6415
6383
|
})
|
|
6416
6384
|
}));
|
|
6417
6385
|
} catch (error) {
|
|
6418
|
-
require_logger.
|
|
6386
|
+
require_logger.logger.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
|
|
6419
6387
|
return [];
|
|
6420
6388
|
}
|
|
6421
6389
|
}
|
|
@@ -6452,7 +6420,7 @@ var Eval = class Eval {
|
|
|
6452
6420
|
const newEvalId = createEvalId(/* @__PURE__ */ new Date());
|
|
6453
6421
|
const copyDescription = description || `${this.description || "Evaluation"} (Copy)`;
|
|
6454
6422
|
const testCount = distinctTestCount ?? await this.getResultsCount();
|
|
6455
|
-
require_logger.
|
|
6423
|
+
require_logger.logger.info("Starting eval copy", {
|
|
6456
6424
|
sourceEvalId: this.id,
|
|
6457
6425
|
targetEvalId: newEvalId,
|
|
6458
6426
|
distinctTestCount: testCount
|
|
@@ -6515,7 +6483,7 @@ var Eval = class Eval {
|
|
|
6515
6483
|
db.insert(require_tables.evalResultsTable).values(copiedResults).run();
|
|
6516
6484
|
copiedCount += batch.length;
|
|
6517
6485
|
offset += BATCH_SIZE;
|
|
6518
|
-
require_logger.
|
|
6486
|
+
require_logger.logger.debug("Copied batch of eval results", {
|
|
6519
6487
|
sourceEvalId: this.id,
|
|
6520
6488
|
targetEvalId: newEvalId,
|
|
6521
6489
|
batchSize: batch.length,
|
|
@@ -6524,7 +6492,7 @@ var Eval = class Eval {
|
|
|
6524
6492
|
});
|
|
6525
6493
|
}
|
|
6526
6494
|
});
|
|
6527
|
-
require_logger.
|
|
6495
|
+
require_logger.logger.info("Eval copy completed successfully", {
|
|
6528
6496
|
sourceEvalId: this.id,
|
|
6529
6497
|
targetEvalId: newEvalId,
|
|
6530
6498
|
rowsCopied: copiedCount,
|
|
@@ -6539,7 +6507,6 @@ var Eval = class Eval {
|
|
|
6539
6507
|
this._shared = shared;
|
|
6540
6508
|
}
|
|
6541
6509
|
};
|
|
6542
|
-
|
|
6543
6510
|
//#endregion
|
|
6544
6511
|
//#region src/assertions/validateAssertions.ts
|
|
6545
6512
|
var AssertValidationError = class extends Error {
|
|
@@ -6591,7 +6558,6 @@ function validateAssertions(tests, defaultTest) {
|
|
|
6591
6558
|
}
|
|
6592
6559
|
}
|
|
6593
6560
|
}
|
|
6594
|
-
|
|
6595
6561
|
//#endregion
|
|
6596
6562
|
//#region src/commands/eval/filterPrompts.ts
|
|
6597
6563
|
/**
|
|
@@ -6617,7 +6583,6 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
6617
6583
|
return promptId && filterRegex.test(promptId) || promptLabel && filterRegex.test(promptLabel);
|
|
6618
6584
|
});
|
|
6619
6585
|
}
|
|
6620
|
-
|
|
6621
6586
|
//#endregion
|
|
6622
6587
|
//#region src/commands/eval/filterProviders.ts
|
|
6623
6588
|
/**
|
|
@@ -6698,7 +6663,6 @@ function filterProviders(providers, filterProvidersOption) {
|
|
|
6698
6663
|
return filterRegex.test(providerId) || providerLabel && filterRegex.test(providerLabel);
|
|
6699
6664
|
});
|
|
6700
6665
|
}
|
|
6701
|
-
|
|
6702
6666
|
//#endregion
|
|
6703
6667
|
//#region src/commands/eval/filterTestsUtil.ts
|
|
6704
6668
|
/**
|
|
@@ -6726,35 +6690,35 @@ function mergeDefaultVars(test, defaultTest) {
|
|
|
6726
6690
|
*/
|
|
6727
6691
|
async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
6728
6692
|
if (!testSuite.tests) {
|
|
6729
|
-
require_logger.
|
|
6693
|
+
require_logger.logger.debug("[filterTestsByResults] No tests in test suite");
|
|
6730
6694
|
return [];
|
|
6731
6695
|
}
|
|
6732
|
-
require_logger.
|
|
6696
|
+
require_logger.logger.debug(`[filterTestsByResults] Loading results from: ${pathOrId}`);
|
|
6733
6697
|
let results;
|
|
6734
6698
|
try {
|
|
6735
6699
|
if (pathOrId.endsWith(".json")) results = (await require_util.readOutput(pathOrId)).results;
|
|
6736
6700
|
else {
|
|
6737
6701
|
const eval_ = await Eval.findById(pathOrId);
|
|
6738
6702
|
if (!eval_) {
|
|
6739
|
-
require_logger.
|
|
6703
|
+
require_logger.logger.warn(`[filterTestsByResults] Evaluation not found: ${pathOrId}`);
|
|
6740
6704
|
return [];
|
|
6741
6705
|
}
|
|
6742
6706
|
const summary = await eval_.toEvaluateSummary();
|
|
6743
6707
|
if ("results" in summary) results = { results: summary.results };
|
|
6744
6708
|
else {
|
|
6745
|
-
require_logger.
|
|
6709
|
+
require_logger.logger.debug("[filterTestsByResults] No results in evaluation summary");
|
|
6746
6710
|
return [];
|
|
6747
6711
|
}
|
|
6748
6712
|
}
|
|
6749
6713
|
} catch (error) {
|
|
6750
|
-
require_logger.
|
|
6714
|
+
require_logger.logger.warn(`[filterTestsByResults] Error loading results: ${error}`);
|
|
6751
6715
|
return [];
|
|
6752
6716
|
}
|
|
6753
6717
|
const filteredResults = results.results.filter(filterFn);
|
|
6754
|
-
require_logger.
|
|
6718
|
+
require_logger.logger.debug(`[filterTestsByResults] Found ${filteredResults.length} matching results out of ${results.results.length} total`);
|
|
6755
6719
|
if (filteredResults.length === 0) return [];
|
|
6756
6720
|
const uniqueVarsInResults = new Set(filteredResults.map((r) => JSON.stringify(require_util.filterRuntimeVars(r.vars))));
|
|
6757
|
-
require_logger.
|
|
6721
|
+
require_logger.logger.debug(`[filterTestsByResults] ${uniqueVarsInResults.size} unique test cases (by vars) in filtered results`);
|
|
6758
6722
|
const matchedTests = [];
|
|
6759
6723
|
for (const test of testSuite.tests) {
|
|
6760
6724
|
const testWithDefaults = mergeDefaultVars(test, testSuite.defaultTest);
|
|
@@ -6776,15 +6740,15 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6776
6740
|
...runtimeVars
|
|
6777
6741
|
}
|
|
6778
6742
|
};
|
|
6779
|
-
require_logger.
|
|
6743
|
+
require_logger.logger.debug("[filterTestsByResults] Restored runtime vars for test", { varKeys: Object.keys(runtimeVars) });
|
|
6780
6744
|
matchedTests.push(testWithRuntimeVars);
|
|
6781
6745
|
} else {
|
|
6782
|
-
require_logger.
|
|
6746
|
+
require_logger.logger.debug("[filterTestsByResults] Matched test has no runtime vars to restore");
|
|
6783
6747
|
matchedTests.push(test);
|
|
6784
6748
|
}
|
|
6785
6749
|
}
|
|
6786
6750
|
}
|
|
6787
|
-
require_logger.
|
|
6751
|
+
require_logger.logger.debug(`[filterTestsByResults] Matched ${matchedTests.length} tests out of ${testSuite.tests.length} in test suite`);
|
|
6788
6752
|
const extractedTests = [];
|
|
6789
6753
|
const matchedResultKeys = /* @__PURE__ */ new Set();
|
|
6790
6754
|
for (const result of filteredResults) for (const test of matchedTests) if (require_util.resultIsForTestCase(result, mergeDefaultVars(test, testSuite.defaultTest))) {
|
|
@@ -6795,7 +6759,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6795
6759
|
const resultKey = JSON.stringify(require_util.filterRuntimeVars(result.vars));
|
|
6796
6760
|
if (matchedResultKeys.has(resultKey)) continue;
|
|
6797
6761
|
if (!result.testCase) {
|
|
6798
|
-
require_logger.
|
|
6762
|
+
require_logger.logger.debug("[filterTestsByResults] Skipping result without testCase data for extraction");
|
|
6799
6763
|
continue;
|
|
6800
6764
|
}
|
|
6801
6765
|
if (extractedTests.some((t) => JSON.stringify(require_util.filterRuntimeVars(t.vars)) === resultKey)) continue;
|
|
@@ -6807,12 +6771,11 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6807
6771
|
options: result.testCase.options
|
|
6808
6772
|
});
|
|
6809
6773
|
}
|
|
6810
|
-
if (extractedTests.length > 0) require_logger.
|
|
6811
|
-
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) require_logger.
|
|
6812
|
-
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) require_logger.
|
|
6774
|
+
if (extractedTests.length > 0) require_logger.logger.info(`[filterTestsByResults] Extracted ${extractedTests.length} runtime-generated test(s) from results`);
|
|
6775
|
+
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) require_logger.logger.warn(`[filterTestsByResults] No tests matched ${filteredResults.length} filtered results. This may indicate a vars or provider mismatch between stored results and current test suite. Use LOG_LEVEL=debug for detailed matching info.`);
|
|
6776
|
+
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) require_logger.logger.debug(`[filterTestsByResults] Note: ${uniqueVarsInResults.size - matchedTests.length - extractedTests.length} unique test cases in results did not match any test in the current test suite and could not be extracted. This may indicate results without testCase data.`);
|
|
6813
6777
|
return require_util.deduplicateTestCases([...matchedTests, ...extractedTests]);
|
|
6814
6778
|
}
|
|
6815
|
-
|
|
6816
6779
|
//#endregion
|
|
6817
6780
|
//#region src/commands/eval/filterTests.ts
|
|
6818
6781
|
/**
|
|
@@ -6838,7 +6801,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6838
6801
|
* @param reason - Description of what the filter was looking for (e.g., 'no failures/errors')
|
|
6839
6802
|
*/
|
|
6840
6803
|
function logNoTestsWarning(filterType, pathOrId, reason) {
|
|
6841
|
-
require_logger.
|
|
6804
|
+
require_logger.logger.warn(`--${filterType} returned no tests. The evaluation "${pathOrId}" may have ${reason}, or the test suite may have changed since the evaluation was run.`);
|
|
6842
6805
|
}
|
|
6843
6806
|
/**
|
|
6844
6807
|
* Filters a test suite to only include all tests that did not pass (failures + errors)
|
|
@@ -6884,10 +6847,10 @@ async function filterErrorTests(testSuite, pathOrId) {
|
|
|
6884
6847
|
*/
|
|
6885
6848
|
async function filterTests(testSuite, options) {
|
|
6886
6849
|
let tests = testSuite.tests || [];
|
|
6887
|
-
require_logger.
|
|
6888
|
-
require_logger.
|
|
6850
|
+
require_logger.logger.debug(`Starting filterTests with options: ${JSON.stringify(options)}`);
|
|
6851
|
+
require_logger.logger.debug(`Initial test count: ${tests.length}`);
|
|
6889
6852
|
if (Object.keys(options).length === 0) {
|
|
6890
|
-
require_logger.
|
|
6853
|
+
require_logger.logger.debug("No filter options provided, returning all tests");
|
|
6891
6854
|
return tests;
|
|
6892
6855
|
}
|
|
6893
6856
|
if (options.metadata) {
|
|
@@ -6902,11 +6865,11 @@ async function filterTests(testSuite, options) {
|
|
|
6902
6865
|
value
|
|
6903
6866
|
});
|
|
6904
6867
|
}
|
|
6905
|
-
require_logger.
|
|
6906
|
-
require_logger.
|
|
6868
|
+
require_logger.logger.debug(`Filtering for metadata conditions (AND logic): ${parsedFilters.map((f) => `${f.key}=${f.value}`).join(", ")}`);
|
|
6869
|
+
require_logger.logger.debug(`Before metadata filter: ${tests.length} tests`);
|
|
6907
6870
|
tests = tests.filter((test) => {
|
|
6908
6871
|
if (!test.metadata) {
|
|
6909
|
-
require_logger.
|
|
6872
|
+
require_logger.logger.debug(`Test has no metadata: ${test.description || "unnamed test"}`);
|
|
6910
6873
|
return false;
|
|
6911
6874
|
}
|
|
6912
6875
|
for (const { key, value } of parsedFilters) {
|
|
@@ -6915,16 +6878,16 @@ async function filterTests(testSuite, options) {
|
|
|
6915
6878
|
if (Array.isArray(testValue)) matches = testValue.some((v) => v.toString().includes(value));
|
|
6916
6879
|
else if (testValue !== void 0) matches = testValue.toString().includes(value);
|
|
6917
6880
|
if (!matches) {
|
|
6918
|
-
require_logger.
|
|
6881
|
+
require_logger.logger.debug(`Test "${test.description || "unnamed test"}" metadata doesn't match. Expected ${key} to include ${value}, got ${JSON.stringify(test.metadata)}`);
|
|
6919
6882
|
return false;
|
|
6920
6883
|
}
|
|
6921
6884
|
}
|
|
6922
6885
|
return true;
|
|
6923
6886
|
});
|
|
6924
|
-
require_logger.
|
|
6887
|
+
require_logger.logger.debug(`After metadata filter: ${tests.length} tests remain`);
|
|
6925
6888
|
}
|
|
6926
6889
|
if (options.failingOnly && options.errorsOnly) {
|
|
6927
|
-
require_logger.
|
|
6890
|
+
require_logger.logger.debug("Using both --filter-failing-only and --filter-errors-only together (equivalent to --filter-failing)");
|
|
6928
6891
|
const failingOnlyTests = await filterFailingOnlyTests(testSuite, options.failingOnly);
|
|
6929
6892
|
const errorTests = await filterErrorTests(testSuite, options.errorsOnly);
|
|
6930
6893
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -6934,8 +6897,8 @@ async function filterTests(testSuite, options) {
|
|
|
6934
6897
|
seen.add(key);
|
|
6935
6898
|
return true;
|
|
6936
6899
|
});
|
|
6937
|
-
require_logger.
|
|
6938
|
-
if (tests.length === 0) require_logger.
|
|
6900
|
+
require_logger.logger.debug(`Combined failingOnly (${failingOnlyTests.length}) and errors (${errorTests.length}) filters: ${tests.length} unique tests`);
|
|
6901
|
+
if (tests.length === 0) require_logger.logger.warn("Combined --filter-failing-only and --filter-errors-only returned no tests. The specified evaluations may have no failures or errors, or the test suite may have changed.");
|
|
6939
6902
|
} else if (options.failing) {
|
|
6940
6903
|
tests = await filterFailingTests(testSuite, options.failing);
|
|
6941
6904
|
if (tests.length === 0) logNoTestsWarning("filter-failing", options.failing, "no failures/errors");
|
|
@@ -6972,7 +6935,6 @@ async function filterTests(testSuite, options) {
|
|
|
6972
6935
|
}
|
|
6973
6936
|
return tests;
|
|
6974
6937
|
}
|
|
6975
|
-
|
|
6976
6938
|
//#endregion
|
|
6977
6939
|
//#region src/util/promptfooCommand.ts
|
|
6978
6940
|
/**
|
|
@@ -7018,7 +6980,6 @@ function promptfooCommand(subcommand) {
|
|
|
7018
6980
|
if (detectInstaller() === "npx") return subcommand ? `npx promptfoo@latest ${subcommand}` : "npx promptfoo@latest";
|
|
7019
6981
|
return subcommand ? `promptfoo ${subcommand}` : "promptfoo";
|
|
7020
6982
|
}
|
|
7021
|
-
|
|
7022
6983
|
//#endregion
|
|
7023
6984
|
//#region src/csv.ts
|
|
7024
6985
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = .8;
|
|
@@ -7110,7 +7071,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7110
7071
|
if (!key.startsWith("__") && specialKeys.some((k) => key.startsWith(k)) && !uniqueErrorMessages.has(key)) {
|
|
7111
7072
|
const error = `You used a single underscore for the key "${key}". Did you mean to use "${key.replace("_", "__")}" instead?`;
|
|
7112
7073
|
uniqueErrorMessages.add(key);
|
|
7113
|
-
require_logger.
|
|
7074
|
+
require_logger.logger.warn(error);
|
|
7114
7075
|
}
|
|
7115
7076
|
if (key.startsWith("__expected")) {
|
|
7116
7077
|
if (value.trim() !== "") asserts.push(assertionFromString(value.trim()));
|
|
@@ -7128,10 +7089,10 @@ function testCaseFromCsvRow(row) {
|
|
|
7128
7089
|
} else if (value.trim() !== "") metadata[metadataKey] = value;
|
|
7129
7090
|
} else if (key === "__metadata" && !uniqueErrorMessages.has(key)) {
|
|
7130
7091
|
uniqueErrorMessages.add(key);
|
|
7131
|
-
require_logger.
|
|
7092
|
+
require_logger.logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7132
7093
|
} else if (key.startsWith("__config:")) {
|
|
7133
7094
|
const configParts = key.slice(9).split(":");
|
|
7134
|
-
if (configParts.length !== 2) require_logger.
|
|
7095
|
+
if (configParts.length !== 2) require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7135
7096
|
else {
|
|
7136
7097
|
const [expectedKey, configKey] = configParts;
|
|
7137
7098
|
let targetIndex;
|
|
@@ -7141,11 +7102,11 @@ function testCaseFromCsvRow(row) {
|
|
|
7141
7102
|
if (indexMatch) targetIndex = Number.parseInt(indexMatch[1], 10) - 1;
|
|
7142
7103
|
}
|
|
7143
7104
|
if (targetIndex === void 0) {
|
|
7144
|
-
require_logger.
|
|
7105
|
+
require_logger.logger.error(`Invalid expected key "${expectedKey}" in __config column "${key}". Must be __expected or __expected<N> where N is a positive integer.`);
|
|
7145
7106
|
throw new Error(`Invalid expected key "${expectedKey}" in __config column`);
|
|
7146
7107
|
}
|
|
7147
7108
|
if (!["threshold"].includes(configKey)) {
|
|
7148
|
-
require_logger.
|
|
7109
|
+
require_logger.logger.error(`Invalid config key "${configKey}" in __config column "${key}". Valid config keys include: threshold`);
|
|
7149
7110
|
throw new Error(`Invalid config key "${configKey}" in __config column`);
|
|
7150
7111
|
}
|
|
7151
7112
|
if (!assertionConfigs[targetIndex]) assertionConfigs[targetIndex] = {};
|
|
@@ -7153,7 +7114,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7153
7114
|
if (configKey === "threshold") {
|
|
7154
7115
|
parsedValue = Number.parseFloat(value);
|
|
7155
7116
|
if (!Number.isFinite(parsedValue)) {
|
|
7156
|
-
require_logger.
|
|
7117
|
+
require_logger.logger.error(`Invalid numeric value "${value}" for config key "${configKey}" in column "${key}"`);
|
|
7157
7118
|
throw new Error(`Invalid numeric value for ${configKey}`);
|
|
7158
7119
|
}
|
|
7159
7120
|
}
|
|
@@ -7180,7 +7141,6 @@ function testCaseFromCsvRow(row) {
|
|
|
7180
7141
|
...Object.keys(metadata).length > 0 ? { metadata } : {}
|
|
7181
7142
|
};
|
|
7182
7143
|
}
|
|
7183
|
-
|
|
7184
7144
|
//#endregion
|
|
7185
7145
|
//#region src/microsoftSharepoint.ts
|
|
7186
7146
|
let cca = null;
|
|
@@ -7200,7 +7160,7 @@ async function fetchCsvFromSharepoint(url) {
|
|
|
7200
7160
|
const fileRelativeUrl = url.startsWith(normalizedBaseUrl) ? url.slice(normalizedBaseUrl.length) : url;
|
|
7201
7161
|
const serverRelativeUrl = fileRelativeUrl.startsWith("/") ? fileRelativeUrl : `/${fileRelativeUrl}`;
|
|
7202
7162
|
const apiUrl = `${normalizedBaseUrl}/_api/web/GetFileByServerRelativeUrl('${encodeURI(serverRelativeUrl)}')/$value`;
|
|
7203
|
-
require_logger.
|
|
7163
|
+
require_logger.logger.debug(`Fetching CSV from SharePoint: ${apiUrl}`);
|
|
7204
7164
|
const response = await require_fetch.fetchWithProxy(apiUrl, { headers: {
|
|
7205
7165
|
Authorization: `Bearer ${accessToken}`,
|
|
7206
7166
|
Accept: "text/csv"
|
|
@@ -7257,7 +7217,6 @@ async function getSharePointAccessToken() {
|
|
|
7257
7217
|
if (!tokenResult?.accessToken) throw new Error("Failed to acquire SharePoint access token. Please check your authentication configuration.");
|
|
7258
7218
|
return tokenResult.accessToken;
|
|
7259
7219
|
}
|
|
7260
|
-
|
|
7261
7220
|
//#endregion
|
|
7262
7221
|
//#region src/util/xlsx.ts
|
|
7263
7222
|
async function parseXlsxFile(filePath) {
|
|
@@ -7317,7 +7276,6 @@ async function parseXlsxFile(filePath) {
|
|
|
7317
7276
|
throw new Error(`Failed to parse Excel file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
7318
7277
|
}
|
|
7319
7278
|
}
|
|
7320
|
-
|
|
7321
7279
|
//#endregion
|
|
7322
7280
|
//#region src/util/testCaseReader.ts
|
|
7323
7281
|
async function readTestFiles(pathOrGlobs, basePath = "") {
|
|
@@ -7363,29 +7321,29 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7363
7321
|
const fileExtension = (0, path.parse)(pathWithoutFunction).ext.slice(1);
|
|
7364
7322
|
const extensionWithoutSheet = fileExtension.split("#")[0];
|
|
7365
7323
|
if (varsPath.startsWith("huggingface://datasets/")) {
|
|
7366
|
-
require_telemetry.
|
|
7324
|
+
require_telemetry.telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7367
7325
|
return await require_graders.fetchHuggingFaceDataset(varsPath);
|
|
7368
7326
|
}
|
|
7369
7327
|
if (require_fileExtensions.isJavascriptFile(pathWithoutFunction)) {
|
|
7370
|
-
require_telemetry.
|
|
7328
|
+
require_telemetry.telemetry.record("feature_used", { feature: "js tests file" });
|
|
7371
7329
|
const mod = await require_esm.importModule(pathWithoutFunction, maybeFunctionName);
|
|
7372
7330
|
return typeof mod === "function" ? await mod(finalConfig) : mod;
|
|
7373
7331
|
}
|
|
7374
7332
|
if (fileExtension === "py") {
|
|
7375
|
-
require_telemetry.
|
|
7333
|
+
require_telemetry.telemetry.record("feature_used", { feature: "python tests file" });
|
|
7376
7334
|
const result = await require_pythonUtils.runPython(pathWithoutFunction, maybeFunctionName ?? "generate_tests", finalConfig === void 0 ? [] : [finalConfig]);
|
|
7377
7335
|
if (!Array.isArray(result)) throw new Error(`Python test function must return a list of test cases, got ${typeof result}`);
|
|
7378
7336
|
return result;
|
|
7379
7337
|
}
|
|
7380
7338
|
let rows = [];
|
|
7381
7339
|
if (varsPath.startsWith("https://docs.google.com/spreadsheets/")) {
|
|
7382
|
-
require_telemetry.
|
|
7340
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - google sheet" });
|
|
7383
7341
|
rows = await require_util.fetchCsvFromGoogleSheet(varsPath);
|
|
7384
7342
|
} else if (/https:\/\/[^/]+\.sharepoint\.com\//i.test(varsPath)) {
|
|
7385
|
-
require_telemetry.
|
|
7343
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - sharepoint" });
|
|
7386
7344
|
rows = await fetchCsvFromSharepoint(varsPath);
|
|
7387
7345
|
} else if (fileExtension === "csv") {
|
|
7388
|
-
require_telemetry.
|
|
7346
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - local" });
|
|
7389
7347
|
const delimiter = require_logger.getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
7390
7348
|
const fileContent = await fs_promises.readFile(resolvedVarsPath, "utf-8");
|
|
7391
7349
|
const enforceStrict = require_logger.getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
@@ -7417,10 +7375,10 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7417
7375
|
throw e;
|
|
7418
7376
|
}
|
|
7419
7377
|
} else if (extensionWithoutSheet === "xlsx" || extensionWithoutSheet === "xls") {
|
|
7420
|
-
require_telemetry.
|
|
7378
|
+
require_telemetry.telemetry.record("feature_used", { feature: "xlsx tests file - local" });
|
|
7421
7379
|
rows = await parseXlsxFile(resolvedVarsPath);
|
|
7422
7380
|
} else if (fileExtension === "json") {
|
|
7423
|
-
require_telemetry.
|
|
7381
|
+
require_telemetry.telemetry.record("feature_used", { feature: "json tests file" });
|
|
7424
7382
|
const fileContent = await fs_promises.readFile(resolvedVarsPath, "utf-8");
|
|
7425
7383
|
const jsonData = js_yaml.default.load(fileContent);
|
|
7426
7384
|
return (Array.isArray(jsonData) ? jsonData : [jsonData]).map((item, idx) => ({
|
|
@@ -7428,7 +7386,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7428
7386
|
description: item.description || `Row #${idx + 1}`
|
|
7429
7387
|
}));
|
|
7430
7388
|
} else if (fileExtension === "jsonl") {
|
|
7431
|
-
require_telemetry.
|
|
7389
|
+
require_telemetry.telemetry.record("feature_used", { feature: "jsonl tests file" });
|
|
7432
7390
|
return (await fs_promises.readFile(resolvedVarsPath, "utf-8")).split("\n").filter((line) => line.trim()).map((line, idx) => {
|
|
7433
7391
|
return {
|
|
7434
7392
|
...JSON.parse(line),
|
|
@@ -7436,7 +7394,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7436
7394
|
};
|
|
7437
7395
|
});
|
|
7438
7396
|
} else if (fileExtension === "yaml" || fileExtension === "yml") {
|
|
7439
|
-
require_telemetry.
|
|
7397
|
+
require_telemetry.telemetry.record("feature_used", { feature: "yaml tests file" });
|
|
7440
7398
|
rows = require_util.maybeLoadConfigFromExternalFile(js_yaml.default.load(await fs_promises.readFile(resolvedVarsPath, "utf-8")));
|
|
7441
7399
|
}
|
|
7442
7400
|
return rows.map((row, idx) => {
|
|
@@ -7480,7 +7438,7 @@ async function readTest(test, basePath = "", isDefaultTest = false) {
|
|
|
7480
7438
|
*/
|
|
7481
7439
|
async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
7482
7440
|
if (loadTestsGlob.startsWith("huggingface://datasets/")) {
|
|
7483
|
-
require_telemetry.
|
|
7441
|
+
require_telemetry.telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7484
7442
|
return await require_graders.fetchHuggingFaceDataset(loadTestsGlob);
|
|
7485
7443
|
}
|
|
7486
7444
|
if (loadTestsGlob.startsWith("file://")) loadTestsGlob = loadTestsGlob.slice(7);
|
|
@@ -7491,12 +7449,12 @@ async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
|
7491
7449
|
if ((require_fileExtensions.isJavascriptFile(pathWithoutFunction) || pathWithoutFunction.endsWith(".py")) && !testFiles.some((file) => file === resolvedPath || file === pathWithoutFunction)) testFiles.push(resolvedPath);
|
|
7492
7450
|
if (loadTestsGlob.startsWith("https://docs.google.com/spreadsheets/")) testFiles.push(loadTestsGlob);
|
|
7493
7451
|
const _deref = async (testCases, file) => {
|
|
7494
|
-
require_logger.
|
|
7452
|
+
require_logger.logger.debug(`Dereferencing test file: ${file}`);
|
|
7495
7453
|
return await _apidevtools_json_schema_ref_parser.default.dereference(testCases);
|
|
7496
7454
|
};
|
|
7497
7455
|
const ret = [];
|
|
7498
7456
|
if (testFiles.length < 1) {
|
|
7499
|
-
require_logger.
|
|
7457
|
+
require_logger.logger.error(`No test files found for path: ${loadTestsGlob}`);
|
|
7500
7458
|
return ret;
|
|
7501
7459
|
}
|
|
7502
7460
|
for (const testFile of testFiles) {
|
|
@@ -7536,14 +7494,14 @@ async function readTests(tests, basePath = "") {
|
|
|
7536
7494
|
else ret.push(...await loadTestsFromGlob(globOrTest, basePath));
|
|
7537
7495
|
} else if ("path" in globOrTest) ret.push(...await readStandaloneTestsFile(globOrTest.path, basePath, globOrTest.config));
|
|
7538
7496
|
else ret.push(await readTest(globOrTest, basePath));
|
|
7539
|
-
else if (tests !== void 0 && tests !== null) require_logger.
|
|
7497
|
+
else if (tests !== void 0 && tests !== null) require_logger.logger.warn(dedent.default`
|
|
7540
7498
|
Warning: Unsupported 'tests' format in promptfooconfig.yaml.
|
|
7541
7499
|
Expected: string, string[], or TestCase[], but received: ${typeof tests}
|
|
7542
7500
|
|
|
7543
7501
|
Please check your configuration file and ensure the 'tests' field is correctly formatted.
|
|
7544
7502
|
For more information, visit: https://promptfoo.dev/docs/configuration/reference/#test-case
|
|
7545
7503
|
`);
|
|
7546
|
-
if (ret.some((testCase) => testCase.vars?.assert) && !require_logger.getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) require_logger.
|
|
7504
|
+
if (ret.some((testCase) => testCase.vars?.assert) && !require_logger.getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) require_logger.logger.warn(dedent.default`
|
|
7547
7505
|
Warning: Found 'assert' key in vars. This is likely a mistake in your configuration.
|
|
7548
7506
|
|
|
7549
7507
|
'assert' should be *unindented* so it is under the test itself, not vars. For example:
|
|
@@ -7559,7 +7517,6 @@ async function readTests(tests, basePath = "") {
|
|
|
7559
7517
|
`);
|
|
7560
7518
|
return ret;
|
|
7561
7519
|
}
|
|
7562
|
-
|
|
7563
7520
|
//#endregion
|
|
7564
7521
|
//#region src/util/validateTestPromptReferences.ts
|
|
7565
7522
|
var PromptReferenceValidationError = class extends Error {
|
|
@@ -7602,7 +7559,6 @@ function validateTestPromptReferences(tests, prompts, defaultTest) {
|
|
|
7602
7559
|
}
|
|
7603
7560
|
}
|
|
7604
7561
|
}
|
|
7605
|
-
|
|
7606
7562
|
//#endregion
|
|
7607
7563
|
//#region src/util/validateTestProviderReferences.ts
|
|
7608
7564
|
var ProviderReferenceValidationError = class extends Error {
|
|
@@ -7648,7 +7604,6 @@ function validateTestProviderReferences(tests, providers, defaultTest, scenarios
|
|
|
7648
7604
|
});
|
|
7649
7605
|
});
|
|
7650
7606
|
}
|
|
7651
|
-
|
|
7652
7607
|
//#endregion
|
|
7653
7608
|
//#region src/util/config/extensions.ts
|
|
7654
7609
|
/**
|
|
@@ -7666,7 +7621,6 @@ const DEFAULT_CONFIG_EXTENSIONS = [
|
|
|
7666
7621
|
"mts",
|
|
7667
7622
|
"ts"
|
|
7668
7623
|
];
|
|
7669
|
-
|
|
7670
7624
|
//#endregion
|
|
7671
7625
|
//#region src/util/config/load.ts
|
|
7672
7626
|
/**
|
|
@@ -7789,34 +7743,34 @@ async function readConfig(configPath) {
|
|
|
7789
7743
|
const hasProviders = data.providers !== void 0;
|
|
7790
7744
|
return hasTargets && !hasProviders || !hasTargets && hasProviders;
|
|
7791
7745
|
}, { message: "Exactly one of 'targets' or 'providers' must be provided, but not both" }).safeParse(renderedConfig);
|
|
7792
|
-
if (!validationResult.success) require_logger.
|
|
7746
|
+
if (!validationResult.success) require_logger.logger.warn(`Invalid configuration file ${configPath}:\n${zod.z.prettifyError(validationResult.error)}`);
|
|
7793
7747
|
ret = renderedConfig;
|
|
7794
7748
|
} else if (require_fileExtensions.isJavascriptFile(configPath)) {
|
|
7795
7749
|
const renderedConfig = renderConfigEnvTemplates(await require_esm.importModule(configPath));
|
|
7796
7750
|
const validationResult = require_types.UnifiedConfigSchema.safeParse(renderedConfig);
|
|
7797
|
-
if (!validationResult.success) require_logger.
|
|
7751
|
+
if (!validationResult.success) require_logger.logger.warn(`Invalid configuration file ${configPath}:\n${zod.z.prettifyError(validationResult.error)}`);
|
|
7798
7752
|
ret = renderedConfig;
|
|
7799
7753
|
} else throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
7800
7754
|
if (ret.targets) {
|
|
7801
|
-
require_logger.
|
|
7755
|
+
require_logger.logger.debug(`Rewriting config.targets to config.providers`);
|
|
7802
7756
|
ret.providers = ret.targets;
|
|
7803
7757
|
delete ret.targets;
|
|
7804
7758
|
}
|
|
7805
7759
|
if (ret.plugins) {
|
|
7806
|
-
require_logger.
|
|
7760
|
+
require_logger.logger.debug(`Rewriting config.plugins to config.redteam.plugins`);
|
|
7807
7761
|
ret.redteam = ret.redteam || {};
|
|
7808
7762
|
ret.redteam.plugins = ret.plugins;
|
|
7809
7763
|
delete ret.plugins;
|
|
7810
7764
|
}
|
|
7811
7765
|
if (ret.strategies) {
|
|
7812
|
-
require_logger.
|
|
7766
|
+
require_logger.logger.debug(`Rewriting config.strategies to config.redteam.strategies`);
|
|
7813
7767
|
ret.redteam = ret.redteam || {};
|
|
7814
7768
|
ret.redteam.strategies = ret.strategies;
|
|
7815
7769
|
delete ret.strategies;
|
|
7816
7770
|
}
|
|
7817
7771
|
if (!ret.prompts) {
|
|
7818
|
-
require_logger.
|
|
7819
|
-
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) require_logger.
|
|
7772
|
+
require_logger.logger.debug(`Setting default prompt because there is no \`prompts\` field`);
|
|
7773
|
+
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) require_logger.logger.warn(`Warning: Expected top-level "prompts" property in config or a test variable named "prompt"`);
|
|
7820
7774
|
ret.prompts = ["{{prompt}}"];
|
|
7821
7775
|
}
|
|
7822
7776
|
return ret;
|
|
@@ -8014,9 +7968,9 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8014
7968
|
defaultConfig = {};
|
|
8015
7969
|
}
|
|
8016
7970
|
if (cmdObj.assertions) {
|
|
8017
|
-
require_telemetry.
|
|
7971
|
+
require_telemetry.telemetry.record("feature_used", { feature: "standalone assertions mode" });
|
|
8018
7972
|
if (!cmdObj.modelOutputs) {
|
|
8019
|
-
require_logger.
|
|
7973
|
+
require_logger.logger.error("You must provide --model-outputs when using --assertions");
|
|
8020
7974
|
process$1.default.exit(1);
|
|
8021
7975
|
}
|
|
8022
7976
|
const modelOutputs = JSON.parse(fs.readFileSync(path.join(process$1.default.cwd(), cmdObj.modelOutputs), "utf8"));
|
|
@@ -8038,14 +7992,14 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8038
7992
|
});
|
|
8039
7993
|
}
|
|
8040
7994
|
const basePath = configPaths ? path.dirname(configPaths[0]) : "";
|
|
8041
|
-
require_logger.
|
|
7995
|
+
require_logger.state.basePath = basePath;
|
|
8042
7996
|
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
|
|
8043
7997
|
let processedDefaultTest;
|
|
8044
7998
|
if (typeof defaultTestRaw === "string" && defaultTestRaw.startsWith("file://")) {
|
|
8045
|
-
const originalBasePath = require_logger.
|
|
8046
|
-
require_logger.
|
|
7999
|
+
const originalBasePath = require_logger.state.basePath;
|
|
8000
|
+
require_logger.state.basePath = basePath;
|
|
8047
8001
|
const loaded = await require_util.maybeLoadFromExternalFile(defaultTestRaw);
|
|
8048
|
-
require_logger.
|
|
8002
|
+
require_logger.state.basePath = originalBasePath;
|
|
8049
8003
|
processedDefaultTest = loaded;
|
|
8050
8004
|
} else if (defaultTestRaw) processedDefaultTest = defaultTestRaw;
|
|
8051
8005
|
const config = {
|
|
@@ -8070,7 +8024,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8070
8024
|
const hasProviders = cmdObj.providers && cmdObj.providers.length > 0 || [config.providers].flat().filter(Boolean).length > 0;
|
|
8071
8025
|
if (!Boolean(configPaths) && !hasPrompts && !hasProviders && !require_logger.isCI()) {
|
|
8072
8026
|
const extList = DEFAULT_CONFIG_EXTENSIONS.join(", ");
|
|
8073
|
-
require_logger.
|
|
8027
|
+
require_logger.logger.warn(dedent.default`
|
|
8074
8028
|
${chalk.default.yellow.bold("⚠️ No promptfooconfig found")}
|
|
8075
8029
|
|
|
8076
8030
|
${chalk.default.white(`Searched in ${chalk.default.bold(process$1.default.cwd())} for promptfooconfig.{${extList}}`)}
|
|
@@ -8086,11 +8040,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8086
8040
|
process$1.default.exit(1);
|
|
8087
8041
|
}
|
|
8088
8042
|
if (!hasPrompts) {
|
|
8089
|
-
require_logger.
|
|
8043
|
+
require_logger.logger.error("You must provide at least 1 prompt");
|
|
8090
8044
|
process$1.default.exit(1);
|
|
8091
8045
|
}
|
|
8092
8046
|
if (type !== "DatasetGeneration" && type !== "AssertionGeneration" && !hasProviders) {
|
|
8093
|
-
require_logger.
|
|
8047
|
+
require_logger.logger.error("You must specify at least 1 provider (for example, openai:gpt-4.1)");
|
|
8094
8048
|
process$1.default.exit(1);
|
|
8095
8049
|
}
|
|
8096
8050
|
require_invariant.invariant(Array.isArray(config.providers), "providers must be an array");
|
|
@@ -8098,11 +8052,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8098
8052
|
const cliFilteredProviderConfigs = (cmdObj.providers ? resolveCliProvidersWithConfig(cmdObj.providers, resolvedProviderConfigs) : resolvedProviderConfigs) ?? [];
|
|
8099
8053
|
const filterOption = cmdObj.filterProviders || cmdObj.filterTargets;
|
|
8100
8054
|
const filteredProviderConfigs = filterProviderConfigs(cliFilteredProviderConfigs, filterOption);
|
|
8101
|
-
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) require_logger.
|
|
8055
|
+
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) require_logger.logger.warn(`No providers matched the filter "${filterOption}". Check your --filter-providers/--filter-targets value.`);
|
|
8102
8056
|
let parsedPrompts = await require_graders.readPrompts(config.prompts, cmdObj.prompts ? void 0 : basePath);
|
|
8103
8057
|
if (cmdObj.filterPrompts) {
|
|
8104
8058
|
parsedPrompts = filterPrompts(parsedPrompts, cmdObj.filterPrompts);
|
|
8105
|
-
if (parsedPrompts.length === 0) require_logger.
|
|
8059
|
+
if (parsedPrompts.length === 0) require_logger.logger.warn(`No prompts matched the filter "${cmdObj.filterPrompts}". Check your --filter-prompts value.`);
|
|
8106
8060
|
}
|
|
8107
8061
|
const parsedProviders = await require_providers.loadApiProviders(filteredProviderConfigs, {
|
|
8108
8062
|
env: config.env,
|
|
@@ -8133,7 +8087,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8133
8087
|
}
|
|
8134
8088
|
const parsedProviderPromptMap = require_graders.readProviderPromptMap({ providers: filteredProviderConfigs }, parsedPrompts);
|
|
8135
8089
|
if (parsedPrompts.length === 0) {
|
|
8136
|
-
require_logger.
|
|
8090
|
+
require_logger.logger.error("No prompts found. Add a `prompts:` entry to your config or pass --prompts path/to/prompt.txt.");
|
|
8137
8091
|
process$1.default.exit(1);
|
|
8138
8092
|
}
|
|
8139
8093
|
const defaultTest = {
|
|
@@ -8163,7 +8117,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8163
8117
|
validateAssertions(testSuite.tests || [], typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8164
8118
|
validateTestProviderReferences(testSuite.tests || [], testSuite.providers, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0, testSuite.scenarios);
|
|
8165
8119
|
validateTestPromptReferences(testSuite.tests || [], testSuite.prompts, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8166
|
-
require_logger.
|
|
8120
|
+
require_logger.state.config = config;
|
|
8167
8121
|
let commandLineOptions = fileConfig.commandLineOptions || defaultConfig.commandLineOptions;
|
|
8168
8122
|
if (commandLineOptions?.envPath && basePath) {
|
|
8169
8123
|
const resolvedPaths = (Array.isArray(commandLineOptions.envPath) ? commandLineOptions.envPath : [commandLineOptions.envPath]).map((p) => path.isAbsolute(p) ? p : path.resolve(basePath, p));
|
|
@@ -8179,7 +8133,6 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8179
8133
|
commandLineOptions
|
|
8180
8134
|
};
|
|
8181
8135
|
}
|
|
8182
|
-
|
|
8183
8136
|
//#endregion
|
|
8184
8137
|
//#region src/util/config/writer.ts
|
|
8185
8138
|
function writePromptfooConfig(config, outputPath, headerComments) {
|
|
@@ -8195,7 +8148,7 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8195
8148
|
]);
|
|
8196
8149
|
const yamlContent = js_yaml.default.dump(orderedConfig, { skipInvalid: true });
|
|
8197
8150
|
if (!yamlContent) {
|
|
8198
|
-
require_logger.
|
|
8151
|
+
require_logger.logger.warn("Warning: config is empty, skipping write");
|
|
8199
8152
|
return orderedConfig;
|
|
8200
8153
|
}
|
|
8201
8154
|
const schemaComment = `# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json`;
|
|
@@ -8203,7 +8156,6 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8203
8156
|
fs.default.writeFileSync(outputPath, `${schemaComment}\n${headerCommentLines}${yamlContent}`);
|
|
8204
8157
|
return orderedConfig;
|
|
8205
8158
|
}
|
|
8206
|
-
|
|
8207
8159
|
//#endregion
|
|
8208
8160
|
//#region src/util/redteamProbeLimit.ts
|
|
8209
8161
|
const MONTHLY_PROBE_LIMIT = 1e5;
|
|
@@ -8253,7 +8205,6 @@ function checkRedteamProbeLimit() {
|
|
|
8253
8205
|
remaining
|
|
8254
8206
|
};
|
|
8255
8207
|
}
|
|
8256
|
-
|
|
8257
8208
|
//#endregion
|
|
8258
8209
|
//#region src/redteam/extraction/mcpTools.ts
|
|
8259
8210
|
/**
|
|
@@ -8289,11 +8240,10 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8289
8240
|
for (const tool of tools) toolsInfo.push(JSON.stringify(tool));
|
|
8290
8241
|
}
|
|
8291
8242
|
} catch (error) {
|
|
8292
|
-
require_logger.
|
|
8243
|
+
require_logger.logger.warn(`Failed to get tools from MCP provider: ${error instanceof Error ? error.message : String(error)}`);
|
|
8293
8244
|
}
|
|
8294
8245
|
return toolsInfo.join("\n");
|
|
8295
8246
|
}
|
|
8296
|
-
|
|
8297
8247
|
//#endregion
|
|
8298
8248
|
//#region src/util/apiHealth.ts
|
|
8299
8249
|
/**
|
|
@@ -8302,7 +8252,7 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8302
8252
|
* @returns A promise that resolves to the health check response.
|
|
8303
8253
|
*/
|
|
8304
8254
|
async function checkRemoteHealth(url) {
|
|
8305
|
-
require_logger.
|
|
8255
|
+
require_logger.logger.debug(`[CheckRemoteHealth] Checking API health: ${JSON.stringify({
|
|
8306
8256
|
url,
|
|
8307
8257
|
env: {
|
|
8308
8258
|
httpProxy: require_logger.getEnvString("HTTP_PROXY") || require_logger.getEnvString("http_proxy"),
|
|
@@ -8317,7 +8267,7 @@ async function checkRemoteHealth(url) {
|
|
|
8317
8267
|
const cloudConfig = new require_fetch.CloudConfig();
|
|
8318
8268
|
const response = await require_fetch.fetchWithTimeout(url, { headers: { "Content-Type": "application/json" } }, 5e3);
|
|
8319
8269
|
if (!response.ok) {
|
|
8320
|
-
require_logger.
|
|
8270
|
+
require_logger.logger.debug(`[CheckRemoteHealth] API health check failed with non-OK response: ${JSON.stringify({
|
|
8321
8271
|
status: response.status,
|
|
8322
8272
|
statusText: response.statusText,
|
|
8323
8273
|
url
|
|
@@ -8357,7 +8307,7 @@ async function checkRemoteHealth(url) {
|
|
|
8357
8307
|
};
|
|
8358
8308
|
const cause = "cause" in error ? ` (Cause: ${error.cause})` : "";
|
|
8359
8309
|
const code = "code" in error ? ` [${error["code"]}]` : "";
|
|
8360
|
-
require_logger.
|
|
8310
|
+
require_logger.logger.debug(`[CheckRemoteHealth] API health check failed: ${JSON.stringify({
|
|
8361
8311
|
error: error.message,
|
|
8362
8312
|
url
|
|
8363
8313
|
})}`);
|
|
@@ -8367,7 +8317,6 @@ async function checkRemoteHealth(url) {
|
|
|
8367
8317
|
};
|
|
8368
8318
|
}
|
|
8369
8319
|
}
|
|
8370
|
-
|
|
8371
8320
|
//#endregion
|
|
8372
8321
|
//#region src/redteam/extraction/util.ts
|
|
8373
8322
|
const RedTeamGenerationResponse = zod.z.object({
|
|
@@ -8404,7 +8353,7 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
8404
8353
|
}, require_fetch.REQUEST_TIMEOUT_MS, "json");
|
|
8405
8354
|
return RedTeamGenerationResponse.parse(response.data).result;
|
|
8406
8355
|
} catch (error) {
|
|
8407
|
-
require_logger.
|
|
8356
|
+
require_logger.logger.warn(`Error using remote generation for task '${task}': ${error}`);
|
|
8408
8357
|
throw error;
|
|
8409
8358
|
}
|
|
8410
8359
|
}
|
|
@@ -8414,11 +8363,11 @@ async function callExtraction(provider, prompt, processOutput) {
|
|
|
8414
8363
|
content: prompt
|
|
8415
8364
|
}]));
|
|
8416
8365
|
if (error) {
|
|
8417
|
-
require_logger.
|
|
8366
|
+
require_logger.logger.error(`Error in extraction: ${error}`);
|
|
8418
8367
|
throw new Error(`Failed to perform extraction: ${error}`);
|
|
8419
8368
|
}
|
|
8420
8369
|
if (typeof output !== "string") {
|
|
8421
|
-
require_logger.
|
|
8370
|
+
require_logger.logger.error(`Invalid output from extraction. Got: ${output}`);
|
|
8422
8371
|
throw new Error(`Invalid extraction output: expected string, got: ${output}`);
|
|
8423
8372
|
}
|
|
8424
8373
|
return processOutput(output);
|
|
@@ -8429,14 +8378,13 @@ function formatPrompts(prompts) {
|
|
|
8429
8378
|
${prompt}
|
|
8430
8379
|
</Prompt>`).join("\n");
|
|
8431
8380
|
}
|
|
8432
|
-
|
|
8433
8381
|
//#endregion
|
|
8434
8382
|
//#region src/redteam/extraction/entities.ts
|
|
8435
8383
|
async function extractEntities(provider, prompts) {
|
|
8436
8384
|
if (require_server.shouldGenerateRemote()) try {
|
|
8437
8385
|
return await fetchRemoteGeneration("entities", prompts);
|
|
8438
8386
|
} catch (error) {
|
|
8439
|
-
require_logger.
|
|
8387
|
+
require_logger.logger.warn(`[Entity Extraction] Failed, returning 0 entities. Error using remote generation: ${error}`);
|
|
8440
8388
|
return [];
|
|
8441
8389
|
}
|
|
8442
8390
|
const prompt = dedent.default`
|
|
@@ -8463,28 +8411,27 @@ async function extractEntities(provider, prompts) {
|
|
|
8463
8411
|
try {
|
|
8464
8412
|
return await callExtraction(provider, prompt, (output) => {
|
|
8465
8413
|
const entities = output.split("\n").filter((line) => line.trim().startsWith("Entity:")).map((line) => line.substring(line.indexOf("Entity:") + 7).trim()).filter((entity) => !/^\{\{\s*[^{}]+\s*\}\}$/.test(entity));
|
|
8466
|
-
if (entities.length === 0) require_logger.
|
|
8414
|
+
if (entities.length === 0) require_logger.logger.debug("No entities were extracted from the prompts.");
|
|
8467
8415
|
return entities;
|
|
8468
8416
|
});
|
|
8469
8417
|
} catch (error) {
|
|
8470
|
-
require_logger.
|
|
8418
|
+
require_logger.logger.warn(`Error using local extraction, returning empty list: ${error}`);
|
|
8471
8419
|
return [];
|
|
8472
8420
|
}
|
|
8473
8421
|
}
|
|
8474
|
-
|
|
8475
8422
|
//#endregion
|
|
8476
8423
|
//#region src/redteam/extraction/purpose.ts
|
|
8477
8424
|
const DEFAULT_PURPOSE = "An AI system";
|
|
8478
8425
|
async function extractSystemPurpose(provider, prompts) {
|
|
8479
8426
|
const onlyTemplatePrompt = prompts.length === 1 && prompts[0] && prompts[0].trim().replace(/\s+/g, "") === "{{prompt}}";
|
|
8480
8427
|
if (prompts.length === 0 || onlyTemplatePrompt) {
|
|
8481
|
-
require_logger.
|
|
8428
|
+
require_logger.logger.debug("[purpose] No meaningful prompts provided, returning default purpose");
|
|
8482
8429
|
return DEFAULT_PURPOSE;
|
|
8483
8430
|
}
|
|
8484
8431
|
if (!require_server.neverGenerateRemote()) try {
|
|
8485
8432
|
return await fetchRemoteGeneration("purpose", prompts);
|
|
8486
8433
|
} catch (error) {
|
|
8487
|
-
require_logger.
|
|
8434
|
+
require_logger.logger.warn(`[purpose] Error using remote generation, returning empty string: ${error}`);
|
|
8488
8435
|
return "";
|
|
8489
8436
|
}
|
|
8490
8437
|
const prompt = dedent.default`
|
|
@@ -8505,11 +8452,10 @@ async function extractSystemPurpose(provider, prompts) {
|
|
|
8505
8452
|
return match ? match[1].trim() : output.trim();
|
|
8506
8453
|
});
|
|
8507
8454
|
} catch (error) {
|
|
8508
|
-
require_logger.
|
|
8455
|
+
require_logger.logger.warn(`[purpose] Error using extracting purpose, returning empty string: ${error}`);
|
|
8509
8456
|
return "";
|
|
8510
8457
|
}
|
|
8511
8458
|
}
|
|
8512
|
-
|
|
8513
8459
|
//#endregion
|
|
8514
8460
|
//#region src/redteam/plugins/custom.ts
|
|
8515
8461
|
const CustomPluginDefinitionSchema = zod.z.strictObject({
|
|
@@ -8520,7 +8466,7 @@ const CustomPluginDefinitionSchema = zod.z.strictObject({
|
|
|
8520
8466
|
id: zod.z.string().optional()
|
|
8521
8467
|
});
|
|
8522
8468
|
function loadCustomPluginDefinition(filePath) {
|
|
8523
|
-
require_logger.
|
|
8469
|
+
require_logger.logger.debug(`Loading custom plugin from ${filePath}`);
|
|
8524
8470
|
const result = CustomPluginDefinitionSchema.safeParse(require_util.maybeLoadFromExternalFile(filePath));
|
|
8525
8471
|
if (!result.success) {
|
|
8526
8472
|
const validationError = zod.z.prettifyError(result.error);
|
|
@@ -8531,7 +8477,7 @@ function loadCustomPluginDefinition(filePath) {
|
|
|
8531
8477
|
|
|
8532
8478
|
Please review your plugin file ${filePath} configuration.`);
|
|
8533
8479
|
}
|
|
8534
|
-
require_logger.
|
|
8480
|
+
require_logger.logger.debug(`Custom plugin definition: ${JSON.stringify(result.data, null, 2)}`);
|
|
8535
8481
|
return result.data;
|
|
8536
8482
|
}
|
|
8537
8483
|
var CustomPlugin = class extends require_graders.RedteamPluginBase {
|
|
@@ -8569,7 +8515,6 @@ var CustomPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8569
8515
|
}));
|
|
8570
8516
|
}
|
|
8571
8517
|
};
|
|
8572
|
-
|
|
8573
8518
|
//#endregion
|
|
8574
8519
|
//#region src/redteam/plugins/cyberseceval.ts
|
|
8575
8520
|
const PLUGIN_ID$2 = "promptfoo:redteam:cyberseceval";
|
|
@@ -8587,11 +8532,11 @@ async function fetchDataset$2(limit, isMultilingual) {
|
|
|
8587
8532
|
user_input: input.user_input,
|
|
8588
8533
|
judge_question: input.judge_question
|
|
8589
8534
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8590
|
-
require_logger.
|
|
8535
|
+
require_logger.logger.debug(`[CyberSecEval] Generated ${testCases.length} test cases`);
|
|
8591
8536
|
if (testCases.length === 0) throw new Error(`[CyberSecEval] No test cases generated`);
|
|
8592
8537
|
return testCases;
|
|
8593
8538
|
} catch (error) {
|
|
8594
|
-
require_logger.
|
|
8539
|
+
require_logger.logger.error(`[CyberSecEval] Error fetching dataset: ${error}`);
|
|
8595
8540
|
return [];
|
|
8596
8541
|
}
|
|
8597
8542
|
}
|
|
@@ -8625,7 +8570,6 @@ var CyberSecEvalPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8625
8570
|
}));
|
|
8626
8571
|
}
|
|
8627
8572
|
};
|
|
8628
|
-
|
|
8629
8573
|
//#endregion
|
|
8630
8574
|
//#region src/redteam/plugins/donotanswer.ts
|
|
8631
8575
|
const PLUGIN_ID$1 = "promptfoo:redteam:donotanswer";
|
|
@@ -8644,7 +8588,7 @@ const RUBRIC_PASS_HEADING$1 = `THE RESPONSE SHOULD PASS IF:
|
|
|
8644
8588
|
`;
|
|
8645
8589
|
async function fetchDataset$1(limit) {
|
|
8646
8590
|
try {
|
|
8647
|
-
require_logger.
|
|
8591
|
+
require_logger.logger.debug(`[DoNotAnswer] Fetching dataset from ${DATASET_URL$1}`);
|
|
8648
8592
|
let csvData;
|
|
8649
8593
|
if (DATASET_URL$1.startsWith("http")) {
|
|
8650
8594
|
const response = await require_fetch.fetchWithTimeout(DATASET_URL$1, {}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
@@ -8655,13 +8599,13 @@ async function fetchDataset$1(limit) {
|
|
|
8655
8599
|
} catch (error) {
|
|
8656
8600
|
throw new Error(`[DoNotAnswer] Error reading local file: ${error}`);
|
|
8657
8601
|
}
|
|
8658
|
-
require_logger.
|
|
8602
|
+
require_logger.logger.debug(`[DoNotAnswer] Got ${csvData.length} bytes of CSV data`);
|
|
8659
8603
|
const rows = (0, csv_parse_sync.parse)(csvData, {
|
|
8660
8604
|
columns: true,
|
|
8661
8605
|
skip_empty_lines: true,
|
|
8662
8606
|
trim: true
|
|
8663
8607
|
});
|
|
8664
|
-
require_logger.
|
|
8608
|
+
require_logger.logger.debug(`[DoNotAnswer] Parsed ${rows.length} rows from CSV`);
|
|
8665
8609
|
if (rows.length === 0) throw new Error(`[DoNotAnswer] No data rows in CSV`);
|
|
8666
8610
|
const testCases = rows.map((row) => ({ vars: {
|
|
8667
8611
|
risk_area: row.risk_area,
|
|
@@ -8669,11 +8613,11 @@ async function fetchDataset$1(limit) {
|
|
|
8669
8613
|
specific_harms: row.specific_harms,
|
|
8670
8614
|
question: row.question
|
|
8671
8615
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8672
|
-
require_logger.
|
|
8616
|
+
require_logger.logger.debug(`[DoNotAnswer] Generated ${testCases.length} test cases`);
|
|
8673
8617
|
if (testCases.length === 0) throw new Error(`[DoNotAnswer] No test cases generated`);
|
|
8674
8618
|
return testCases;
|
|
8675
8619
|
} catch (error) {
|
|
8676
|
-
require_logger.
|
|
8620
|
+
require_logger.logger.error(`[DoNotAnswer] Error fetching dataset: ${error}`);
|
|
8677
8621
|
return [];
|
|
8678
8622
|
}
|
|
8679
8623
|
}
|
|
@@ -8728,7 +8672,6 @@ ${RUBRIC_PASS_HEADING$1}
|
|
|
8728
8672
|
}));
|
|
8729
8673
|
}
|
|
8730
8674
|
};
|
|
8731
|
-
|
|
8732
8675
|
//#endregion
|
|
8733
8676
|
//#region src/redteam/plugins/harmful/common.ts
|
|
8734
8677
|
function getHarmfulAssertions(harmCategory) {
|
|
@@ -8762,7 +8705,6 @@ function createTestCase(injectVar, output, harmCategory) {
|
|
|
8762
8705
|
assert: getHarmfulAssertions(harmCategory)
|
|
8763
8706
|
};
|
|
8764
8707
|
}
|
|
8765
|
-
|
|
8766
8708
|
//#endregion
|
|
8767
8709
|
//#region src/redteam/plugins/harmful/constants.ts
|
|
8768
8710
|
const REDTEAM_MODEL_CATEGORIES = [{
|
|
@@ -8862,7 +8804,6 @@ const REDTEAM_MODEL_CATEGORIES = [{
|
|
|
8862
8804
|
{{outputFormat}}
|
|
8863
8805
|
`
|
|
8864
8806
|
}];
|
|
8865
|
-
|
|
8866
8807
|
//#endregion
|
|
8867
8808
|
//#region src/redteam/plugins/harmful/aligned.ts
|
|
8868
8809
|
var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
@@ -8903,7 +8844,6 @@ var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8903
8844
|
});
|
|
8904
8845
|
}
|
|
8905
8846
|
};
|
|
8906
|
-
|
|
8907
8847
|
//#endregion
|
|
8908
8848
|
//#region src/redteam/plugins/harmful/unaligned.ts
|
|
8909
8849
|
/**
|
|
@@ -8919,7 +8859,7 @@ function processPromptForInputs(prompt, _injectVar, inputs) {
|
|
|
8919
8859
|
const parsed = JSON.parse(processedPrompt);
|
|
8920
8860
|
Object.assign(additionalVars, require_providers.extractVariablesFromJson(parsed, inputs));
|
|
8921
8861
|
} catch {
|
|
8922
|
-
require_logger.
|
|
8862
|
+
require_logger.logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode");
|
|
8923
8863
|
}
|
|
8924
8864
|
return {
|
|
8925
8865
|
processedPrompt,
|
|
@@ -8954,7 +8894,6 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
8954
8894
|
return testCase;
|
|
8955
8895
|
});
|
|
8956
8896
|
}
|
|
8957
|
-
|
|
8958
8897
|
//#endregion
|
|
8959
8898
|
//#region src/redteam/plugins/xstest.ts
|
|
8960
8899
|
const PLUGIN_ID = "promptfoo:redteam:xstest";
|
|
@@ -8964,7 +8903,7 @@ const RUBRIC_PASS_HEADING = "THE RESPONSE SHOULD PASS IF:";
|
|
|
8964
8903
|
const RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET = "- The response is empty, undefined, or generic";
|
|
8965
8904
|
async function fetchDataset(limit) {
|
|
8966
8905
|
try {
|
|
8967
|
-
require_logger.
|
|
8906
|
+
require_logger.logger.debug(`[XSTest] Fetching dataset from ${DATASET_URL}`);
|
|
8968
8907
|
let csvData;
|
|
8969
8908
|
if (DATASET_URL.startsWith("http")) {
|
|
8970
8909
|
const response = await require_fetch.fetchWithTimeout(DATASET_URL, {}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
@@ -8975,16 +8914,16 @@ async function fetchDataset(limit) {
|
|
|
8975
8914
|
} catch (error) {
|
|
8976
8915
|
throw new Error(`[XSTest] Error reading local file: ${error}`);
|
|
8977
8916
|
}
|
|
8978
|
-
require_logger.
|
|
8917
|
+
require_logger.logger.debug(`[XSTest] Got ${csvData.length} bytes of CSV data`);
|
|
8979
8918
|
const rows = (0, csv_parse_sync.parse)(csvData, {
|
|
8980
8919
|
columns: true,
|
|
8981
8920
|
skip_empty_lines: true,
|
|
8982
8921
|
trim: true
|
|
8983
8922
|
});
|
|
8984
|
-
require_logger.
|
|
8923
|
+
require_logger.logger.debug(`[XSTest] Parsed ${rows.length} rows from CSV`);
|
|
8985
8924
|
if (rows.length === 0) throw new Error(`[XSTest] No data rows in CSV`);
|
|
8986
8925
|
const validRows = rows.filter((row) => row.prompt && row.prompt.trim() !== "" && row.label && (row.label === "safe" || row.label === "unsafe"));
|
|
8987
|
-
require_logger.
|
|
8926
|
+
require_logger.logger.debug(`[XSTest] Found ${validRows.length} valid rows with prompts and labels`);
|
|
8988
8927
|
const testCases = validRows.map((row) => ({ vars: {
|
|
8989
8928
|
prompt: row.prompt,
|
|
8990
8929
|
focus: row.focus || "",
|
|
@@ -8992,11 +8931,11 @@ async function fetchDataset(limit) {
|
|
|
8992
8931
|
label: row.label || "",
|
|
8993
8932
|
note: row.note || ""
|
|
8994
8933
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8995
|
-
require_logger.
|
|
8934
|
+
require_logger.logger.debug(`[XSTest] Generated ${testCases.length} test cases`);
|
|
8996
8935
|
if (testCases.length === 0) throw new Error(`[XSTest] No test cases generated`);
|
|
8997
8936
|
return testCases;
|
|
8998
8937
|
} catch (error) {
|
|
8999
|
-
require_logger.
|
|
8938
|
+
require_logger.logger.error(`[XSTest] Error fetching dataset: ${error}`);
|
|
9000
8939
|
return [];
|
|
9001
8940
|
}
|
|
9002
8941
|
}
|
|
@@ -9082,7 +9021,6 @@ ${RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET}
|
|
|
9082
9021
|
}));
|
|
9083
9022
|
}
|
|
9084
9023
|
};
|
|
9085
|
-
|
|
9086
9024
|
//#endregion
|
|
9087
9025
|
//#region src/redteam/plugins/index.ts
|
|
9088
9026
|
/**
|
|
@@ -9099,7 +9037,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9099
9037
|
require_invariant.invariant(!require_logger.getEnvBool("PROMPTFOO_DISABLE_REDTEAM_REMOTE_GENERATION"), "fetchRemoteTestCases should never be called when remote generation is disabled");
|
|
9100
9038
|
const remoteHealth = await checkRemoteHealth(require_server.getRemoteHealthUrl());
|
|
9101
9039
|
if (remoteHealth.status !== "OK") {
|
|
9102
|
-
require_logger.
|
|
9040
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${remoteHealth.message}`);
|
|
9103
9041
|
return [];
|
|
9104
9042
|
}
|
|
9105
9043
|
const { graderExamples, ...configForRemote } = config ?? {};
|
|
@@ -9120,14 +9058,14 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9120
9058
|
body
|
|
9121
9059
|
}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
9122
9060
|
if (status !== 200 || !data || !data.result || !Array.isArray(data.result)) {
|
|
9123
|
-
require_logger.
|
|
9061
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${statusText} ${JSON.stringify(data)}`);
|
|
9124
9062
|
return [];
|
|
9125
9063
|
}
|
|
9126
9064
|
const ret = data.result;
|
|
9127
|
-
require_logger.
|
|
9065
|
+
require_logger.logger.debug(`Received remote generation for ${key}:\n${JSON.stringify(ret)}`);
|
|
9128
9066
|
return ret;
|
|
9129
9067
|
} catch (err) {
|
|
9130
|
-
require_logger.
|
|
9068
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${err}`);
|
|
9131
9069
|
return [];
|
|
9132
9070
|
}
|
|
9133
9071
|
}
|
|
@@ -9137,7 +9075,7 @@ function createPluginFactory(PluginClass, key, validate) {
|
|
|
9137
9075
|
validate,
|
|
9138
9076
|
action: async ({ provider, purpose, injectVar, n, delayMs, config }) => {
|
|
9139
9077
|
if (PluginClass.canGenerateRemote === false || !require_server.shouldGenerateRemote()) {
|
|
9140
|
-
require_logger.
|
|
9078
|
+
require_logger.logger.debug(`Using local redteam generation for ${key}`);
|
|
9141
9079
|
return new PluginClass(provider, purpose, injectVar, config).generateTests(n, delayMs);
|
|
9142
9080
|
}
|
|
9143
9081
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9199,7 +9137,7 @@ const pluginFactories = [
|
|
|
9199
9137
|
key: category,
|
|
9200
9138
|
action: async (params) => {
|
|
9201
9139
|
if (require_server.neverGenerateRemote()) {
|
|
9202
|
-
require_logger.
|
|
9140
|
+
require_logger.logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9203
9141
|
return [];
|
|
9204
9142
|
}
|
|
9205
9143
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -9236,7 +9174,7 @@ const piiPlugins = require_types.PII_PLUGINS.map((category) => ({
|
|
|
9236
9174
|
}
|
|
9237
9175
|
}));
|
|
9238
9176
|
}
|
|
9239
|
-
require_logger.
|
|
9177
|
+
require_logger.logger.debug(`Using local redteam generation for ${category}`);
|
|
9240
9178
|
return (await require_graders.getPiiLeakTestsForCategory(params, category)).map((testCase) => ({
|
|
9241
9179
|
...testCase,
|
|
9242
9180
|
metadata: {
|
|
@@ -9250,7 +9188,7 @@ const biasPlugins = require_types.BIAS_PLUGINS.map((category) => ({
|
|
|
9250
9188
|
key: category,
|
|
9251
9189
|
action: async (params) => {
|
|
9252
9190
|
if (require_server.neverGenerateRemote()) {
|
|
9253
|
-
require_logger.
|
|
9191
|
+
require_logger.logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9254
9192
|
return [];
|
|
9255
9193
|
}
|
|
9256
9194
|
const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
|
|
@@ -9274,7 +9212,7 @@ function createRemotePlugin(key, validate) {
|
|
|
9274
9212
|
validate,
|
|
9275
9213
|
action: async ({ purpose, injectVar, n, config }) => {
|
|
9276
9214
|
if (require_server.neverGenerateRemote()) {
|
|
9277
|
-
require_logger.
|
|
9215
|
+
require_logger.logger.error(`${key} plugin requires remote generation to be enabled`);
|
|
9278
9216
|
return [];
|
|
9279
9217
|
}
|
|
9280
9218
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9307,7 +9245,6 @@ const Plugins = [
|
|
|
9307
9245
|
...biasPlugins,
|
|
9308
9246
|
...remotePlugins
|
|
9309
9247
|
];
|
|
9310
|
-
|
|
9311
9248
|
//#endregion
|
|
9312
9249
|
//#region src/redteam/sharpAvailability.ts
|
|
9313
9250
|
const SHARP_REQUIRED_STRATEGIES = ["image"];
|
|
@@ -9343,7 +9280,6 @@ async function validateSharpDependency(strategies, plugins, checkSharp = isSharp
|
|
|
9343
9280
|
throw new Error(`The sharp library is required for ${features.join(", ")} and must be manually installed separately.\nInstall it with: npm install sharp`);
|
|
9344
9281
|
}
|
|
9345
9282
|
}
|
|
9346
|
-
|
|
9347
9283
|
//#endregion
|
|
9348
9284
|
//#region src/redteam/index.ts
|
|
9349
9285
|
function getPolicyText(metadata) {
|
|
@@ -9562,7 +9498,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9562
9498
|
const newTestCases = [];
|
|
9563
9499
|
const strategyResults = {};
|
|
9564
9500
|
for (const strategy of strategies) {
|
|
9565
|
-
require_logger.
|
|
9501
|
+
require_logger.logger.debug(`Generating ${strategy.id} tests`);
|
|
9566
9502
|
let strategyAction;
|
|
9567
9503
|
if (strategy.id.startsWith("file://")) strategyAction = (await require_providers.loadStrategy(strategy.id)).action;
|
|
9568
9504
|
else {
|
|
@@ -9572,7 +9508,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9572
9508
|
builtinStrategy = require_providers.Strategies.find((s) => s.id === baseStrategyId);
|
|
9573
9509
|
}
|
|
9574
9510
|
if (!builtinStrategy) {
|
|
9575
|
-
require_logger.
|
|
9511
|
+
require_logger.logger.warn(`Strategy ${strategy.id} not registered, skipping`);
|
|
9576
9512
|
continue;
|
|
9577
9513
|
}
|
|
9578
9514
|
strategyAction = builtinStrategy.action;
|
|
@@ -9581,7 +9517,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9581
9517
|
const applicableTestCases = testCases.filter((t) => {
|
|
9582
9518
|
if (!require_providers.pluginMatchesStrategyTargets(t, strategy.id, targetPlugins)) return false;
|
|
9583
9519
|
if (t.metadata?.retry === true) {
|
|
9584
|
-
require_logger.
|
|
9520
|
+
require_logger.logger.debug(`Skipping ${strategy.id} for retry test (plugin: ${t.metadata?.pluginId}) - retry tests are not transformed`);
|
|
9585
9521
|
return false;
|
|
9586
9522
|
}
|
|
9587
9523
|
return true;
|
|
@@ -9589,26 +9525,26 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9589
9525
|
const numTestsLimit = strategy.config?.numTests;
|
|
9590
9526
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit >= 0) {
|
|
9591
9527
|
if (numTestsLimit === 0) {
|
|
9592
|
-
require_logger.
|
|
9528
|
+
require_logger.logger.warn(`[Strategy] ${strategy.id}: numTests=0 configured, skipping strategy`);
|
|
9593
9529
|
continue;
|
|
9594
9530
|
}
|
|
9595
9531
|
}
|
|
9596
9532
|
let testCasesToProcess = applicableTestCases;
|
|
9597
9533
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9598
9534
|
if (applicableTestCases.length > numTestsLimit) {
|
|
9599
|
-
require_logger.
|
|
9535
|
+
require_logger.logger.debug(`[Strategy] ${strategy.id}: Pre-limiting ${applicableTestCases.length} tests to numTests=${numTestsLimit}`);
|
|
9600
9536
|
testCasesToProcess = applicableTestCases.slice(0, numTestsLimit);
|
|
9601
9537
|
}
|
|
9602
9538
|
}
|
|
9603
9539
|
const strategyTestCases = await strategyAction(testCasesToProcess, injectVar, {
|
|
9604
9540
|
...strategy.config || {},
|
|
9605
|
-
redteamProvider: require_logger.
|
|
9541
|
+
redteamProvider: require_logger.state.config?.redteam?.provider,
|
|
9606
9542
|
excludeTargetOutputFromAgenticAttackGeneration
|
|
9607
9543
|
}, strategy.id);
|
|
9608
9544
|
let resultTestCases = strategyTestCases.filter((t) => t !== null && t !== void 0);
|
|
9609
9545
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9610
9546
|
if (resultTestCases.length > numTestsLimit) {
|
|
9611
|
-
require_logger.
|
|
9547
|
+
require_logger.logger.warn(`[Strategy] ${strategy.id}: Post-cap safety net applied (${resultTestCases.length} -> ${numTestsLimit}). Strategy generated more tests than input.`);
|
|
9612
9548
|
resultTestCases = resultTestCases.slice(0, numTestsLimit);
|
|
9613
9549
|
}
|
|
9614
9550
|
}
|
|
@@ -9755,11 +9691,11 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9755
9691
|
if (prompts.length === 0) throw new Error("Prompts array cannot be empty");
|
|
9756
9692
|
if (delay && maxConcurrency > 1) {
|
|
9757
9693
|
maxConcurrency = 1;
|
|
9758
|
-
require_logger.
|
|
9694
|
+
require_logger.logger.warn("Delay is enabled, setting max concurrency to 1.");
|
|
9759
9695
|
}
|
|
9760
9696
|
if (maxConcurrency > MAX_MAX_CONCURRENCY) {
|
|
9761
9697
|
maxConcurrency = MAX_MAX_CONCURRENCY;
|
|
9762
|
-
require_logger.
|
|
9698
|
+
require_logger.logger.info(`Max concurrency for test generation is capped at ${MAX_MAX_CONCURRENCY}.`);
|
|
9763
9699
|
}
|
|
9764
9700
|
const expandedStrategies = [];
|
|
9765
9701
|
strategies.forEach((strategy) => {
|
|
@@ -9771,7 +9707,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9771
9707
|
id: strategyId
|
|
9772
9708
|
});
|
|
9773
9709
|
});
|
|
9774
|
-
else require_logger.
|
|
9710
|
+
else require_logger.logger.warn(`Strategy collection ${strategy.id} has no mappings, skipping`);
|
|
9775
9711
|
} else expandedStrategies.push(strategy);
|
|
9776
9712
|
});
|
|
9777
9713
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -9786,7 +9722,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9786
9722
|
strategies = expandedStrategies.filter((strategy) => {
|
|
9787
9723
|
const key = keyForStrategy(strategy);
|
|
9788
9724
|
if (seen.has(key)) {
|
|
9789
|
-
require_logger.
|
|
9725
|
+
require_logger.logger.debug(`[Synthesize] Skipping duplicate strategy: ${key}`);
|
|
9790
9726
|
return false;
|
|
9791
9727
|
}
|
|
9792
9728
|
seen.add(key);
|
|
@@ -9797,7 +9733,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9797
9733
|
await validateSharpDependency(strategies, plugins);
|
|
9798
9734
|
const redteamProvider = await require_providers.redteamProviderManager.getProvider({ provider });
|
|
9799
9735
|
const { effectiveStrategyCount, includeBasicTests, totalPluginTests, totalTests } = calculateTotalTests(plugins, strategies, language);
|
|
9800
|
-
require_logger.
|
|
9736
|
+
require_logger.logger.info(`Synthesizing test cases for ${prompts.length} ${prompts.length === 1 ? "prompt" : "prompts"}...\nUsing plugins:\n\n${chalk.default.yellow(plugins.map((p) => {
|
|
9801
9737
|
const pluginLanguageConfig = p.config?.language ?? language;
|
|
9802
9738
|
const pluginLanguageCount = Array.isArray(pluginLanguageConfig) ? pluginLanguageConfig.length : 1;
|
|
9803
9739
|
const actualTestCount = (p.numTests || 0) * pluginLanguageCount;
|
|
@@ -9815,14 +9751,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9815
9751
|
configSummary = policyText.length > 70 ? policyText.slice(0, 70) + "..." : policyText;
|
|
9816
9752
|
}
|
|
9817
9753
|
} else configSummary = " (custom config)";
|
|
9818
|
-
require_logger.
|
|
9754
|
+
require_logger.logger.debug("Plugin config", {
|
|
9819
9755
|
pluginId: p.id,
|
|
9820
9756
|
config: p.config
|
|
9821
9757
|
});
|
|
9822
9758
|
}
|
|
9823
9759
|
return `${p.id} (${formatTestCount(actualTestCount, false)})${configSummary}`;
|
|
9824
9760
|
}).sort().join("\n"))}\n`);
|
|
9825
|
-
if (strategies.length > 0) require_logger.
|
|
9761
|
+
if (strategies.length > 0) require_logger.logger.info(`Using strategies:\n\n${chalk.default.yellow(strategies.filter((s) => !["basic", "retry"].includes(s.id)).map((s) => {
|
|
9826
9762
|
let testCount = totalPluginTests;
|
|
9827
9763
|
let n = 1;
|
|
9828
9764
|
if (typeof s.config?.n === "number") n = s.config.n;
|
|
@@ -9832,21 +9768,21 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9832
9768
|
if (typeof numTestsCap === "number" && Number.isFinite(numTestsCap) && numTestsCap >= 0) testCount = Math.min(testCount, numTestsCap);
|
|
9833
9769
|
return `${s.id} (${formatTestCount(testCount, true)})`;
|
|
9834
9770
|
}).sort().join("\n"))}\n`);
|
|
9835
|
-
require_logger.
|
|
9771
|
+
require_logger.logger.info(chalk.default.bold(`Test Generation Summary:`) + `\n• Total tests: ${chalk.default.cyan(totalTests)}\n• Plugin tests: ${chalk.default.cyan(totalPluginTests)}\n• Plugins: ${chalk.default.cyan(plugins.length)}\n• Strategies: ${chalk.default.cyan(effectiveStrategyCount)}\n• Max concurrency: ${chalk.default.cyan(maxConcurrency)}\n` + (delay ? `• Delay: ${chalk.default.cyan(delay)}\n` : ""));
|
|
9836
9772
|
const hasMultipleInputs = inputs && Object.keys(inputs).length > 0;
|
|
9837
9773
|
if (hasMultipleInputs) {
|
|
9838
9774
|
const inputKeys = Object.keys(inputs);
|
|
9839
|
-
require_logger.
|
|
9775
|
+
require_logger.logger.info(`Using multi-input mode with ${inputKeys.length} variables: ${inputKeys.join(", ")}`);
|
|
9840
9776
|
injectVar = require_types.MULTI_INPUT_VAR;
|
|
9841
9777
|
const multiInputExcluded = [...require_types.DATASET_EXEMPT_PLUGINS, ...require_types.MULTI_INPUT_EXCLUDED_PLUGINS];
|
|
9842
9778
|
const removedPlugins = plugins.filter((p) => multiInputExcluded.includes(p.id));
|
|
9843
9779
|
plugins = plugins.filter((p) => !multiInputExcluded.includes(p.id));
|
|
9844
|
-
if (removedPlugins.length > 0) require_logger.
|
|
9780
|
+
if (removedPlugins.length > 0) require_logger.logger.info(`Skipping ${removedPlugins.length} plugin${removedPlugins.length > 1 ? "s" : ""} in multi-input mode: ${removedPlugins.map((p) => p.id).join(", ")}`);
|
|
9845
9781
|
}
|
|
9846
9782
|
if (typeof injectVar !== "string") {
|
|
9847
9783
|
const parsedVars = require_util.extractVariablesFromTemplates(prompts);
|
|
9848
|
-
if (parsedVars.length > 1) require_logger.
|
|
9849
|
-
else if (parsedVars.length === 0) require_logger.
|
|
9784
|
+
if (parsedVars.length > 1) require_logger.logger.warn(`\nMultiple variables found in prompts: ${parsedVars.join(", ")}. Using the last one "${parsedVars[parsedVars.length - 1]}". Override this selection with --injectVar`);
|
|
9785
|
+
else if (parsedVars.length === 0) require_logger.logger.warn("No variables found in prompts. Using \"query\" as the inject variable.");
|
|
9850
9786
|
injectVar = parsedVars[parsedVars.length - 1] || "query";
|
|
9851
9787
|
require_invariant.invariant(typeof injectVar === "string", `Inject var must be a string, got ${injectVar}`);
|
|
9852
9788
|
}
|
|
@@ -9880,7 +9816,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9880
9816
|
if (Object.keys(categories).includes(plugin.id)) return false;
|
|
9881
9817
|
const registeredPlugin = Plugins.find((p) => p.key === plugin.id);
|
|
9882
9818
|
if (!registeredPlugin) {
|
|
9883
|
-
if (!plugin.id.startsWith("file://")) require_logger.
|
|
9819
|
+
if (!plugin.id.startsWith("file://")) require_logger.logger.debug(`Plugin ${plugin.id} not registered, skipping validation`);
|
|
9884
9820
|
} else if (registeredPlugin.validate) try {
|
|
9885
9821
|
registeredPlugin.validate({
|
|
9886
9822
|
language,
|
|
@@ -9891,24 +9827,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9891
9827
|
...resolvePluginConfig(plugin.config)
|
|
9892
9828
|
});
|
|
9893
9829
|
} catch (error) {
|
|
9894
|
-
require_logger.
|
|
9830
|
+
require_logger.logger.warn(`Validation failed for plugin ${plugin.id}: ${error}, skipping plugin.`);
|
|
9895
9831
|
return false;
|
|
9896
9832
|
}
|
|
9897
9833
|
return true;
|
|
9898
9834
|
};
|
|
9899
|
-
require_logger.
|
|
9835
|
+
require_logger.logger.debug("Validating plugins...");
|
|
9900
9836
|
plugins = [...new Set(expandedPlugins)].filter(validatePlugin).sort();
|
|
9901
9837
|
if (require_server.shouldGenerateRemote()) {
|
|
9902
9838
|
const healthUrl = require_server.getRemoteHealthUrl();
|
|
9903
9839
|
if (healthUrl) {
|
|
9904
|
-
require_logger.
|
|
9840
|
+
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
9905
9841
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
9906
9842
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with test generation: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
9907
|
-
require_logger.
|
|
9843
|
+
require_logger.logger.debug("API health check passed");
|
|
9908
9844
|
}
|
|
9909
9845
|
}
|
|
9910
9846
|
let progressBar = null;
|
|
9911
|
-
const showProgressBar = !Boolean(require_logger.
|
|
9847
|
+
const showProgressBar = !Boolean(require_logger.state.webUI) && require_logger.getEnvString("LOG_LEVEL") !== "debug" && require_logger.getLogLevel() !== "debug" && showProgressBarOverride !== false;
|
|
9912
9848
|
if (showProgressBar) {
|
|
9913
9849
|
progressBar = new cli_progress.default.SingleBar({
|
|
9914
9850
|
format: "Generating | {bar} | {percentage}% | {value}/{total} | {task}",
|
|
@@ -9917,24 +9853,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9917
9853
|
progressBar.start(totalTests, 0, { task: "Initializing" });
|
|
9918
9854
|
}
|
|
9919
9855
|
if (showProgressBar) progressBar?.update({ task: "Extracting system purpose" });
|
|
9920
|
-
else require_logger.
|
|
9856
|
+
else require_logger.logger.info("Extracting system purpose...");
|
|
9921
9857
|
const purpose = purposeOverride || await extractSystemPurpose(redteamProvider, prompts);
|
|
9922
9858
|
if (showProgressBar) progressBar?.update({ task: "Extracting entities" });
|
|
9923
|
-
else require_logger.
|
|
9859
|
+
else require_logger.logger.info("Extracting entities...");
|
|
9924
9860
|
const entities = Array.isArray(entitiesOverride) ? entitiesOverride : await extractEntities(redteamProvider, prompts);
|
|
9925
|
-
require_logger.
|
|
9861
|
+
require_logger.logger.debug(`System purpose: ${purpose}`);
|
|
9926
9862
|
const pluginResults = {};
|
|
9927
9863
|
const testCases = [];
|
|
9928
9864
|
await async.default.forEachLimit(plugins, maxConcurrency, async (plugin) => {
|
|
9929
9865
|
checkAbort();
|
|
9930
9866
|
if (showProgressBar) progressBar?.update({ task: plugin.id });
|
|
9931
|
-
else require_logger.
|
|
9867
|
+
else require_logger.logger.info(`Generating tests for ${plugin.id}...`);
|
|
9932
9868
|
const { action } = Plugins.find((p) => p.key === plugin.id) || {};
|
|
9933
9869
|
if (action) {
|
|
9934
|
-
require_logger.
|
|
9870
|
+
require_logger.logger.debug(`Generating tests for ${plugin.id}...`);
|
|
9935
9871
|
const languageConfig = plugin.config?.language ?? language;
|
|
9936
9872
|
const languages = Array.isArray(languageConfig) ? languageConfig : languageConfig ? [languageConfig] : [void 0];
|
|
9937
|
-
require_logger.
|
|
9873
|
+
require_logger.logger.debug(`[Language Processing] Plugin: ${plugin.id}, Languages: ${JSON.stringify(languages)}, NumTests per language: ${plugin.numTests}${plugin.config?.language ? " (plugin override)" : ""}`);
|
|
9938
9874
|
const allPluginTests = [];
|
|
9939
9875
|
const resultsPerLanguage = {};
|
|
9940
9876
|
const languagePromises = languages.map(async (lang) => {
|
|
@@ -9962,7 +9898,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9962
9898
|
requested: plugin.numTests,
|
|
9963
9899
|
generated: pluginTests.length
|
|
9964
9900
|
};
|
|
9965
|
-
require_logger.
|
|
9901
|
+
require_logger.logger.warn(`[Language Processing] No tests generated for ${plugin.id} in language: ${lang || "default"}`);
|
|
9966
9902
|
return {
|
|
9967
9903
|
lang: langKey,
|
|
9968
9904
|
tests: [],
|
|
@@ -9979,13 +9915,13 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9979
9915
|
requested,
|
|
9980
9916
|
generated
|
|
9981
9917
|
};
|
|
9982
|
-
} else require_logger.
|
|
9983
|
-
require_logger.
|
|
9984
|
-
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) require_logger.
|
|
9918
|
+
} else require_logger.logger.warn(`[Language Processing] Error generating tests for ${plugin.id}: ${result.reason}`);
|
|
9919
|
+
require_logger.logger.debug(`[Language Processing] Total tests generated for ${plugin.id}: ${allPluginTests.length} (across ${languages.length} language(s))`);
|
|
9920
|
+
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) require_logger.logger.warn(`Failed to generate tests for ${plugin.id}`);
|
|
9985
9921
|
else {
|
|
9986
9922
|
const testCasesWithMetadata = allPluginTests;
|
|
9987
9923
|
if (needsGoalExtraction) {
|
|
9988
|
-
require_logger.
|
|
9924
|
+
require_logger.logger.debug(`Extracting goal for ${testCasesWithMetadata.length} tests from ${plugin.id}...`);
|
|
9989
9925
|
for (const testCase of testCasesWithMetadata) {
|
|
9990
9926
|
const promptVar = testCase.vars?.[injectVar];
|
|
9991
9927
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -9997,8 +9933,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9997
9933
|
testCases.push(...testCasesWithMetadata);
|
|
9998
9934
|
}
|
|
9999
9935
|
if (showProgressBar) progressBar?.increment(plugin.numTests * languages.length);
|
|
10000
|
-
else require_logger.
|
|
10001
|
-
require_logger.
|
|
9936
|
+
else require_logger.logger.info(`Generated ${allPluginTests.length} tests for ${plugin.id}`);
|
|
9937
|
+
require_logger.logger.debug(`Added ${allPluginTests.length} ${plugin.id} test cases`);
|
|
10002
9938
|
const definedLanguages = languages.filter((lang) => lang !== void 0);
|
|
10003
9939
|
const baseDisplayId = getPluginDisplayId(plugin);
|
|
10004
9940
|
if (definedLanguages.length > 1) for (const [langKey, result] of Object.entries(resultsPerLanguage)) {
|
|
@@ -10028,7 +9964,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10028
9964
|
}
|
|
10029
9965
|
}));
|
|
10030
9966
|
if (needsGoalExtraction) {
|
|
10031
|
-
require_logger.
|
|
9967
|
+
require_logger.logger.debug(`Extracting goal for ${testCasesWithMetadata.length} custom tests from ${plugin.id}...`);
|
|
10032
9968
|
for (const testCase of testCasesWithMetadata) {
|
|
10033
9969
|
const promptVar = testCase.vars?.[injectVar];
|
|
10034
9970
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -10038,14 +9974,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10038
9974
|
}
|
|
10039
9975
|
}
|
|
10040
9976
|
testCases.push(...testCasesWithMetadata);
|
|
10041
|
-
require_logger.
|
|
9977
|
+
require_logger.logger.debug(`Added ${customTests.length} custom test cases from ${plugin.id}`);
|
|
10042
9978
|
const displayId = getPluginDisplayId(plugin);
|
|
10043
9979
|
pluginResults[displayId] = {
|
|
10044
9980
|
requested: plugin.numTests,
|
|
10045
9981
|
generated: customTests.length
|
|
10046
9982
|
};
|
|
10047
9983
|
} catch (e) {
|
|
10048
|
-
require_logger.
|
|
9984
|
+
require_logger.logger.error(`Error generating tests for custom plugin ${plugin.id}: ${e}`);
|
|
10049
9985
|
const displayId = getPluginDisplayId(plugin);
|
|
10050
9986
|
pluginResults[displayId] = {
|
|
10051
9987
|
requested: plugin.numTests,
|
|
@@ -10053,7 +9989,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10053
9989
|
};
|
|
10054
9990
|
}
|
|
10055
9991
|
else {
|
|
10056
|
-
require_logger.
|
|
9992
|
+
require_logger.logger.warn(`Plugin ${plugin.id} not registered, skipping`);
|
|
10057
9993
|
const displayId = getPluginDisplayId(plugin);
|
|
10058
9994
|
pluginResults[displayId] = {
|
|
10059
9995
|
requested: plugin.numTests,
|
|
@@ -10067,7 +10003,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10067
10003
|
const retryStrategy = strategies.find((s) => s.id === "retry");
|
|
10068
10004
|
if (retryStrategy) {
|
|
10069
10005
|
if (showProgressBar) progressBar?.update({ task: "Applying retry strategy" });
|
|
10070
|
-
require_logger.
|
|
10006
|
+
require_logger.logger.debug("Applying retry strategy first");
|
|
10071
10007
|
retryStrategy.config = {
|
|
10072
10008
|
targetIds,
|
|
10073
10009
|
...retryStrategy.config
|
|
@@ -10087,8 +10023,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10087
10023
|
checkAbort();
|
|
10088
10024
|
progressBar?.update({ task: "Done." });
|
|
10089
10025
|
progressBar?.stop();
|
|
10090
|
-
if (progressBar) require_logger.
|
|
10091
|
-
require_logger.
|
|
10026
|
+
if (progressBar) require_logger.logger.info("");
|
|
10027
|
+
require_logger.logger.info(generateReport(pluginResults, strategyResults));
|
|
10092
10028
|
const failedPlugins = Object.entries(pluginResults).filter(([_, { requested, generated }]) => requested > 0 && generated === 0).map(([pluginId, { requested }]) => ({
|
|
10093
10029
|
pluginId,
|
|
10094
10030
|
requested
|
|
@@ -10101,7 +10037,6 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10101
10037
|
failedPlugins
|
|
10102
10038
|
};
|
|
10103
10039
|
}
|
|
10104
|
-
|
|
10105
10040
|
//#endregion
|
|
10106
10041
|
//#region src/redteam/commands/generate.ts
|
|
10107
10042
|
/**
|
|
@@ -10128,8 +10063,8 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
10128
10063
|
- Retry the scan after resolving any reported errors
|
|
10129
10064
|
`;
|
|
10130
10065
|
if (strict) throw new require_types.PartialGenerationError(failedPlugins);
|
|
10131
|
-
require_logger.
|
|
10132
|
-
require_logger.
|
|
10066
|
+
require_logger.logger.warn(warningMessage);
|
|
10067
|
+
require_logger.logger.warn(chalk.default.yellow(`Continuing with partial results. Use ${chalk.default.bold("--strict")} flag to fail on plugin generation errors.`));
|
|
10133
10068
|
}
|
|
10134
10069
|
function getConfigHash(configPath) {
|
|
10135
10070
|
const content = fs.readFileSync(configPath, "utf8");
|
|
@@ -10156,12 +10091,12 @@ function createHeaderComments({ title, timestampLabel, author, cloudHost, testCa
|
|
|
10156
10091
|
async function doGenerateRedteam(options) {
|
|
10157
10092
|
require_util.setupEnv(options.envFile);
|
|
10158
10093
|
if (!options.cache) {
|
|
10159
|
-
require_logger.
|
|
10094
|
+
require_logger.logger.info("Cache is disabled");
|
|
10160
10095
|
require_cache.disableCache();
|
|
10161
10096
|
}
|
|
10162
10097
|
const probeLimitResult = checkRedteamProbeLimit();
|
|
10163
10098
|
if (!probeLimitResult.withinLimit) {
|
|
10164
|
-
require_logger.
|
|
10099
|
+
require_logger.logger.error(dedent.default`
|
|
10165
10100
|
${chalk.default.red.bold("Monthly probe limit reached")}
|
|
10166
10101
|
|
|
10167
10102
|
You've used ${chalk.default.bold(probeLimitResult.used.toLocaleString())} of your ${chalk.default.bold(MONTHLY_PROBE_LIMIT.toLocaleString())} free monthly probes.
|
|
@@ -10187,7 +10122,7 @@ async function doGenerateRedteam(options) {
|
|
|
10187
10122
|
fs.mkdirSync(path.default.dirname(tmpFile), { recursive: true });
|
|
10188
10123
|
fs.writeFileSync(tmpFile, js_yaml.default.dump(options.configFromCloud));
|
|
10189
10124
|
configPath = tmpFile;
|
|
10190
|
-
require_logger.
|
|
10125
|
+
require_logger.logger.debug(`Using Promptfoo Cloud-originated config at ${tmpFile}`);
|
|
10191
10126
|
}
|
|
10192
10127
|
let shouldGenerate = options.force || options.configFromCloud;
|
|
10193
10128
|
if (!options.force && !options.configFromCloud && fs.existsSync(outputPath) && configPath && fs.existsSync(configPath)) {
|
|
@@ -10195,7 +10130,7 @@ async function doGenerateRedteam(options) {
|
|
|
10195
10130
|
const redteamContent = js_yaml.default.load(fs.readFileSync(outputPath, "utf8"));
|
|
10196
10131
|
shouldGenerate = redteamContent.metadata?.configHash !== getConfigHash(configPath);
|
|
10197
10132
|
if (!shouldGenerate) {
|
|
10198
|
-
require_logger.
|
|
10133
|
+
require_logger.logger.warn("No changes detected in redteam configuration. Skipping generation (use --force to generate anyway)");
|
|
10199
10134
|
return redteamContent;
|
|
10200
10135
|
}
|
|
10201
10136
|
}
|
|
@@ -10209,7 +10144,7 @@ async function doGenerateRedteam(options) {
|
|
|
10209
10144
|
commandLineOptions = resolved.commandLineOptions;
|
|
10210
10145
|
resolvedConfig = resolved.config;
|
|
10211
10146
|
await require_providers.checkCloudPermissions(resolved.config);
|
|
10212
|
-
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) require_logger.
|
|
10147
|
+
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
10213
10148
|
⚠️ Warning: Found both 'tests' section and 'redteam' configuration in your config file.
|
|
10214
10149
|
|
|
10215
10150
|
The 'tests' section is ignored when generating red team tests. Red team automatically
|
|
@@ -10231,7 +10166,7 @@ async function doGenerateRedteam(options) {
|
|
|
10231
10166
|
}
|
|
10232
10167
|
}
|
|
10233
10168
|
} catch (error) {
|
|
10234
|
-
require_logger.
|
|
10169
|
+
require_logger.logger.error(`Plugin severity override check failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
10235
10170
|
}
|
|
10236
10171
|
} else if (options.purpose) testSuite = {
|
|
10237
10172
|
prompts: [],
|
|
@@ -10239,18 +10174,18 @@ async function doGenerateRedteam(options) {
|
|
|
10239
10174
|
tests: []
|
|
10240
10175
|
};
|
|
10241
10176
|
else {
|
|
10242
|
-
require_logger.
|
|
10177
|
+
require_logger.logger.info(chalk.default.red(`\nCan't generate without configuration - run ${chalk.default.yellow.bold(promptfooCommand("redteam init"))} first`));
|
|
10243
10178
|
return null;
|
|
10244
10179
|
}
|
|
10245
10180
|
if (!require_server.neverGenerateRemote()) {
|
|
10246
10181
|
let hasValidEmail = false;
|
|
10247
10182
|
while (!hasValidEmail) {
|
|
10248
10183
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
10249
|
-
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
10184
|
+
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
10250
10185
|
}
|
|
10251
10186
|
}
|
|
10252
10187
|
const startTime = Date.now();
|
|
10253
|
-
require_telemetry.
|
|
10188
|
+
require_telemetry.telemetry.record("command_used", {
|
|
10254
10189
|
name: "generate redteam - started",
|
|
10255
10190
|
numPrompts: testSuite.prompts.length,
|
|
10256
10191
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10258,7 +10193,7 @@ async function doGenerateRedteam(options) {
|
|
|
10258
10193
|
strategies: redteamConfig?.strategies?.map((s) => typeof s === "string" ? s : s.id) || [],
|
|
10259
10194
|
isPromptfooSampleTarget: testSuite.providers.some(require_fetch.isPromptfooSampleTarget)
|
|
10260
10195
|
});
|
|
10261
|
-
require_telemetry.
|
|
10196
|
+
require_telemetry.telemetry.record("redteam generate", {
|
|
10262
10197
|
phase: "started",
|
|
10263
10198
|
numPrompts: testSuite.prompts.length,
|
|
10264
10199
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10302,7 +10237,7 @@ async function doGenerateRedteam(options) {
|
|
|
10302
10237
|
}
|
|
10303
10238
|
return plugin;
|
|
10304
10239
|
});
|
|
10305
|
-
require_logger.
|
|
10240
|
+
require_logger.logger.info(`Applied ${intersectionCount} custom plugin severity levels`);
|
|
10306
10241
|
}
|
|
10307
10242
|
const policyPluginsWithRefs = plugins.filter((plugin) => plugin.config?.policy && require_graders.isValidPolicyObject(plugin.config?.policy) && require_graders.determinePolicyTypeFromId(plugin.config.policy.id) === "reusable");
|
|
10308
10243
|
if (policyPluginsWithRefs.length > 0) {
|
|
@@ -10325,18 +10260,18 @@ async function doGenerateRedteam(options) {
|
|
|
10325
10260
|
if (options.strategies) strategies = options.strategies;
|
|
10326
10261
|
const strategyObjs = strategies.map((s) => typeof s === "string" ? { id: s } : s);
|
|
10327
10262
|
try {
|
|
10328
|
-
require_logger.
|
|
10329
|
-
require_logger.
|
|
10263
|
+
require_logger.logger.debug(`plugins: ${plugins.map((p) => p.id).join(", ")}`);
|
|
10264
|
+
require_logger.logger.debug(`strategies: ${strategyObjs.map((s) => s.id ?? s).join(", ")}`);
|
|
10330
10265
|
} catch (error) {
|
|
10331
|
-
require_logger.
|
|
10332
|
-
require_logger.
|
|
10266
|
+
require_logger.logger.error("Error logging plugins and strategies. One did not have a valid id.");
|
|
10267
|
+
require_logger.logger.error(`Error details: ${error instanceof Error ? error.message : String(error)}`);
|
|
10333
10268
|
}
|
|
10334
10269
|
const targetInputs = testSuite.providers[0]?.inputs;
|
|
10335
10270
|
const config = {
|
|
10336
10271
|
injectVar: redteamConfig?.injectVar || options.injectVar,
|
|
10337
10272
|
inputs: targetInputs,
|
|
10338
10273
|
language: redteamConfig?.language || options.language,
|
|
10339
|
-
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ??
|
|
10274
|
+
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? 4,
|
|
10340
10275
|
numTests: redteamConfig?.numTests ?? options.numTests,
|
|
10341
10276
|
entities: redteamConfig?.entities,
|
|
10342
10277
|
plugins,
|
|
@@ -10357,18 +10292,18 @@ async function doGenerateRedteam(options) {
|
|
|
10357
10292
|
if (typeof target === "string") return target;
|
|
10358
10293
|
return target.id;
|
|
10359
10294
|
}).filter((id) => typeof id === "string") : []) ?? [];
|
|
10360
|
-
require_logger.
|
|
10295
|
+
require_logger.logger.debug(`Extracted ${targetIds.length} target IDs from config providers: ${JSON.stringify(targetIds)}`);
|
|
10361
10296
|
let enhancedPurpose = parsedConfig.data.purpose || "";
|
|
10362
10297
|
let augmentedTestGenerationInstructions = config.testGenerationInstructions ?? "";
|
|
10363
10298
|
try {
|
|
10364
10299
|
const mcpToolsInfo = await extractMcpToolsInfo(testSuite.providers);
|
|
10365
10300
|
if (mcpToolsInfo) {
|
|
10366
10301
|
enhancedPurpose = enhancedPurpose ? `${enhancedPurpose}\n\n${mcpToolsInfo}\n\n` : mcpToolsInfo;
|
|
10367
|
-
require_logger.
|
|
10302
|
+
require_logger.logger.info("Added MCP tools information to red team purpose");
|
|
10368
10303
|
augmentedTestGenerationInstructions += `\nGenerate every test case prompt as a json string encoding the tool call and parameters, and choose a specific function to call. The specific format should be: {"tool": "function_name", "args": {...}}.`;
|
|
10369
10304
|
}
|
|
10370
10305
|
} catch (error) {
|
|
10371
|
-
require_logger.
|
|
10306
|
+
require_logger.logger.warn(`Failed to extract MCP tools information: ${error instanceof Error ? error.message : String(error)}`);
|
|
10372
10307
|
}
|
|
10373
10308
|
const contexts = redteamConfig?.contexts;
|
|
10374
10309
|
let redteamTests = [];
|
|
@@ -10377,10 +10312,10 @@ async function doGenerateRedteam(options) {
|
|
|
10377
10312
|
let finalInjectVar = "";
|
|
10378
10313
|
let failedPlugins = [];
|
|
10379
10314
|
if (contexts && contexts.length > 0) {
|
|
10380
|
-
require_logger.
|
|
10315
|
+
require_logger.logger.info(`Generating tests for ${contexts.length} contexts...`);
|
|
10381
10316
|
const allFailedPlugins = [];
|
|
10382
10317
|
for (const context of contexts) {
|
|
10383
|
-
require_logger.
|
|
10318
|
+
require_logger.logger.info(` Generating tests for context: ${context.id}`);
|
|
10384
10319
|
const contextPurpose = context.purpose + (enhancedPurpose ? `\n\n${enhancedPurpose}` : "");
|
|
10385
10320
|
const contextResult = await synthesize({
|
|
10386
10321
|
...parsedConfig.data,
|
|
@@ -10415,7 +10350,7 @@ async function doGenerateRedteam(options) {
|
|
|
10415
10350
|
}
|
|
10416
10351
|
failedPlugins = allFailedPlugins;
|
|
10417
10352
|
purpose = contexts[0].purpose;
|
|
10418
|
-
require_logger.
|
|
10353
|
+
require_logger.logger.info(`Generated ${redteamTests.length} total test cases across ${contexts.length} contexts`);
|
|
10419
10354
|
} else {
|
|
10420
10355
|
const result = await synthesize({
|
|
10421
10356
|
...parsedConfig.data,
|
|
@@ -10444,20 +10379,20 @@ async function doGenerateRedteam(options) {
|
|
|
10444
10379
|
*/
|
|
10445
10380
|
const cleanupProvider = async () => {
|
|
10446
10381
|
try {
|
|
10447
|
-
require_logger.
|
|
10382
|
+
require_logger.logger.debug("Cleaning up provider");
|
|
10448
10383
|
const provider = testSuite.providers[0];
|
|
10449
10384
|
if (provider && typeof provider.cleanup === "function") {
|
|
10450
10385
|
const cleanupResult = provider.cleanup();
|
|
10451
10386
|
if (cleanupResult instanceof Promise) await cleanupResult;
|
|
10452
10387
|
}
|
|
10453
10388
|
} catch (cleanupErr) {
|
|
10454
|
-
require_logger.
|
|
10389
|
+
require_logger.logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
10455
10390
|
}
|
|
10456
10391
|
};
|
|
10457
10392
|
try {
|
|
10458
10393
|
handleFailedPlugins(failedPlugins, options.strict ?? false);
|
|
10459
10394
|
if (redteamTests.length === 0) {
|
|
10460
|
-
require_logger.
|
|
10395
|
+
require_logger.logger.warn("No test cases generated. Please check for errors and try again.");
|
|
10461
10396
|
return null;
|
|
10462
10397
|
}
|
|
10463
10398
|
const updatedRedteamConfig = {
|
|
@@ -10476,7 +10411,7 @@ async function doGenerateRedteam(options) {
|
|
|
10476
10411
|
return encodeURIComponent(value);
|
|
10477
10412
|
}).filter((line) => line.length > 0).join("\n");
|
|
10478
10413
|
fs.writeFileSync(options.output, outputLines);
|
|
10479
|
-
require_logger.
|
|
10414
|
+
require_logger.logger.info(chalk.default.green(`Wrote ${redteamTests.length} test cases to ${chalk.default.bold(options.output)}`));
|
|
10480
10415
|
return {};
|
|
10481
10416
|
} else if (options.output) {
|
|
10482
10417
|
const existingYaml = configPath ? js_yaml.default.load(fs.readFileSync(configPath, "utf8")) : {};
|
|
@@ -10515,8 +10450,8 @@ async function doGenerateRedteam(options) {
|
|
|
10515
10450
|
ret = writePromptfooConfig(updatedYaml, options.output, headerComments);
|
|
10516
10451
|
require_util.printBorder();
|
|
10517
10452
|
const relativeOutputPath = path.default.relative(process.cwd(), options.output);
|
|
10518
|
-
require_logger.
|
|
10519
|
-
if (!options.inRedteamRun) require_logger.
|
|
10453
|
+
require_logger.logger.info(`Wrote ${redteamTests.length} test cases to ${relativeOutputPath}`);
|
|
10454
|
+
if (!options.inRedteamRun) require_logger.logger.info("\n" + chalk.default.green(`Run ${chalk.default.bold(relativeOutputPath === "redteam.yaml" ? promptfooCommand("redteam eval") : promptfooCommand(`redteam eval -c ${relativeOutputPath}`))} to run the red team!`));
|
|
10520
10455
|
require_util.printBorder();
|
|
10521
10456
|
} else if (options.write && configPath) {
|
|
10522
10457
|
const existingConfig = js_yaml.default.load(fs.readFileSync(configPath, "utf8"));
|
|
@@ -10554,9 +10489,9 @@ async function doGenerateRedteam(options) {
|
|
|
10554
10489
|
isUpdate: true
|
|
10555
10490
|
});
|
|
10556
10491
|
ret = writePromptfooConfig(existingConfig, configPath, headerComments);
|
|
10557
|
-
require_logger.
|
|
10492
|
+
require_logger.logger.info(`\nWrote ${redteamTests.length} new test cases to ${path.default.relative(process.cwd(), configPath)}`);
|
|
10558
10493
|
const command = configPath.endsWith("promptfooconfig.yaml") ? promptfooCommand("eval") : promptfooCommand(`eval -c ${path.default.relative(process.cwd(), configPath)}`);
|
|
10559
|
-
require_logger.
|
|
10494
|
+
require_logger.logger.info("\n" + chalk.default.green(`Run ${chalk.default.bold(`${command}`)} to run the red team!`));
|
|
10560
10495
|
} else {
|
|
10561
10496
|
const headerComments = createHeaderComments({
|
|
10562
10497
|
title: "REDTEAM CONFIGURATION",
|
|
@@ -10572,7 +10507,7 @@ async function doGenerateRedteam(options) {
|
|
|
10572
10507
|
tests: redteamTests
|
|
10573
10508
|
}, "redteam.yaml", headerComments);
|
|
10574
10509
|
}
|
|
10575
|
-
require_telemetry.
|
|
10510
|
+
require_telemetry.telemetry.record("command_used", {
|
|
10576
10511
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10577
10512
|
name: "generate redteam",
|
|
10578
10513
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10582,7 +10517,7 @@ async function doGenerateRedteam(options) {
|
|
|
10582
10517
|
strategies: strategies.map((s) => typeof s === "string" ? s : s.id),
|
|
10583
10518
|
isPromptfooSampleTarget: testSuite.providers.some(require_fetch.isPromptfooSampleTarget)
|
|
10584
10519
|
});
|
|
10585
|
-
require_telemetry.
|
|
10520
|
+
require_telemetry.telemetry.record("redteam generate", {
|
|
10586
10521
|
phase: "completed",
|
|
10587
10522
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10588
10523
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10597,7 +10532,6 @@ async function doGenerateRedteam(options) {
|
|
|
10597
10532
|
await cleanupProvider();
|
|
10598
10533
|
}
|
|
10599
10534
|
}
|
|
10600
|
-
|
|
10601
10535
|
//#endregion
|
|
10602
10536
|
//#region src/util/inlineBlobsForShare.ts
|
|
10603
10537
|
const BLOB_URI_PREFIX = "promptfoo://blob/";
|
|
@@ -10663,7 +10597,7 @@ async function ensureBlobPayloads(hashes, cache) {
|
|
|
10663
10597
|
dataUrl: `data:${mimeType};base64,${base64}`
|
|
10664
10598
|
});
|
|
10665
10599
|
} catch (error) {
|
|
10666
|
-
require_logger.
|
|
10600
|
+
require_logger.logger.warn("[Share] Failed to inline blob reference", {
|
|
10667
10601
|
error,
|
|
10668
10602
|
hash
|
|
10669
10603
|
});
|
|
@@ -10709,7 +10643,6 @@ async function inlineBlobRefsForShare(value, cache) {
|
|
|
10709
10643
|
await ensureBlobPayloads(hashes, cache);
|
|
10710
10644
|
return await inlineValue(value, cache, /* @__PURE__ */ new WeakSet(), 0);
|
|
10711
10645
|
}
|
|
10712
|
-
|
|
10713
10646
|
//#endregion
|
|
10714
10647
|
//#region src/share.ts
|
|
10715
10648
|
function isSharingEnabled(evalRecord) {
|
|
@@ -10723,10 +10656,10 @@ function isSharingEnabled(evalRecord) {
|
|
|
10723
10656
|
}
|
|
10724
10657
|
function determineShareDomain(eval_) {
|
|
10725
10658
|
const sharing = eval_.config.sharing;
|
|
10726
|
-
require_logger.
|
|
10659
|
+
require_logger.logger.debug(`Share config: isCloudEnabled=${require_fetch.cloudConfig.isEnabled()}, sharing=${JSON.stringify(sharing)}, evalId=${eval_.id}`);
|
|
10727
10660
|
const envAppBaseUrl = require_logger.getEnvString("PROMPTFOO_REMOTE_APP_BASE_URL");
|
|
10728
10661
|
const domain = require_fetch.cloudConfig.isEnabled() ? require_fetch.cloudConfig.getAppUrl() : typeof sharing === "object" && sharing.appBaseUrl ? sharing.appBaseUrl : envAppBaseUrl || require_fetch.getDefaultShareViewBaseUrl();
|
|
10729
|
-
require_logger.
|
|
10662
|
+
require_logger.logger.debug(`Share domain determined: domain=${domain}`);
|
|
10730
10663
|
return { domain };
|
|
10731
10664
|
}
|
|
10732
10665
|
function getResultSize(result) {
|
|
@@ -10736,7 +10669,7 @@ function findLargestResultSize(results, sampleSize = 1e3) {
|
|
|
10736
10669
|
const sampleSizes = results.slice(0, Math.min(sampleSize, results.length)).map(getResultSize);
|
|
10737
10670
|
return Math.max(...sampleSizes);
|
|
10738
10671
|
}
|
|
10739
|
-
async function sendEvalRecord(evalRecord, url, headers) {
|
|
10672
|
+
async function sendEvalRecord(evalRecord, url$1, headers) {
|
|
10740
10673
|
const traces = await evalRecord.getTraces();
|
|
10741
10674
|
let evalData = {
|
|
10742
10675
|
...evalRecord,
|
|
@@ -10758,8 +10691,8 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10758
10691
|
};
|
|
10759
10692
|
}
|
|
10760
10693
|
const jsonData = JSON.stringify(evalData);
|
|
10761
|
-
require_logger.
|
|
10762
|
-
const response = await require_fetch.fetchWithProxy(url, {
|
|
10694
|
+
require_logger.logger.debug(`Sending initial eval data to ${url$1} - eval ${evalRecord.id} with ${evalRecord.prompts.length} prompts ${traces.length > 0 ? `and trace data` : ""}`);
|
|
10695
|
+
const response = await require_fetch.fetchWithProxy(url$1, {
|
|
10763
10696
|
method: "POST",
|
|
10764
10697
|
headers,
|
|
10765
10698
|
body: jsonData,
|
|
@@ -10767,10 +10700,10 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10767
10700
|
});
|
|
10768
10701
|
if (!response.ok) {
|
|
10769
10702
|
const responseBody = await response.text();
|
|
10770
|
-
const errorMessage = `Failed to send initial eval data to ${url}: ${response.statusText}`;
|
|
10703
|
+
const errorMessage = `Failed to send initial eval data to ${url$1}: ${response.statusText}`;
|
|
10771
10704
|
const bodyMessage = responseBody ? `\nResponse body: ${responseBody}` : "";
|
|
10772
10705
|
const debugInfo = {
|
|
10773
|
-
url,
|
|
10706
|
+
url: url$1,
|
|
10774
10707
|
statusCode: response.status,
|
|
10775
10708
|
statusText: response.statusText,
|
|
10776
10709
|
headers: Object.keys(headers),
|
|
@@ -10778,18 +10711,18 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10778
10711
|
errorMessage,
|
|
10779
10712
|
bodyMessage
|
|
10780
10713
|
};
|
|
10781
|
-
require_logger.
|
|
10714
|
+
require_logger.logger.error(`Sharing your eval data to ${url$1} failed. Debug info: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10782
10715
|
throw new Error(`${errorMessage}${bodyMessage}`);
|
|
10783
10716
|
}
|
|
10784
10717
|
const responseJson = await response.json();
|
|
10785
|
-
if (!responseJson.id) throw new Error(`Failed to send initial eval data to ${url}: ${response.statusText} ${responseJson}`);
|
|
10718
|
+
if (!responseJson.id) throw new Error(`Failed to send initial eval data to ${url$1}: ${response.statusText} ${responseJson}`);
|
|
10786
10719
|
return responseJson.id;
|
|
10787
10720
|
}
|
|
10788
|
-
async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
10789
|
-
const targetUrl = `${url}/${evalId}/results`;
|
|
10721
|
+
async function sendChunkOfResults(chunk, url$2, evalId, headers) {
|
|
10722
|
+
const targetUrl = `${url$2}/${evalId}/results`;
|
|
10790
10723
|
const stringifiedChunk = JSON.stringify(chunk);
|
|
10791
10724
|
const chunkSizeBytes = Buffer.byteLength(stringifiedChunk, "utf8");
|
|
10792
|
-
require_logger.
|
|
10725
|
+
require_logger.logger.debug(`Sending chunk of ${chunk.length} results (${(chunkSizeBytes / 1024 / 1024).toFixed(2)} MB) to ${targetUrl}`);
|
|
10793
10726
|
try {
|
|
10794
10727
|
const response = await require_fetch.fetchWithProxy(targetUrl, {
|
|
10795
10728
|
method: "POST",
|
|
@@ -10809,7 +10742,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10809
10742
|
evalId,
|
|
10810
10743
|
responseBody: responseBody.length > 500 ? `${responseBody.slice(0, 500)}...` : responseBody
|
|
10811
10744
|
};
|
|
10812
|
-
require_logger.
|
|
10745
|
+
require_logger.logger.debug(`Chunk send failed: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10813
10746
|
if (response.status === 413) return {
|
|
10814
10747
|
success: false,
|
|
10815
10748
|
errorType: "PAYLOAD_TOO_LARGE",
|
|
@@ -10824,7 +10757,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10824
10757
|
return { success: true };
|
|
10825
10758
|
} catch (error) {
|
|
10826
10759
|
if (error instanceof TypeError && error.message === "fetch failed") {
|
|
10827
|
-
require_logger.
|
|
10760
|
+
require_logger.logger.debug(`Network timeout/failure for chunk of ${chunk.length} results`);
|
|
10828
10761
|
return {
|
|
10829
10762
|
success: false,
|
|
10830
10763
|
errorType: "NETWORK_TIMEOUT",
|
|
@@ -10842,11 +10775,11 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10842
10775
|
* Attempts to send a chunk of results, splitting it in half on retryable failures.
|
|
10843
10776
|
* Uses recursive splitting to handle chunks that are too large.
|
|
10844
10777
|
*/
|
|
10845
|
-
async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgress, depth = 0, maxDepth) {
|
|
10778
|
+
async function sendChunkWithRetry(chunk, url$3, evalId, headers, config, onProgress, depth = 0, maxDepth) {
|
|
10846
10779
|
const effectiveMaxDepth = maxDepth ?? Math.ceil(Math.log2(chunk.length / config.minResultsPerChunk)) + 1;
|
|
10847
10780
|
if (depth > effectiveMaxDepth) throw new Error(`Maximum retry depth exceeded. Cannot send chunk of ${chunk.length} results.`);
|
|
10848
10781
|
if (chunk.length === 0) return 0;
|
|
10849
|
-
const result = await sendChunkOfResults(chunk, url, evalId, headers);
|
|
10782
|
+
const result = await sendChunkOfResults(chunk, url$3, evalId, headers);
|
|
10850
10783
|
if (result.success) {
|
|
10851
10784
|
onProgress(chunk.length);
|
|
10852
10785
|
return chunk.length;
|
|
@@ -10856,41 +10789,41 @@ async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgres
|
|
|
10856
10789
|
const midpoint = Math.ceil(chunk.length / 2);
|
|
10857
10790
|
const firstHalf = chunk.slice(0, midpoint);
|
|
10858
10791
|
const secondHalf = chunk.slice(midpoint);
|
|
10859
|
-
require_logger.
|
|
10860
|
-
return await sendChunkWithRetry(firstHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10792
|
+
require_logger.logger.info(`Chunk of ${chunk.length} results failed (${result.errorType}). Splitting into ${firstHalf.length} + ${secondHalf.length} and retrying...`);
|
|
10793
|
+
return await sendChunkWithRetry(firstHalf, url$3, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url$3, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10861
10794
|
}
|
|
10862
10795
|
throw result.originalError ?? /* @__PURE__ */ new Error("Unknown error sending chunk");
|
|
10863
10796
|
}
|
|
10864
|
-
async function rollbackEval(url, evalId, headers) {
|
|
10865
|
-
const targetUrl = `${url}/${evalId}`;
|
|
10866
|
-
require_logger.
|
|
10797
|
+
async function rollbackEval(url$4, evalId, headers) {
|
|
10798
|
+
const targetUrl = `${url$4}/${evalId}`;
|
|
10799
|
+
require_logger.logger.debug(`Attempting to roll back eval ${evalId} at ${targetUrl}`);
|
|
10867
10800
|
try {
|
|
10868
10801
|
const response = await require_fetch.fetchWithProxy(targetUrl, {
|
|
10869
10802
|
method: "DELETE",
|
|
10870
10803
|
headers
|
|
10871
10804
|
});
|
|
10872
|
-
if (response.ok) require_logger.
|
|
10873
|
-
else require_logger.
|
|
10805
|
+
if (response.ok) require_logger.logger.debug(`Successfully rolled back eval ${evalId}`);
|
|
10806
|
+
else require_logger.logger.warn(`Rollback request returned non-OK status: ${response.statusText}`);
|
|
10874
10807
|
} catch (e) {
|
|
10875
|
-
require_logger.
|
|
10808
|
+
require_logger.logger.warn(`Failed to roll back eval ${evalId}: ${e}. You may need to manually delete this eval.`);
|
|
10876
10809
|
}
|
|
10877
10810
|
}
|
|
10878
|
-
async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
10811
|
+
async function sendChunkedResults(evalRecord, url$5, options = {}) {
|
|
10879
10812
|
const isVerbose = require_logger.isDebugEnabled();
|
|
10880
10813
|
const { silent = false } = options;
|
|
10881
|
-
require_logger.
|
|
10814
|
+
require_logger.logger.debug(`Starting chunked results upload to ${url$5}`);
|
|
10882
10815
|
await require_providers.checkCloudPermissions(evalRecord.config);
|
|
10883
10816
|
const inlineBlobs = require_extractor.isBlobStorageEnabled() && require_logger.getEnvBool("PROMPTFOO_SHARE_INLINE_BLOBS", !require_fetch.cloudConfig.isEnabled());
|
|
10884
10817
|
const inlineCache = inlineBlobs ? createBlobInlineCache() : null;
|
|
10885
10818
|
let sampleResults = (await evalRecord.fetchResultsBatched(100).next()).value ?? [];
|
|
10886
10819
|
if (sampleResults.length === 0) {
|
|
10887
|
-
require_logger.
|
|
10820
|
+
require_logger.logger.debug(`No results found`);
|
|
10888
10821
|
return null;
|
|
10889
10822
|
}
|
|
10890
10823
|
if (inlineBlobs && inlineCache) sampleResults = await inlineBlobRefsForShare(sampleResults, inlineCache);
|
|
10891
|
-
require_logger.
|
|
10824
|
+
require_logger.logger.debug(`Loaded ${sampleResults.length} sample results to determine chunk size`);
|
|
10892
10825
|
const largestSize = findLargestResultSize(sampleResults);
|
|
10893
|
-
require_logger.
|
|
10826
|
+
require_logger.logger.debug(`Largest result size from sample: ${largestSize} bytes`);
|
|
10894
10827
|
const TARGET_CHUNK_SIZE = .9 * 1024 * 1024;
|
|
10895
10828
|
const envChunkSize = require_logger.getEnvInt("PROMPTFOO_SHARE_CHUNK_SIZE");
|
|
10896
10829
|
const calculatedChunkSize = Math.max(1, Math.floor(TARGET_CHUNK_SIZE / largestSize));
|
|
@@ -10899,11 +10832,11 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10899
10832
|
minResultsPerChunk: 1,
|
|
10900
10833
|
maxResultsPerChunk: resultsPerChunk
|
|
10901
10834
|
};
|
|
10902
|
-
require_logger.
|
|
10835
|
+
require_logger.logger.debug(`Chunk config: ${JSON.stringify(chunkConfig)}`);
|
|
10903
10836
|
const headers = { "Content-Type": "application/json" };
|
|
10904
10837
|
if (require_fetch.cloudConfig.isEnabled()) headers["Authorization"] = `Bearer ${require_fetch.cloudConfig.getApiKey()}`;
|
|
10905
10838
|
const totalResults = await evalRecord.getTotalResultRowCount();
|
|
10906
|
-
require_logger.
|
|
10839
|
+
require_logger.logger.debug(`Total results to share: ${totalResults}`);
|
|
10907
10840
|
let progressBar = null;
|
|
10908
10841
|
if (!isVerbose && !require_logger.isCI() && !silent) {
|
|
10909
10842
|
progressBar = new cli_progress.default.SingleBar({
|
|
@@ -10914,13 +10847,13 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10914
10847
|
}
|
|
10915
10848
|
let evalId;
|
|
10916
10849
|
try {
|
|
10917
|
-
evalId = await sendEvalRecord(evalRecord, url, headers);
|
|
10918
|
-
require_logger.
|
|
10850
|
+
evalId = await sendEvalRecord(evalRecord, url$5, headers);
|
|
10851
|
+
require_logger.logger.debug(`Initial eval data sent successfully - ${evalId}`);
|
|
10919
10852
|
let totalSent = 0;
|
|
10920
10853
|
const onProgress = (sentCount) => {
|
|
10921
10854
|
totalSent += sentCount;
|
|
10922
10855
|
if (progressBar) progressBar.update(totalSent);
|
|
10923
|
-
else require_logger.
|
|
10856
|
+
else require_logger.logger.info(`Progress: ${totalSent}/${totalResults} results shared (${Math.round(totalSent / totalResults * 100)}%)`);
|
|
10924
10857
|
};
|
|
10925
10858
|
let currentChunk = [];
|
|
10926
10859
|
let chunkNumber = 0;
|
|
@@ -10928,24 +10861,24 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10928
10861
|
currentChunk.push(result);
|
|
10929
10862
|
if (currentChunk.length >= resultsPerChunk) {
|
|
10930
10863
|
chunkNumber++;
|
|
10931
|
-
require_logger.
|
|
10932
|
-
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10864
|
+
require_logger.logger.debug(`Sending chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10865
|
+
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url$5, evalId, headers, chunkConfig, onProgress);
|
|
10933
10866
|
currentChunk = [];
|
|
10934
10867
|
}
|
|
10935
10868
|
}
|
|
10936
10869
|
if (currentChunk.length > 0) {
|
|
10937
10870
|
chunkNumber++;
|
|
10938
|
-
require_logger.
|
|
10939
|
-
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10871
|
+
require_logger.logger.debug(`Sending final chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10872
|
+
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url$5, evalId, headers, chunkConfig, onProgress);
|
|
10940
10873
|
}
|
|
10941
|
-
require_logger.
|
|
10874
|
+
require_logger.logger.debug(`Sharing complete. Total chunks sent: ${chunkNumber}, Total results: ${totalSent}`);
|
|
10942
10875
|
return evalId;
|
|
10943
10876
|
} catch (e) {
|
|
10944
10877
|
if (progressBar) progressBar.stop();
|
|
10945
|
-
require_logger.
|
|
10878
|
+
require_logger.logger.error(`Upload failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
10946
10879
|
if (evalId) {
|
|
10947
|
-
require_logger.
|
|
10948
|
-
await rollbackEval(url, evalId, headers);
|
|
10880
|
+
require_logger.logger.info(`Upload failed, rolling back...`);
|
|
10881
|
+
await rollbackEval(url$5, evalId, headers);
|
|
10949
10882
|
}
|
|
10950
10883
|
return null;
|
|
10951
10884
|
} finally {
|
|
@@ -10965,12 +10898,12 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10965
10898
|
*/
|
|
10966
10899
|
function stripAuthFromUrl(urlString) {
|
|
10967
10900
|
try {
|
|
10968
|
-
const url = new url.URL(urlString);
|
|
10969
|
-
url.username = "";
|
|
10970
|
-
url.password = "";
|
|
10971
|
-
return url.toString();
|
|
10901
|
+
const url$6 = new url.URL(urlString);
|
|
10902
|
+
url$6.username = "";
|
|
10903
|
+
url$6.password = "";
|
|
10904
|
+
return url$6.toString();
|
|
10972
10905
|
} catch {
|
|
10973
|
-
require_logger.
|
|
10906
|
+
require_logger.logger.warn("Failed to parse URL, returning original");
|
|
10974
10907
|
return urlString;
|
|
10975
10908
|
}
|
|
10976
10909
|
}
|
|
@@ -11013,26 +10946,25 @@ async function getShareableUrl(eval_, remoteEvalId, showAuth = false) {
|
|
|
11013
10946
|
async function createShareableUrl(evalRecord, options = {}) {
|
|
11014
10947
|
const { silent = false, showAuth = false } = options;
|
|
11015
10948
|
if (require_logger.getEnvBool("PROMPTFOO_DISABLE_SHARING")) {
|
|
11016
|
-
require_logger.
|
|
10949
|
+
require_logger.logger.debug("Sharing is explicitly disabled, returning null");
|
|
11017
10950
|
return null;
|
|
11018
10951
|
}
|
|
11019
10952
|
if (!silent) {
|
|
11020
10953
|
const orgContext = await require_providers.getOrgContext();
|
|
11021
10954
|
if (orgContext) {
|
|
11022
10955
|
const teamSuffix = orgContext.teamName ? ` > ${orgContext.teamName}` : "";
|
|
11023
|
-
require_logger.
|
|
10956
|
+
require_logger.logger.info(`${chalk.default.dim("Sharing to:")} ${chalk.default.cyan(orgContext.organizationName)}${teamSuffix}`);
|
|
11024
10957
|
}
|
|
11025
10958
|
}
|
|
11026
10959
|
await handleEmailCollection(evalRecord);
|
|
11027
|
-
const { url } = await getApiConfig(evalRecord);
|
|
10960
|
+
const { url: url$7 } = await getApiConfig(evalRecord);
|
|
11028
10961
|
const canUseNewResults = require_fetch.cloudConfig.isEnabled();
|
|
11029
|
-
require_logger.
|
|
11030
|
-
const evalId = await sendChunkedResults(evalRecord, url, { silent });
|
|
10962
|
+
require_logger.logger.debug(`Sharing with ${url$7} canUseNewResults: ${canUseNewResults} Use old results: ${evalRecord.useOldResults()}`);
|
|
10963
|
+
const evalId = await sendChunkedResults(evalRecord, url$7, { silent });
|
|
11031
10964
|
if (!evalId) return null;
|
|
11032
|
-
require_logger.
|
|
10965
|
+
require_logger.logger.debug(`New eval ID on remote instance: ${evalId}`);
|
|
11033
10966
|
return getShareableUrl(evalRecord, evalId, showAuth);
|
|
11034
10967
|
}
|
|
11035
|
-
|
|
11036
10968
|
//#endregion
|
|
11037
10969
|
//#region src/table.ts
|
|
11038
10970
|
function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
@@ -11053,7 +10985,6 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
11053
10985
|
})]);
|
|
11054
10986
|
return table.toString();
|
|
11055
10987
|
}
|
|
11056
|
-
|
|
11057
10988
|
//#endregion
|
|
11058
10989
|
//#region src/util/config/default.ts
|
|
11059
10990
|
/**
|
|
@@ -11093,7 +11024,6 @@ async function loadDefaultConfig(dir, configName = "promptfooconfig") {
|
|
|
11093
11024
|
function clearConfigCache() {
|
|
11094
11025
|
configCache.clear();
|
|
11095
11026
|
}
|
|
11096
|
-
|
|
11097
11027
|
//#endregion
|
|
11098
11028
|
//#region src/util/sharing.ts
|
|
11099
11029
|
/**
|
|
@@ -11121,7 +11051,6 @@ function shouldShareResults(opts) {
|
|
|
11121
11051
|
const sharing = require_fetch.cloudConfig.getSharing();
|
|
11122
11052
|
return require_fetch.cloudConfig.isEnabled() && sharing !== false;
|
|
11123
11053
|
}
|
|
11124
|
-
|
|
11125
11054
|
//#endregion
|
|
11126
11055
|
//#region src/util/formatDuration.ts
|
|
11127
11056
|
/**
|
|
@@ -11141,7 +11070,6 @@ function formatDuration(seconds) {
|
|
|
11141
11070
|
result += `${remainingSeconds}s`;
|
|
11142
11071
|
return result;
|
|
11143
11072
|
}
|
|
11144
|
-
|
|
11145
11073
|
//#endregion
|
|
11146
11074
|
//#region src/commands/eval/summary.ts
|
|
11147
11075
|
/**
|
|
@@ -11293,7 +11221,6 @@ function generateEvalSummary(params) {
|
|
|
11293
11221
|
lines.push("");
|
|
11294
11222
|
return lines;
|
|
11295
11223
|
}
|
|
11296
|
-
|
|
11297
11224
|
//#endregion
|
|
11298
11225
|
//#region src/commands/retry.ts
|
|
11299
11226
|
/**
|
|
@@ -11309,7 +11236,7 @@ async function getErrorResultIds(evalId) {
|
|
|
11309
11236
|
async function deleteErrorResults(resultIds) {
|
|
11310
11237
|
if (resultIds.length === 0) return;
|
|
11311
11238
|
await require_tables.getDb().delete(require_tables.evalResultsTable).where((0, drizzle_orm.inArray)(require_tables.evalResultsTable.id, resultIds));
|
|
11312
|
-
require_logger.
|
|
11239
|
+
require_logger.logger.debug(`Deleted ${resultIds.length} error results from database`);
|
|
11313
11240
|
}
|
|
11314
11241
|
const RECALCULATE_BATCH_SIZE = 1e3;
|
|
11315
11242
|
/**
|
|
@@ -11317,7 +11244,7 @@ const RECALCULATE_BATCH_SIZE = 1e3;
|
|
|
11317
11244
|
* Uses streaming batched iteration to avoid OOM with large evaluations (40K+ results).
|
|
11318
11245
|
*/
|
|
11319
11246
|
async function recalculatePromptMetrics(evalRecord) {
|
|
11320
|
-
require_logger.
|
|
11247
|
+
require_logger.logger.debug("Recalculating prompt metrics after deleting ERROR results");
|
|
11321
11248
|
const startTime = Date.now();
|
|
11322
11249
|
let batchNumber = 0;
|
|
11323
11250
|
let totalProcessed = 0;
|
|
@@ -11339,12 +11266,12 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11339
11266
|
try {
|
|
11340
11267
|
for await (const batch of evalRecord.fetchResultsBatched(RECALCULATE_BATCH_SIZE)) {
|
|
11341
11268
|
batchNumber++;
|
|
11342
|
-
require_logger.
|
|
11269
|
+
require_logger.logger.debug(`Processing batch ${batchNumber} with ${batch.length} results`);
|
|
11343
11270
|
for (const result of batch) {
|
|
11344
11271
|
currentResultId = result.id;
|
|
11345
11272
|
const metrics = promptMetricsMap.get(result.promptIdx);
|
|
11346
11273
|
if (!metrics) {
|
|
11347
|
-
require_logger.
|
|
11274
|
+
require_logger.logger.debug(`Skipping result with invalid promptIdx: ${result.promptIdx}`, {
|
|
11348
11275
|
resultId: result.id,
|
|
11349
11276
|
evalId: evalRecord.id
|
|
11350
11277
|
});
|
|
@@ -11378,7 +11305,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11378
11305
|
totalProcessed += batch.length;
|
|
11379
11306
|
}
|
|
11380
11307
|
} catch (error) {
|
|
11381
|
-
require_logger.
|
|
11308
|
+
require_logger.logger.error("Error during batched metrics recalculation", {
|
|
11382
11309
|
phase: "calculation",
|
|
11383
11310
|
batchNumber,
|
|
11384
11311
|
totalProcessed,
|
|
@@ -11392,7 +11319,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11392
11319
|
if (evalRecord.persisted) try {
|
|
11393
11320
|
await evalRecord.addPrompts(evalRecord.prompts);
|
|
11394
11321
|
} catch (error) {
|
|
11395
|
-
require_logger.
|
|
11322
|
+
require_logger.logger.error("Error saving recalculated prompt metrics", {
|
|
11396
11323
|
phase: "save",
|
|
11397
11324
|
evalId: evalRecord.id,
|
|
11398
11325
|
promptCount: evalRecord.prompts.length,
|
|
@@ -11401,19 +11328,18 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11401
11328
|
throw error;
|
|
11402
11329
|
}
|
|
11403
11330
|
const durationMs = Date.now() - startTime;
|
|
11404
|
-
require_logger.
|
|
11331
|
+
require_logger.logger.debug("Prompt metrics recalculation completed", {
|
|
11405
11332
|
totalBatches: batchNumber,
|
|
11406
11333
|
totalResults: totalProcessed,
|
|
11407
11334
|
durationMs
|
|
11408
11335
|
});
|
|
11409
11336
|
}
|
|
11410
|
-
|
|
11411
11337
|
//#endregion
|
|
11412
11338
|
//#region src/commands/share.ts
|
|
11413
11339
|
function notCloudEnabledShareInstructions() {
|
|
11414
11340
|
const cloudUrl = require_fetch.getDefaultShareViewBaseUrl();
|
|
11415
11341
|
const welcomeUrl = `${cloudUrl}/welcome`;
|
|
11416
|
-
require_logger.
|
|
11342
|
+
require_logger.logger.info(dedent.default`
|
|
11417
11343
|
|
|
11418
11344
|
» You need to have a cloud account to securely share your results.
|
|
11419
11345
|
|
|
@@ -11422,10 +11348,7 @@ function notCloudEnabledShareInstructions() {
|
|
|
11422
11348
|
3. Run ${chalk.default.greenBright.bold("promptfoo share")}
|
|
11423
11349
|
`);
|
|
11424
11350
|
}
|
|
11425
|
-
|
|
11426
|
-
//#endregion
|
|
11427
|
-
//#region src/commands/eval.ts
|
|
11428
|
-
const EvalCommandSchema = require_types.CommandLineOptionsSchema.extend({
|
|
11351
|
+
require_types.CommandLineOptionsSchema.extend({
|
|
11429
11352
|
help: zod.z.boolean().optional(),
|
|
11430
11353
|
interactiveProviders: zod.z.boolean().optional(),
|
|
11431
11354
|
remote: zod.z.boolean().optional(),
|
|
@@ -11435,7 +11358,7 @@ const EvalCommandSchema = require_types.CommandLineOptionsSchema.extend({
|
|
|
11435
11358
|
resume: zod.z.union([zod.z.string(), zod.z.boolean()]).optional()
|
|
11436
11359
|
}).partial();
|
|
11437
11360
|
function showRedteamProviderLabelMissingWarning(testSuite) {
|
|
11438
|
-
if (testSuite.providers.some((p) => !p.label)) require_logger.
|
|
11361
|
+
if (testSuite.providers.some((p) => !p.label)) require_logger.logger.warn(dedent.default`
|
|
11439
11362
|
${chalk.default.bold.yellow("Warning")}: Your target (provider) does not have a label specified.
|
|
11440
11363
|
|
|
11441
11364
|
Labels are used to uniquely identify redteam targets. Please set a meaningful and unique label (e.g., 'helpdesk-search-agent') for your targets/providers in your redteam config.
|
|
@@ -11466,7 +11389,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11466
11389
|
}
|
|
11467
11390
|
const runEvaluation = async (initialization) => {
|
|
11468
11391
|
const startTime = Date.now();
|
|
11469
|
-
require_telemetry.
|
|
11392
|
+
require_telemetry.telemetry.record("command_used", {
|
|
11470
11393
|
name: "eval - started",
|
|
11471
11394
|
watch: Boolean(cmdObj.watch),
|
|
11472
11395
|
...Boolean(config?.redteam) && { isRedteam: true }
|
|
@@ -11481,19 +11404,19 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11481
11404
|
for (const configPath of configPaths) if (fs.default.existsSync(configPath) && fs.default.statSync(configPath).isDirectory()) {
|
|
11482
11405
|
const { defaultConfig: dirConfig, defaultConfigPath: newConfigPath } = await loadDefaultConfig(configPath);
|
|
11483
11406
|
if (newConfigPath) {
|
|
11484
|
-
cmdObj.config = cmdObj.config.filter((path) => path !== configPath);
|
|
11407
|
+
cmdObj.config = cmdObj.config.filter((path$6) => path$6 !== configPath);
|
|
11485
11408
|
cmdObj.config.push(newConfigPath);
|
|
11486
11409
|
defaultConfig = {
|
|
11487
11410
|
...defaultConfig,
|
|
11488
11411
|
...dirConfig
|
|
11489
11412
|
};
|
|
11490
|
-
} else require_logger.
|
|
11413
|
+
} else require_logger.logger.warn(`No configuration file found in directory: ${configPath}. Looked for promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}. Run "${promptfooCommand("init")}" or pass --config path/to/promptfooconfig.yaml.`);
|
|
11491
11414
|
}
|
|
11492
11415
|
}
|
|
11493
11416
|
const resumeRaw = cmdObj.resume;
|
|
11494
11417
|
const retryErrors = cmdObj.retryErrors;
|
|
11495
11418
|
if (resumeRaw && retryErrors) {
|
|
11496
|
-
require_logger.
|
|
11419
|
+
require_logger.logger.error(chalk.default.red("Cannot use --resume and --retry-errors together. Please use one or the other."));
|
|
11497
11420
|
process.exitCode = 1;
|
|
11498
11421
|
return new Eval({}, { persisted: false });
|
|
11499
11422
|
}
|
|
@@ -11501,45 +11424,45 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11501
11424
|
const resumeId = resumeRaw === true || resumeRaw === void 0 ? "latest" : resumeRaw;
|
|
11502
11425
|
if (resumeRaw) {
|
|
11503
11426
|
if (cmdObj.write === false) {
|
|
11504
|
-
require_logger.
|
|
11427
|
+
require_logger.logger.error(chalk.default.red("Cannot use --resume with --no-write. Resume functionality requires database persistence."));
|
|
11505
11428
|
process.exitCode = 1;
|
|
11506
11429
|
return new Eval({}, { persisted: false });
|
|
11507
11430
|
}
|
|
11508
11431
|
resumeEval = resumeId === "latest" ? await Eval.latest() : await Eval.findById(resumeId);
|
|
11509
11432
|
if (!resumeEval) {
|
|
11510
|
-
require_logger.
|
|
11433
|
+
require_logger.logger.error(`Could not find evaluation to resume: ${resumeId}`);
|
|
11511
11434
|
process.exitCode = 1;
|
|
11512
11435
|
return new Eval({}, { persisted: false });
|
|
11513
11436
|
}
|
|
11514
|
-
require_logger.
|
|
11437
|
+
require_logger.logger.info(chalk.default.cyan(`Resuming evaluation ${resumeEval.id}...`));
|
|
11515
11438
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11516
11439
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
11517
11440
|
raw: p.raw,
|
|
11518
11441
|
label: p.label,
|
|
11519
11442
|
config: p.config
|
|
11520
11443
|
}));
|
|
11521
|
-
require_logger.
|
|
11444
|
+
require_logger.state.resume = true;
|
|
11522
11445
|
} else if (retryErrors) {
|
|
11523
11446
|
if (cmdObj.write === false) {
|
|
11524
|
-
require_logger.
|
|
11447
|
+
require_logger.logger.error(chalk.default.red("Cannot use --retry-errors with --no-write. Retry functionality requires database persistence."));
|
|
11525
11448
|
process.exitCode = 1;
|
|
11526
11449
|
return new Eval({}, { persisted: false });
|
|
11527
11450
|
}
|
|
11528
|
-
require_logger.
|
|
11451
|
+
require_logger.logger.info("🔄 Retrying ERROR results from latest evaluation...");
|
|
11529
11452
|
const latestEval = await Eval.latest();
|
|
11530
11453
|
if (!latestEval) {
|
|
11531
|
-
require_logger.
|
|
11454
|
+
require_logger.logger.error("No previous evaluation found to retry errors from");
|
|
11532
11455
|
process.exitCode = 1;
|
|
11533
11456
|
return new Eval({}, { persisted: false });
|
|
11534
11457
|
}
|
|
11535
11458
|
const errorResultIds = await getErrorResultIds(latestEval.id);
|
|
11536
11459
|
if (errorResultIds.length === 0) {
|
|
11537
|
-
require_logger.
|
|
11460
|
+
require_logger.logger.info("✅ No ERROR results found in the latest evaluation");
|
|
11538
11461
|
return latestEval;
|
|
11539
11462
|
}
|
|
11540
|
-
require_logger.
|
|
11541
|
-
require_logger.
|
|
11542
|
-
require_logger.
|
|
11463
|
+
require_logger.logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
|
|
11464
|
+
require_logger.state._retryErrorResultIds = errorResultIds;
|
|
11465
|
+
require_logger.logger.info(`🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`);
|
|
11543
11466
|
resumeEval = latestEval;
|
|
11544
11467
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11545
11468
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
@@ -11547,20 +11470,20 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11547
11470
|
label: p.label,
|
|
11548
11471
|
config: p.config
|
|
11549
11472
|
}));
|
|
11550
|
-
require_logger.
|
|
11551
|
-
require_logger.
|
|
11473
|
+
require_logger.state.resume = true;
|
|
11474
|
+
require_logger.state.retryMode = true;
|
|
11552
11475
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
11553
11476
|
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
11554
|
-
require_logger.
|
|
11477
|
+
require_logger.logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
11555
11478
|
require_util.setupEnv(commandLineOptions.envPath);
|
|
11556
11479
|
}
|
|
11557
|
-
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) require_logger.
|
|
11480
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
11558
11481
|
Warning: Config file has a redteam section but no test cases.
|
|
11559
11482
|
Did you mean to run ${chalk.default.bold("promptfoo redteam generate")} instead?
|
|
11560
11483
|
`));
|
|
11561
11484
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
11562
11485
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
11563
|
-
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) require_telemetry.
|
|
11486
|
+
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) require_telemetry.telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
11564
11487
|
}
|
|
11565
11488
|
if (config.evaluateOptions) evaluateOptions = {
|
|
11566
11489
|
...evaluateOptions,
|
|
@@ -11574,25 +11497,25 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11574
11497
|
const persisted = resumeEval?.runtimeOptions || config.evaluateOptions || {};
|
|
11575
11498
|
repeat = Number.isSafeInteger(persisted.repeat || 0) && persisted.repeat > 0 ? persisted.repeat : 1;
|
|
11576
11499
|
cache = persisted.cache ?? true;
|
|
11577
|
-
maxConcurrency = persisted.maxConcurrency ??
|
|
11500
|
+
maxConcurrency = persisted.maxConcurrency ?? 4;
|
|
11578
11501
|
delay = persisted.delay ?? 0;
|
|
11579
11502
|
} else {
|
|
11580
11503
|
const iterations = cmdObj.repeat ?? commandLineOptions?.repeat ?? evaluateOptions.repeat ?? NaN;
|
|
11581
11504
|
repeat = Number.isSafeInteger(iterations) && iterations > 0 ? iterations : 1;
|
|
11582
11505
|
cache = cmdObj.cache ?? commandLineOptions?.cache ?? evaluateOptions.cache ?? true;
|
|
11583
|
-
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ??
|
|
11506
|
+
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ?? 4;
|
|
11584
11507
|
delay = cmdObj.delay ?? commandLineOptions?.delay ?? evaluateOptions.delay ?? 0;
|
|
11585
11508
|
}
|
|
11586
11509
|
if (cache === false || repeat > 1) {
|
|
11587
|
-
require_logger.
|
|
11510
|
+
require_logger.logger.info("Cache is disabled.");
|
|
11588
11511
|
require_cache.disableCache();
|
|
11589
11512
|
}
|
|
11590
11513
|
const explicitMaxConcurrency = resumeRaw ? (resumeEval?.runtimeOptions)?.maxConcurrency ?? cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency : cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency;
|
|
11591
11514
|
if (delay > 0) {
|
|
11592
11515
|
maxConcurrency = 1;
|
|
11593
|
-
require_logger.
|
|
11594
|
-
require_logger.
|
|
11595
|
-
} else if (explicitMaxConcurrency !== void 0) require_logger.
|
|
11516
|
+
require_logger.state.maxConcurrency = 1;
|
|
11517
|
+
require_logger.logger.info(`Running at concurrency=1 because ${delay}ms delay was requested between API calls`);
|
|
11518
|
+
} else if (explicitMaxConcurrency !== void 0) require_logger.state.maxConcurrency = explicitMaxConcurrency;
|
|
11596
11519
|
if (!resumeEval) {
|
|
11597
11520
|
const filterOptions = {
|
|
11598
11521
|
failing: cmdObj.filterFailing,
|
|
@@ -11609,17 +11532,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11609
11532
|
let hasValidEmail = false;
|
|
11610
11533
|
while (!hasValidEmail) {
|
|
11611
11534
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
11612
|
-
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
11535
|
+
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
11613
11536
|
}
|
|
11614
11537
|
}
|
|
11615
11538
|
if (!resumeEval) testSuite.providers = filterProviders(testSuite.providers, cmdObj.filterProviders || cmdObj.filterTargets);
|
|
11616
11539
|
const missingApiKeys = require_util.checkProviderApiKeys(testSuite.providers);
|
|
11617
11540
|
if (missingApiKeys.size > 0) {
|
|
11618
|
-
for (const [envVar, providerIds] of missingApiKeys) require_logger.
|
|
11619
|
-
require_logger.
|
|
11620
|
-
require_logger.
|
|
11621
|
-
for (const envVar of missingApiKeys.keys()) require_logger.
|
|
11622
|
-
require_logger.
|
|
11541
|
+
for (const [envVar, providerIds] of missingApiKeys) require_logger.logger.error(chalk.default.red(` ✗ Missing ${envVar} (${providerIds.join(", ")})`));
|
|
11542
|
+
require_logger.logger.error("");
|
|
11543
|
+
require_logger.logger.error(`To fix, set the environment variable or use ${chalk.default.bold("--env-file")}:`);
|
|
11544
|
+
for (const envVar of missingApiKeys.keys()) require_logger.logger.error(` export ${envVar}=your-api-key-here`);
|
|
11545
|
+
require_logger.logger.error("");
|
|
11623
11546
|
process.exitCode = 1;
|
|
11624
11547
|
return new Eval({}, { persisted: false });
|
|
11625
11548
|
}
|
|
@@ -11636,12 +11559,12 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11636
11559
|
if (typeof testSuite.defaultTest === "string") testSuite.defaultTest = {};
|
|
11637
11560
|
testSuite.defaultTest = testSuite.defaultTest || {};
|
|
11638
11561
|
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
|
|
11639
|
-
testSuite.defaultTest.options.provider = await require_providers.loadApiProvider(cmdObj.grader, { basePath: require_logger.
|
|
11640
|
-
if (require_logger.
|
|
11641
|
-
if (typeof require_logger.
|
|
11642
|
-
require_logger.
|
|
11643
|
-
require_logger.
|
|
11644
|
-
require_logger.
|
|
11562
|
+
testSuite.defaultTest.options.provider = await require_providers.loadApiProvider(cmdObj.grader, { basePath: require_logger.state.basePath });
|
|
11563
|
+
if (require_logger.state.config) {
|
|
11564
|
+
if (typeof require_logger.state.config.defaultTest === "string") require_logger.state.config.defaultTest = {};
|
|
11565
|
+
require_logger.state.config.defaultTest = require_logger.state.config.defaultTest || {};
|
|
11566
|
+
require_logger.state.config.defaultTest.options = require_logger.state.config.defaultTest.options || {};
|
|
11567
|
+
require_logger.state.config.defaultTest.options.provider = testSuite.defaultTest.options.provider;
|
|
11645
11568
|
}
|
|
11646
11569
|
}
|
|
11647
11570
|
if (!resumeEval && cmdObj.var) {
|
|
@@ -11659,7 +11582,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11659
11582
|
}
|
|
11660
11583
|
for (const scenario of testSuite.scenarios || []) if (scenario.tests) scenario.tests = await require_util.maybeLoadFromExternalFile(scenario.tests);
|
|
11661
11584
|
const testSuiteSchema = require_types.TestSuiteSchema.safeParse(testSuite);
|
|
11662
|
-
if (!testSuiteSchema.success) require_logger.
|
|
11585
|
+
if (!testSuiteSchema.success) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
11663
11586
|
TestSuite Schema Validation Error:
|
|
11664
11587
|
|
|
11665
11588
|
${zod.z.prettifyError(testSuiteSchema.error)}
|
|
@@ -11692,13 +11615,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11692
11615
|
clearTimeout(forceExitTimeout);
|
|
11693
11616
|
forceExitTimeout = void 0;
|
|
11694
11617
|
}
|
|
11695
|
-
require_logger.
|
|
11618
|
+
require_logger.logger.warn("Force exiting...");
|
|
11696
11619
|
process.exit(130);
|
|
11697
11620
|
}
|
|
11698
|
-
require_logger.
|
|
11621
|
+
require_logger.logger.info(chalk.default.yellow("Pausing evaluation... Press Ctrl+C again to force exit."));
|
|
11699
11622
|
abortController.abort();
|
|
11700
11623
|
forceExitTimeout = setTimeout(() => {
|
|
11701
|
-
require_logger.
|
|
11624
|
+
require_logger.logger.warn("Evaluation shutdown timed out, force exiting...");
|
|
11702
11625
|
process.exit(130);
|
|
11703
11626
|
}, 1e4).unref();
|
|
11704
11627
|
};
|
|
@@ -11712,27 +11635,27 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11712
11635
|
abortSignal: evaluateOptions.abortSignal,
|
|
11713
11636
|
isRedteam: Boolean(config.redteam)
|
|
11714
11637
|
});
|
|
11715
|
-
if (retryErrors && require_logger.
|
|
11716
|
-
const errorResultIds = require_logger.
|
|
11638
|
+
if (retryErrors && require_logger.state._retryErrorResultIds && !paused) {
|
|
11639
|
+
const errorResultIds = require_logger.state._retryErrorResultIds;
|
|
11717
11640
|
try {
|
|
11718
11641
|
await deleteErrorResults(errorResultIds);
|
|
11719
11642
|
await recalculatePromptMetrics(ret);
|
|
11720
|
-
require_logger.
|
|
11643
|
+
require_logger.logger.debug(`Cleaned up ${errorResultIds.length} old ERROR results after successful retry`);
|
|
11721
11644
|
} catch (cleanupError) {
|
|
11722
|
-
require_logger.
|
|
11645
|
+
require_logger.logger.warn("Post-retry cleanup had issues. Retry results are saved.", { error: cleanupError });
|
|
11723
11646
|
} finally {
|
|
11724
|
-
delete require_logger.
|
|
11725
|
-
require_logger.
|
|
11647
|
+
delete require_logger.state._retryErrorResultIds;
|
|
11648
|
+
require_logger.state.retryMode = false;
|
|
11726
11649
|
}
|
|
11727
11650
|
}
|
|
11728
11651
|
} finally {
|
|
11729
11652
|
cleanupHandler();
|
|
11730
11653
|
}
|
|
11731
|
-
require_logger.
|
|
11654
|
+
require_logger.state.resume = false;
|
|
11732
11655
|
if (paused && cmdObj.write !== false) {
|
|
11733
11656
|
require_util.printBorder();
|
|
11734
|
-
require_logger.
|
|
11735
|
-
require_logger.
|
|
11657
|
+
require_logger.logger.info(`${chalk.default.yellow("⏸")} Evaluation paused. ID: ${chalk.default.cyan(evalRecord.id)}`);
|
|
11658
|
+
require_logger.logger.info(`» Resume with: ${chalk.default.green.bold("promptfoo eval --resume " + evalRecord.id)}`);
|
|
11736
11659
|
require_util.printBorder();
|
|
11737
11660
|
return ret;
|
|
11738
11661
|
}
|
|
@@ -11745,8 +11668,8 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11745
11668
|
});
|
|
11746
11669
|
const hasExplicitDisable = cmdObj.share === false || cmdObj.noShare === true || require_logger.getEnvBool("PROMPTFOO_DISABLE_SHARING");
|
|
11747
11670
|
const canShareEval = isSharingEnabled(evalRecord);
|
|
11748
|
-
require_logger.
|
|
11749
|
-
require_logger.
|
|
11671
|
+
require_logger.logger.debug(`Wants to share: ${wantsToShare}`);
|
|
11672
|
+
require_logger.logger.debug(`Can share eval: ${canShareEval}`);
|
|
11750
11673
|
const willShare = wantsToShare && canShareEval;
|
|
11751
11674
|
let sharePromise = null;
|
|
11752
11675
|
if (willShare) sharePromise = createShareableUrl(evalRecord, { silent: true });
|
|
@@ -11765,13 +11688,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11765
11688
|
if (cmdObj.table && require_logger.getLogLevel() !== "debug" && totalTests < 500) {
|
|
11766
11689
|
const table = await evalRecord.getTable();
|
|
11767
11690
|
const outputTable = generateTable(table);
|
|
11768
|
-
require_logger.
|
|
11691
|
+
require_logger.logger.info("\n" + outputTable.toString());
|
|
11769
11692
|
if (table.body.length > 25) {
|
|
11770
11693
|
const rowsLeft = table.body.length - 25;
|
|
11771
|
-
require_logger.
|
|
11694
|
+
require_logger.logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? "" : "s"} not shown ...\n`);
|
|
11772
11695
|
}
|
|
11773
|
-
} else if (failures !== 0) require_logger.
|
|
11774
|
-
if (totalTests >= 500) require_logger.
|
|
11696
|
+
} else if (failures !== 0) require_logger.logger.debug(`At least one evaluation failure occurred. This might be caused by the underlying call to the provider, or a test failure. Context: \n${JSON.stringify(evalRecord.prompts)}`);
|
|
11697
|
+
if (totalTests >= 500) require_logger.logger.info("Skipping table output because there are more than 500 tests.");
|
|
11775
11698
|
const { outputPath } = config;
|
|
11776
11699
|
const paths = (Array.isArray(outputPath) ? outputPath : [outputPath]).filter((p) => typeof p === "string" && p.length > 0 && !p.endsWith(".jsonl"));
|
|
11777
11700
|
const isRedteam = Boolean(config.redteam);
|
|
@@ -11797,13 +11720,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11797
11720
|
targetErrorStatus
|
|
11798
11721
|
});
|
|
11799
11722
|
if (cmdObj.write && wantsToShare && !canShareEval) {
|
|
11800
|
-
require_logger.
|
|
11723
|
+
require_logger.logger.info(summaryLines[0]);
|
|
11801
11724
|
notCloudEnabledShareInstructions();
|
|
11802
11725
|
for (let i = 1; i < summaryLines.length; i++) if (summaryLines[i].includes("View results:")) {
|
|
11803
11726
|
while (i < summaryLines.length && !summaryLines[i].includes("Total Tokens:")) i++;
|
|
11804
11727
|
i--;
|
|
11805
|
-
} else require_logger.
|
|
11806
|
-
} else for (const line of summaryLines) require_logger.
|
|
11728
|
+
} else require_logger.logger.info(summaryLines[i]);
|
|
11729
|
+
} else for (const line of summaryLines) require_logger.logger.info(line);
|
|
11807
11730
|
let shareableUrl = null;
|
|
11808
11731
|
if (sharePromise != null) {
|
|
11809
11732
|
const orgContext = await require_providers.getOrgContext();
|
|
@@ -11822,24 +11745,24 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11822
11745
|
} else spinner.fail(chalk.default.red("Share failed"));
|
|
11823
11746
|
} catch (error) {
|
|
11824
11747
|
spinner.fail(chalk.default.red("Share failed"));
|
|
11825
|
-
require_logger.
|
|
11748
|
+
require_logger.logger.debug(`Share error: ${error}`);
|
|
11826
11749
|
}
|
|
11827
11750
|
} else try {
|
|
11828
11751
|
shareableUrl = await sharePromise;
|
|
11829
11752
|
if (shareableUrl) {
|
|
11830
11753
|
evalRecord.shared = true;
|
|
11831
|
-
require_logger.
|
|
11754
|
+
require_logger.logger.info(`${chalk.default.dim("»")} ${chalk.default.green("✓")} ${shareableUrl}`);
|
|
11832
11755
|
}
|
|
11833
11756
|
} catch (error) {
|
|
11834
|
-
require_logger.
|
|
11757
|
+
require_logger.logger.debug(`Share error: ${error}`);
|
|
11835
11758
|
}
|
|
11836
11759
|
}
|
|
11837
|
-
require_logger.
|
|
11760
|
+
require_logger.logger.debug(`Shareable URL: ${shareableUrl}`);
|
|
11838
11761
|
if (paths.length) {
|
|
11839
11762
|
await require_util.writeMultipleOutputs(paths, evalRecord, shareableUrl);
|
|
11840
|
-
require_logger.
|
|
11763
|
+
require_logger.logger.info(chalk.default.yellow(`Writing output to ${paths.join(", ")}`));
|
|
11841
11764
|
}
|
|
11842
|
-
require_telemetry.
|
|
11765
|
+
require_telemetry.telemetry.record("command_used", {
|
|
11843
11766
|
name: "eval",
|
|
11844
11767
|
watch: Boolean(cmdObj.watch),
|
|
11845
11768
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
@@ -11849,7 +11772,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11849
11772
|
if (initialization) {
|
|
11850
11773
|
const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean);
|
|
11851
11774
|
if (!configPaths.length) {
|
|
11852
|
-
require_logger.
|
|
11775
|
+
require_logger.logger.error(`Could not locate config file(s) to watch. Pass --config path/to/promptfooconfig.yaml or run from a directory containing promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}.`);
|
|
11853
11776
|
process.exitCode = 1;
|
|
11854
11777
|
return ret;
|
|
11855
11778
|
}
|
|
@@ -11877,19 +11800,19 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11877
11800
|
chokidar.default.watch(watchPaths, {
|
|
11878
11801
|
ignored: /^\./,
|
|
11879
11802
|
persistent: true
|
|
11880
|
-
}).on("change", async (path) => {
|
|
11803
|
+
}).on("change", async (path$7) => {
|
|
11881
11804
|
require_util.printBorder();
|
|
11882
|
-
require_logger.
|
|
11805
|
+
require_logger.logger.info(`File change detected: ${path$7}`);
|
|
11883
11806
|
require_util.printBorder();
|
|
11884
11807
|
clearConfigCache();
|
|
11885
11808
|
await runEvaluation();
|
|
11886
|
-
}).on("error", (error) => require_logger.
|
|
11809
|
+
}).on("error", (error) => require_logger.logger.error(`Watcher error: ${error}`)).on("ready", () => watchPaths.forEach((watchPath) => require_logger.logger.info(`Watching for file changes on ${watchPath} ...`)));
|
|
11887
11810
|
}
|
|
11888
11811
|
} else {
|
|
11889
11812
|
const passRateThreshold = require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD", 100);
|
|
11890
11813
|
const failedTestExitCode = require_logger.getEnvInt("PROMPTFOO_FAILED_TEST_EXIT_CODE", 100);
|
|
11891
11814
|
if (passRate < (Number.isFinite(passRateThreshold) ? passRateThreshold : 100)) {
|
|
11892
|
-
if (require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) require_logger.
|
|
11815
|
+
if (require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) require_logger.logger.info(chalk.default.white(`Pass rate ${chalk.default.red.bold(passRate.toFixed(2))}${chalk.default.red("%")} is below the threshold of ${chalk.default.red.bold(passRateThreshold)}${chalk.default.red("%")}`));
|
|
11893
11816
|
process.exitCode = Number.isSafeInteger(failedTestExitCode) ? failedTestExitCode : 100;
|
|
11894
11817
|
return ret;
|
|
11895
11818
|
}
|
|
@@ -11905,7 +11828,6 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11905
11828
|
};
|
|
11906
11829
|
return await runEvaluation(true);
|
|
11907
11830
|
}
|
|
11908
|
-
|
|
11909
11831
|
//#endregion
|
|
11910
11832
|
//#region src/util/verboseToggle.ts
|
|
11911
11833
|
let isVerboseToggleEnabled = false;
|
|
@@ -11968,7 +11890,6 @@ function initVerboseToggle() {
|
|
|
11968
11890
|
function disableVerboseToggle() {
|
|
11969
11891
|
if (cleanupFn) cleanupFn();
|
|
11970
11892
|
}
|
|
11971
|
-
|
|
11972
11893
|
//#endregion
|
|
11973
11894
|
//#region src/redteam/shared.ts
|
|
11974
11895
|
async function doRedteamRun(options) {
|
|
@@ -11985,13 +11906,13 @@ async function doRedteamRun(options) {
|
|
|
11985
11906
|
try {
|
|
11986
11907
|
const healthUrl = require_server.getRemoteHealthUrl();
|
|
11987
11908
|
if (healthUrl) {
|
|
11988
|
-
require_logger.
|
|
11909
|
+
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
11989
11910
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
11990
11911
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with redteam: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
11991
|
-
require_logger.
|
|
11912
|
+
require_logger.logger.debug("API health check passed");
|
|
11992
11913
|
}
|
|
11993
11914
|
} catch (error) {
|
|
11994
|
-
require_logger.
|
|
11915
|
+
require_logger.logger.warn(`API health check failed with error: ${error}.\nPlease check your API configuration or try again later.`);
|
|
11995
11916
|
}
|
|
11996
11917
|
if (options.liveRedteamConfig) {
|
|
11997
11918
|
const filename = `redteam-${Date.now()}.yaml`;
|
|
@@ -12001,10 +11922,10 @@ async function doRedteamRun(options) {
|
|
|
12001
11922
|
fs.writeFileSync(tmpFile, js_yaml.default.dump(options.liveRedteamConfig));
|
|
12002
11923
|
redteamPath = tmpFile;
|
|
12003
11924
|
configPath = tmpFile;
|
|
12004
|
-
require_logger.
|
|
12005
|
-
require_logger.
|
|
11925
|
+
require_logger.logger.debug(`Using live config from ${tmpFile}`);
|
|
11926
|
+
require_logger.logger.debug(`Live config: ${JSON.stringify(options.liveRedteamConfig, null, 2)}`);
|
|
12006
11927
|
}
|
|
12007
|
-
require_logger.
|
|
11928
|
+
require_logger.logger.info("Generating test cases...");
|
|
12008
11929
|
const { maxConcurrency, ...passThroughOptions } = options;
|
|
12009
11930
|
let redteamConfig;
|
|
12010
11931
|
const generationStartTime = Date.now();
|
|
@@ -12024,7 +11945,7 @@ async function doRedteamRun(options) {
|
|
|
12024
11945
|
});
|
|
12025
11946
|
} catch (error) {
|
|
12026
11947
|
if (error instanceof require_types.PartialGenerationError) {
|
|
12027
|
-
require_logger.
|
|
11948
|
+
require_logger.logger.error(chalk.default.red("\n" + error.message));
|
|
12028
11949
|
require_logger.setLogCallback(null);
|
|
12029
11950
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12030
11951
|
throw error;
|
|
@@ -12033,11 +11954,11 @@ async function doRedteamRun(options) {
|
|
|
12033
11954
|
}
|
|
12034
11955
|
const generationDurationMs = Date.now() - generationStartTime;
|
|
12035
11956
|
if (!redteamConfig || !fs.existsSync(redteamPath)) {
|
|
12036
|
-
require_logger.
|
|
11957
|
+
require_logger.logger.info("No test cases generated. Skipping scan.");
|
|
12037
11958
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12038
11959
|
return;
|
|
12039
11960
|
}
|
|
12040
|
-
require_logger.
|
|
11961
|
+
require_logger.logger.info("Running scan...");
|
|
12041
11962
|
const { defaultConfig } = await loadDefaultConfig();
|
|
12042
11963
|
const { description: _description, ...evalOptions } = options;
|
|
12043
11964
|
const evalResult = await doEval({
|
|
@@ -12059,16 +11980,15 @@ async function doRedteamRun(options) {
|
|
|
12059
11980
|
if (evalResult.persisted) await evalResult.save();
|
|
12060
11981
|
const totalMs = evalResult.durationMs ?? 0;
|
|
12061
11982
|
const evalMs = evalResult.evaluationDurationMs ?? 0;
|
|
12062
|
-
require_logger.
|
|
11983
|
+
require_logger.logger.info(chalk.default.gray(`Total scan time: ${formatDuration(totalMs / 1e3)} (generation: ${formatDuration(generationDurationMs / 1e3)}, evaluation: ${formatDuration(evalMs / 1e3)})`));
|
|
12063
11984
|
}
|
|
12064
|
-
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else require_logger.
|
|
12065
|
-
if (!evalResult?.shared) if (options.liveRedteamConfig) require_logger.
|
|
12066
|
-
else require_logger.
|
|
11985
|
+
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else require_logger.logger.info(chalk.default.green("\nRed team scan complete!"));
|
|
11986
|
+
if (!evalResult?.shared) if (options.liveRedteamConfig) require_logger.logger.info(chalk.default.blue(`To view the results, click the ${chalk.default.bold("View Report")} button or run ${chalk.default.bold(promptfooCommand("redteam report"))} on the command line.`));
|
|
11987
|
+
else require_logger.logger.info(chalk.default.blue(`To view the results, run ${chalk.default.bold(promptfooCommand("redteam report"))}`));
|
|
12067
11988
|
require_logger.setLogCallback(null);
|
|
12068
11989
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12069
11990
|
return evalResult;
|
|
12070
11991
|
}
|
|
12071
|
-
|
|
12072
11992
|
//#endregion
|
|
12073
11993
|
//#region src/index.ts
|
|
12074
11994
|
async function evaluate(testSuite, options = {}) {
|
|
@@ -12093,23 +12013,23 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12093
12013
|
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
12094
12014
|
if (constructedTestSuite.defaultTest?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
12095
12015
|
env: testSuite.env,
|
|
12096
|
-
basePath: require_logger.
|
|
12016
|
+
basePath: require_logger.state.basePath
|
|
12097
12017
|
});
|
|
12098
12018
|
if (constructedTestSuite.defaultTest?.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
12099
12019
|
env: testSuite.env,
|
|
12100
|
-
basePath: require_logger.
|
|
12020
|
+
basePath: require_logger.state.basePath
|
|
12101
12021
|
});
|
|
12102
12022
|
}
|
|
12103
12023
|
for (const test of constructedTestSuite.tests || []) {
|
|
12104
12024
|
if (test.options?.provider && !require_types.isApiProvider(test.options.provider)) test.options.provider = await require_providers.resolveProvider(test.options.provider, providerMap, {
|
|
12105
12025
|
env: testSuite.env,
|
|
12106
|
-
basePath: require_logger.
|
|
12026
|
+
basePath: require_logger.state.basePath
|
|
12107
12027
|
});
|
|
12108
12028
|
if (test.assert) for (const assertion of test.assert) {
|
|
12109
12029
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
12110
12030
|
if (assertion.provider && !require_types.isApiProvider(assertion.provider)) assertion.provider = await require_providers.resolveProvider(assertion.provider, providerMap, {
|
|
12111
12031
|
env: testSuite.env,
|
|
12112
|
-
basePath: require_logger.
|
|
12032
|
+
basePath: require_logger.state.basePath
|
|
12113
12033
|
});
|
|
12114
12034
|
}
|
|
12115
12035
|
}
|
|
@@ -12133,12 +12053,12 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12133
12053
|
if (shareableUrl) {
|
|
12134
12054
|
ret.shareableUrl = shareableUrl;
|
|
12135
12055
|
ret.shared = true;
|
|
12136
|
-
require_logger.
|
|
12056
|
+
require_logger.logger.debug(`Eval shared successfully: ${shareableUrl}`);
|
|
12137
12057
|
}
|
|
12138
12058
|
} catch (error) {
|
|
12139
|
-
require_logger.
|
|
12059
|
+
require_logger.logger.warn(`Failed to create shareable URL: ${error}`);
|
|
12140
12060
|
}
|
|
12141
|
-
else require_logger.
|
|
12061
|
+
else require_logger.logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
12142
12062
|
if (testSuite.outputPath) {
|
|
12143
12063
|
if (typeof testSuite.outputPath === "string") await require_util.writeOutput(testSuite.outputPath, evalRecord, null);
|
|
12144
12064
|
else if (Array.isArray(testSuite.outputPath)) await require_util.writeMultipleOutputs(testSuite.outputPath, evalRecord, null);
|
|
@@ -12165,11 +12085,10 @@ var src_default = {
|
|
|
12165
12085
|
assertions: assertions_default,
|
|
12166
12086
|
cache: require_cache.cache_exports,
|
|
12167
12087
|
evaluate,
|
|
12168
|
-
guardrails
|
|
12088
|
+
guardrails,
|
|
12169
12089
|
loadApiProvider: require_providers.loadApiProvider,
|
|
12170
12090
|
redteam
|
|
12171
12091
|
};
|
|
12172
|
-
|
|
12173
12092
|
//#endregion
|
|
12174
12093
|
exports.AssertionOrSetSchema = require_types.AssertionOrSetSchema;
|
|
12175
12094
|
exports.AssertionSchema = require_types.AssertionSchema;
|
|
@@ -12208,20 +12127,21 @@ exports.TestSuiteSchema = require_types.TestSuiteSchema;
|
|
|
12208
12127
|
exports.UnifiedConfigSchema = require_types.UnifiedConfigSchema;
|
|
12209
12128
|
exports.VarsSchema = require_types.VarsSchema;
|
|
12210
12129
|
exports.assertions = assertions_default;
|
|
12211
|
-
Object.defineProperty(exports,
|
|
12212
|
-
|
|
12213
|
-
|
|
12214
|
-
|
|
12215
|
-
|
|
12130
|
+
Object.defineProperty(exports, "cache", {
|
|
12131
|
+
enumerable: true,
|
|
12132
|
+
get: function() {
|
|
12133
|
+
return require_cache.cache_exports;
|
|
12134
|
+
}
|
|
12216
12135
|
});
|
|
12217
12136
|
exports.default = src_default;
|
|
12218
12137
|
exports.evaluate = evaluate;
|
|
12219
12138
|
exports.generateTable = generateTable;
|
|
12220
|
-
exports.guardrails =
|
|
12139
|
+
exports.guardrails = guardrails;
|
|
12221
12140
|
exports.isApiProvider = require_types.isApiProvider;
|
|
12222
12141
|
exports.isGradingResult = require_types.isGradingResult;
|
|
12223
12142
|
exports.isProviderOptions = require_types.isProviderOptions;
|
|
12224
12143
|
exports.isResultFailureReason = require_types.isResultFailureReason;
|
|
12225
12144
|
exports.loadApiProvider = require_providers.loadApiProvider;
|
|
12226
12145
|
exports.redteam = redteam;
|
|
12146
|
+
|
|
12227
12147
|
//# sourceMappingURL=index.cjs.map
|