promptfoo 0.120.27 → 0.121.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/src/{ListApp-8WOe2nT6.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-BgNJDBE6.js +206 -0
- package/dist/src/{accounts-Fl2J3_Fu.cjs → accounts-Bx-x3bmW.cjs} +77 -78
- package/dist/src/{accounts-DVINui-2.js → accounts-CMqkzrVf.js} +39 -34
- package/dist/src/{accounts-CPDRAMND.js → accounts-xrUGFA6n.js} +38 -33
- package/dist/src/{agentic-utils-D922n6mm.js → agentic-utils-BKIN5PKu.js} +9 -10
- package/dist/src/{agents-BO2n8Z0d.cjs → agents-B0f4HICh.cjs} +37 -40
- package/dist/src/{agents-BXLmVsxR.js → agents-C-dDThPK.js} +37 -37
- package/dist/src/{agents-DgJf2-ez.cjs → agents-CErsqg5U.cjs} +16 -17
- package/dist/src/{agents-BcsN_BgB.js → agents-CVIn-Utx.js} +16 -12
- package/dist/src/{agents-hqgSV-3o.js → agents-CXknwsFX.js} +37 -40
- package/dist/src/{agents-pMfppv9Z.js → agents-DeH4Gu94.js} +18 -18
- package/dist/src/{agents-BdUTAwi-.js → agents-Dy2YpZpa.js} +38 -41
- package/dist/src/{agents-DNvSH78i.js → agents-aF4-T121.js} +16 -20
- package/dist/src/{aimlapi-DOib86oE.js → aimlapi-BAGZDo5G.js} +16 -18
- package/dist/src/{aimlapi-DtgPI0nE.js → aimlapi-BNfTBexL.js} +15 -17
- package/dist/src/{aimlapi-DTPACCB1.js → aimlapi-DHRKlBEA.js} +15 -4
- package/dist/src/{aimlapi-BE_Tg9Fl.cjs → aimlapi-tg0Gkcvr.cjs} +15 -16
- package/dist/src/app/assets/index-BFCZg7hQ.js +439 -0
- package/dist/src/app/index.html +1 -1
- package/dist/src/{audio-BRYU0BFo.js → audio-BRODU0UK.js} +7 -9
- package/dist/src/{audio-Cwo68yZS.cjs → audio-BWeaWovU.cjs} +6 -7
- package/dist/src/{audio-BnRUGAm_.js → audio-CHQ4r-RV.js} +6 -5
- package/dist/src/{audio-MSRki4JU.js → audio-tf_NBjlC.js} +6 -8
- package/dist/src/{base-h961VXYk.js → base-B0tcrnq_.js} +11 -13
- package/dist/src/{base-XB2tDJrB.js → base-B4QJRyFS.js} +11 -13
- package/dist/src/{base-pGVmXNl4.cjs → base-DBtwl2FR.cjs} +36 -38
- package/dist/src/base-fEDN28WM.js +193 -0
- package/dist/src/{blobs-BM_e6hCa.js → blobs-BAU-dXan.js} +9 -12
- package/dist/src/{blobs-CR5C4Ihh.js → blobs-Bpg5rH6i.js} +9 -12
- package/dist/src/{blobs-B-KQAFhX.cjs → blobs-DvS-O6be.cjs} +34 -37
- package/dist/src/blobs-qTYm-1PY.js +236 -0
- package/dist/src/{cache-CIpsoBZR.js → cache-8XhNqPKW.js} +64 -67
- package/dist/src/cache-Bbn1Nyrd.cjs +5 -0
- package/dist/src/cache-BwsMSda7.js +6 -0
- package/dist/src/{cache-jsiwsAJv.js → cache-CG0SlR1d.js} +64 -66
- package/dist/src/{cache-BTVYfbka.cjs → cache-COish3-W.cjs} +114 -117
- package/dist/src/cache-D3eqDYGU.js +739 -0
- package/dist/src/{chat-D31K7C4u.cjs → chat-2K608PeQ.cjs} +20 -21
- package/dist/src/chat-BKm79wib.js +764 -0
- package/dist/src/{chat-B84t99NW.js → chat-CM_kyI8B.js} +20 -9
- package/dist/src/{chat-BcPjZXIp.js → chat-CRWNNq73.js} +41 -44
- package/dist/src/{chat-CcUCysjU.js → chat-CznLWr_D.js} +41 -44
- package/dist/src/{chat-DwWifjxi.js → chat-DHMH-N64.js} +20 -22
- package/dist/src/{chat-BE44YOc6.cjs → chat-DaqekjFr.cjs} +61 -64
- package/dist/src/{chat-DZM2GUHO.js → chat-DxysjBvt.js} +21 -23
- package/dist/src/{chatkit-D67HS_0b.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DAB_qfzI.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-Biqb_wsD.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-PGG4ZYIn.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-SVM6AdBu.js → claude-agent-sdk-BLTu0WBO.js} +31 -31
- package/dist/src/{claude-agent-sdk-C9SiaQub.cjs → claude-agent-sdk-CJH22shf.cjs} +31 -28
- package/dist/src/{claude-agent-sdk-C-IOTPfo.js → claude-agent-sdk-D6_k9FKA.js} +31 -29
- package/dist/src/{claude-agent-sdk-CiluSyW1.js → claude-agent-sdk-Dy5lT-Tx.js} +33 -20
- package/dist/src/{cloud-CZ-q9Ier.js → cloud-Bc9526yV.js} +7 -9
- package/dist/src/cloud-DmE0EwsY.js +4 -0
- package/dist/src/{cloudflare-ai-BahKHyhh.js → cloudflare-ai-C9r2sRhw.js} +16 -18
- package/dist/src/{cloudflare-ai-Dxyt50Nl.js → cloudflare-ai-CWWJCRim.js} +16 -4
- package/dist/src/{cloudflare-ai-Dfahv5SY.cjs → cloudflare-ai-ClWSdor4.cjs} +16 -17
- package/dist/src/{cloudflare-ai-v_qZD6_q.js → cloudflare-ai-ICsOuD-z.js} +17 -19
- package/dist/src/{cloudflare-gateway-BPWoZIzJ.cjs → cloudflare-gateway-C2_-KG5o.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-Bi_FpOFy.js → cloudflare-gateway-D6O7AlYb.js} +23 -23
- package/dist/src/{cloudflare-gateway-btS7h1OZ.js → cloudflare-gateway-D6xFc5pa.js} +21 -25
- package/dist/src/{cloudflare-gateway-C0guUNwk.js → cloudflare-gateway-pXGHxJ47.js} +26 -14
- package/dist/src/{codex-sdk-DSxAnbfT.js → codex-sdk-C6UMlxwV.js} +28 -29
- package/dist/src/{codex-sdk-IYVi9fuM.js → codex-sdk-DUwKWezN.js} +28 -27
- package/dist/src/{codex-sdk-DulY0ZRq.js → codex-sdk-GGAw0qbD.js} +28 -29
- package/dist/src/{codex-sdk-DFKMtAyf.cjs → codex-sdk-fAO0c3yA.cjs} +28 -29
- package/dist/src/{cometapi-DkNBMk0G.js → cometapi-BasUi7-_.js} +17 -19
- package/dist/src/{cometapi-DzrR3SR_.js → cometapi-Bbjp5V4x.js} +16 -4
- package/dist/src/{cometapi-C9EEpJzT.js → cometapi-DkXrKi5z.js} +21 -24
- package/dist/src/{cometapi-DIO64tf4.cjs → cometapi-vY6aDZgo.cjs} +21 -22
- package/dist/src/{completion-CG29bfKX.js → completion-6Mx_iXxK.js} +11 -13
- package/dist/src/{completion-Bgf1VJoq.js → completion-C5rtR_9P.js} +11 -13
- package/dist/src/{completion-CCRT4kX1.cjs → completion-CDOouNzq.cjs} +21 -23
- package/dist/src/completion-C_P3ypkJ.js +120 -0
- package/dist/src/{createHash-Dw_iLu31.js → createHash-CTQmL3G2.js} +2 -3
- package/dist/src/{createHash-CYQy4YeL.cjs → createHash-CfZSc0b4.cjs} +13 -14
- package/dist/src/{createHash-CJcfskIZ.js → createHash-Da8fMwqB.js} +2 -3
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-D-ayp2FW.js → docker-5KcG-_86.js} +18 -20
- package/dist/src/{docker-DNcLR4Ig.cjs → docker-BwsKwxFs.cjs} +18 -19
- package/dist/src/{docker-egERKxCF.js → docker-CZnqU1XV.js} +18 -7
- package/dist/src/{docker-B81N0t4e.js → docker-DzxyDPIj.js} +19 -21
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-B9dPm_BF.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-D2pZ87fL.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-Ct-Joyue.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-17JizQIv.js +15 -0
- package/dist/src/{eval-C-Nr6wX_.js → eval-DmFyWU7i.js} +47 -54
- package/dist/src/{evalResult-4BzI2tmj.js → evalResult-CDQiuUuf.js} +16 -12
- package/dist/src/{evalResult-DXMWJ3sx.js → evalResult-CTG2AHOS.js} +10 -11
- package/dist/src/evalResult-Cqj8pldJ.js +12 -0
- package/dist/src/{evalResult-CX8wQecI.cjs → evalResult-Dap2CekP.cjs} +20 -21
- package/dist/src/evalResult-DvcJAWJU.cjs +10 -0
- package/dist/src/evalResult-Hftn-S_i.js +10 -0
- package/dist/src/evaluator-B2CFNt-P.js +36 -0
- package/dist/src/{evaluator-8aGyV12L.js → evaluator-DPFRbFIL.js} +201 -229
- package/dist/src/{extractor-CD5yKL-G.js → extractor-CFG6bcWJ.js} +22 -24
- package/dist/src/{extractor-C031XmTA.cjs → extractor-DX36oYEv.cjs} +37 -39
- package/dist/src/{extractor-V5x_m1i0.js → extractor-M67RUtg6.js} +22 -24
- package/dist/src/extractor-YMU_Gvt8.js +374 -0
- package/dist/src/{fetch-D3OHf-lV.js → fetch-4M3YRaqL.js} +40 -45
- package/dist/src/fetch-60Gzydls.js +777 -0
- package/dist/src/{fetch-CXZI9RRr.js → fetch-BMv0O527.js} +23 -35
- package/dist/src/{fetch-BmbD-v1L.cjs → fetch-BxUk8odA.cjs} +244 -277
- package/dist/src/fetch-KV5kNASw.js +5 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-CdevI3An.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-Dres3qrN.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/graders-Bu0H9nXi.js +32 -0
- package/dist/src/{graders-DTeBrzWp.js → graders-CHO8EPM4.js} +349 -397
- package/dist/src/graders-Cfhkvx-e.js +34 -0
- package/dist/src/{graders--1y2u9HO.js → graders-CpdqD9PI.js} +349 -397
- package/dist/src/graders-DClJVpGP.cjs +32 -0
- package/dist/src/{graders-DohM2dir.cjs → graders-DOXycdlG.cjs} +684 -732
- package/dist/src/graders-DcnJsrMO.js +32 -0
- package/dist/src/graders-R9rYUM0d.js +13466 -0
- package/dist/src/{image-C3wHC9_h.js → image-BmEZqVmk.js} +9 -10
- package/dist/src/{image-O1u4bCFg.js → image-CBBVXWuT.js} +9 -10
- package/dist/src/{image-DpKl2F15.cjs → image-CDLQOcqT.cjs} +6 -7
- package/dist/src/{image-DmE-niFE.js → image-DJEvKveK.js} +6 -5
- package/dist/src/{image-CuKHuccK.cjs → image-DTedmQPg.cjs} +29 -30
- package/dist/src/{image-B0U4Hqll.js → image-gvmivTEe.js} +7 -9
- package/dist/src/image-pAX56tPG.js +257 -0
- package/dist/src/{image-DNEIf_aI.js → image-tL5hIOFh.js} +6 -8
- package/dist/src/index.cjs +605 -689
- package/dist/src/index.d.cts +11 -7
- package/dist/src/index.d.ts +11 -3
- package/dist/src/index.js +570 -658
- package/dist/src/{interactiveCheck-Bxj1Swex.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-Be_zyW4L.js → knowledgeBase-CLJybhnF.js} +16 -16
- package/dist/src/{knowledgeBase-CEzQobWX.js → knowledgeBase-CoU-UQBg.js} +14 -9
- package/dist/src/{knowledgeBase-BZ41IFwq.js → knowledgeBase-DjWPVqSb.js} +14 -18
- package/dist/src/{knowledgeBase-D-5BMXlr.cjs → knowledgeBase-wkxuRFhA.cjs} +14 -15
- package/dist/src/{litellm-DnbRJ2if.js → litellm-B9Hysuri.js} +16 -18
- package/dist/src/{litellm-CRDqPhNI.js → litellm-CTfa0hqi.js} +15 -17
- package/dist/src/{litellm-hUSNM_M2.cjs → litellm-NYpQ8RQu.cjs} +15 -16
- package/dist/src/{litellm-9vR8zpfU.js → litellm-ePxtr9F1.js} +15 -4
- package/dist/src/{logger-CG1uZPbQ.js → logger-CT3IKMKA.js} +10 -29
- package/dist/src/{logger-B7sBeGa0.cjs → logger-Cp1GPUjj.cjs} +152 -180
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-LSBxlt7a.js → logger-KkObSCzq.js} +13 -31
- package/dist/src/{luma-ray-Hm3d6VJE.cjs → luma-ray-B0GGNRc1.cjs} +20 -21
- package/dist/src/{luma-ray-drvgdpP9.js → luma-ray-BE2mOt6N.js} +20 -13
- package/dist/src/{luma-ray-4blv9iZ2.js → luma-ray-BW9IRGIc.js} +22 -21
- package/dist/src/{luma-ray-B2__8lYH.js → luma-ray-Cm1KZBhs.js} +20 -23
- package/dist/src/main.js +1170 -1321
- package/dist/src/{messages-XhiwCbi4.cjs → messages-1JrJs91T.cjs} +32 -34
- package/dist/src/{messages-CGPPidQr.js → messages-1x9atZmP.js} +22 -24
- package/dist/src/{messages-Uee41Mj5.js → messages-BLbWdsyt.js} +22 -24
- package/dist/src/messages-D8EA0oDc.js +240 -0
- package/dist/src/{meteor-BYykdXrV.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-CsopaHrH.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-e-E-2vVl.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-C8lGP6P4.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/{modelslab-yKz-ZNB4.js → modelslab-C1OLRmVX.js} +17 -10
- package/dist/src/{modelslab-E9gO-bYd.js → modelslab-CqXBy3U8.js} +18 -20
- package/dist/src/{modelslab-lUVW0cmB.cjs → modelslab-DcOSFwKh.cjs} +17 -18
- package/dist/src/{modelslab-ClBkr8_9.js → modelslab-X5-4LroM.js} +17 -19
- package/dist/src/{nova-reel-Dk8jNpId.js → nova-reel-BgS1ZWuK.js} +20 -13
- package/dist/src/{nova-reel-u2eF2Cxm.js → nova-reel-D2ZkOSyr.js} +22 -21
- package/dist/src/{nova-reel-D8CuO6QH.cjs → nova-reel-D9xfaMBs.cjs} +20 -21
- package/dist/src/{nova-reel-P9bwvtYX.js → nova-reel-DihqLeol.js} +20 -23
- package/dist/src/{nova-sonic-Ds1C-dpm.cjs → nova-sonic-DVu3mMIy.cjs} +30 -31
- package/dist/src/{nova-sonic-CK2rAiKi.js → nova-sonic-DezhVUYT.js} +30 -26
- package/dist/src/{nova-sonic-BaqWlkds.js → nova-sonic-P-CdUMlV.js} +30 -31
- package/dist/src/{nova-sonic-yZapPLv7.js → nova-sonic-Q3BOJeig.js} +31 -32
- package/dist/src/{openai-DUFopMrH.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-PblZ3jUE.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CcN1B8Sb.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-A-3_loM7.js → openclaw-BiSZPL7J.js} +20 -14
- package/dist/src/{openclaw-COn6QzDi.js → openclaw-Bv1DINsX.js} +20 -27
- package/dist/src/{openclaw-a3lylB-V.js → openclaw-D1D_ej1z.js} +21 -28
- package/dist/src/{openclaw-B6qqDr_u.cjs → openclaw-DAfWQn-o.cjs} +33 -39
- package/dist/src/opencode-sdk-C7m-wRfI.js +560 -0
- package/dist/src/opencode-sdk-CfaLN8PY.cjs +564 -0
- package/dist/src/opencode-sdk-D95s6SnR.js +562 -0
- package/dist/src/opencode-sdk-DxUPkLT7.js +560 -0
- package/dist/src/{otlpReceiver-oyf5wLGC.js → otlpReceiver--AIRW_S4.js} +53 -51
- package/dist/src/{otlpReceiver-BmmTiMjA.js → otlpReceiver-Bn5wGB1v.js} +53 -55
- package/dist/src/{otlpReceiver-lXsYVbpj.cjs → otlpReceiver-Diec4cln.cjs} +53 -55
- package/dist/src/{otlpReceiver-94URx7UW.js → otlpReceiver-g3ByGaXs.js} +53 -55
- package/dist/src/{providerRegistry-Cq_JK_CJ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-DSSHjMKf.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-CvHEVJad.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B3HvufyI.js +33246 -0
- package/dist/src/{providers-BnFpbY_s.js → providers-BKRJTjBz.js} +1536 -1669
- package/dist/src/providers-C1rOSHiR.js +32 -0
- package/dist/src/{providers-Iil64vk9.js → providers-CFLy1_ji.js} +1543 -1676
- package/dist/src/{providers-DHbjzW2e.cjs → providers-CFu-TZl-.cjs} +1896 -2029
- package/dist/src/providers-CxmDwEFf.cjs +31 -0
- package/dist/src/providers-Dodakqr0.js +30 -0
- package/dist/src/providers-GIQ2TcsA.js +30 -0
- package/dist/src/{pythonUtils-CcT5LH1M.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DBbuI3QJ.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-hZ8LeQLv.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/{quiverai-BuI0tE39.js → quiverai-C2jVwbH1.js} +8 -7
- package/dist/src/{quiverai-DCGSZt4U.js → quiverai-CI6gYJVI.js} +8 -10
- package/dist/src/{quiverai-DiMVJQDz.cjs → quiverai-CLkWkyZc.cjs} +8 -9
- package/dist/src/{quiverai-fQNkExW4.js → quiverai-MHSxbmmZ.js} +9 -11
- package/dist/src/{render-Dj1smHEb.js → render-Drod8m7K.js} +4 -5
- package/dist/src/{responses-DOAFFENS.js → responses-BKqJmhhc.js} +22 -25
- package/dist/src/{responses-CxzoQoBe.js → responses-CGw0DCzh.js} +22 -25
- package/dist/src/responses-jxdehPkC.js +660 -0
- package/dist/src/{responses-ghR3IOfy.cjs → responses-tD4Bd4dc.cjs} +37 -40
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-CwbGmgYN.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-DudlFZed.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-C8MhKGHb.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-gmskuyre.js → sagemaker-BK4Zb993.js} +75 -70
- package/dist/src/{sagemaker-DuM71dVU.js → sagemaker-BfiWTmvn.js} +77 -77
- package/dist/src/{sagemaker-77zbJ2Q2.cjs → sagemaker-CcQHM1jV.cjs} +75 -76
- package/dist/src/{sagemaker-CcxhlOAR.js → sagemaker-D2Q1c-sD.js} +75 -79
- package/dist/src/{scanner-DJYiSXQj.js → scanner-J8CA3LsV.js} +100 -121
- package/dist/src/server/index.js +5505 -67416
- package/dist/src/{server-B5v33lvE.cjs → server-B0PPuDw-.cjs} +57 -67
- package/dist/src/server-B1vi21hA.js +7 -0
- package/dist/src/{server-RV_i_YX5.js → server-BC7XJFgr.js} +19 -24
- package/dist/src/server-Cm9Kai_h.cjs +5 -0
- package/dist/src/{server-BJ4m4f1D.js → server-DbFphssR.js} +26 -29
- package/dist/src/server-OAs3nBRT.js +229 -0
- package/dist/src/{signal-BW33JuId.js → signal-BOTbd53Z.js} +9 -11
- package/dist/src/{slack-DEURelTy.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-BQYeW9L3.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-BB6yuZzp.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-2pRrhhgJ.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-BNmZ1KAz.cjs +5 -0
- package/dist/src/{store-D7CgQzAR.cjs → store-BSc-TF2w.cjs} +44 -45
- package/dist/src/store-BltJg2cd.js +6 -0
- package/dist/src/{store-s3SftUwF.js → store-D1tv90v3.js} +34 -35
- package/dist/src/{store-DJNsD1iC.js → store-DQLEjuEO.js} +40 -36
- package/dist/src/store-Ub2vaGJ1.js +228 -0
- package/dist/src/{tables-DfTsNN7X.js → tables-5EvT_Bwn.js} +19 -21
- package/dist/src/{tables-BKTmd6u7.cjs → tables-C7K-XKWp.cjs} +89 -91
- package/dist/src/{tables-DMegD0Xf.js → tables-D36WTqKX.js} +21 -23
- package/dist/src/tables-xKANLRBD.js +288 -0
- package/dist/src/telemetry-5BCRNBbe.cjs +5 -0
- package/dist/src/{telemetry-BedSm-bZ.js → telemetry-C15ziL8u.js} +17 -14
- package/dist/src/{telemetry--WAdAfVi.js → telemetry-C2YDkUQH.js} +11 -13
- package/dist/src/{telemetry-DQgVBCAb.cjs → telemetry-CbrnxHp_.cjs} +21 -24
- package/dist/src/telemetry-D4W5hboe.js +7 -0
- package/dist/src/telemetry-DMb2Mpfm.js +171 -0
- package/dist/src/{text-oiSbwSOI.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-oKzCBnK6.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-B_IrO4GZ.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-BO1AHegO.cjs → transcription-CL78qbOU.cjs} +14 -15
- package/dist/src/{transcription-mYS9vd5v.js → transcription-DAtxHhAM.js} +14 -7
- package/dist/src/{transcription-X2-B4vkX.js → transcription-LNZTNUUL.js} +14 -16
- package/dist/src/{transcription-lzBLiTFJ.js → transcription-QHh3AH6Z.js} +15 -17
- package/dist/src/{transform-DeGlxb0D.js → transform-Cgi24fJ7.js} +39 -47
- package/dist/src/{transform-B1Hi5lWS.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-CYDILYDe.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/{transform-Dfl89yi4.js → transform-DGLazrMm.js} +39 -47
- package/dist/src/transform-DGxXocjk.js +1506 -0
- package/dist/src/{transform-D5PjiWiZ.cjs → transform-DOcQeLld.cjs} +179 -187
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-BEgStbHK.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/{transformersAvailability-SZnTS3pJ.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-D-glmEy7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-CjeFXhuJ.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-CXQduE9o.js → types-CH3Ge2sE.js} +30 -90
- package/dist/src/{types-C5hEkb-x.js → types-CLKiCBW3.js} +25 -89
- package/dist/src/types-CN_TZ2GJ.js +3260 -0
- package/dist/src/{types-DWNf48sT.cjs → types-LJ0r3wbR.cjs} +500 -564
- package/dist/src/util-5cB-L7U3.js +1430 -0
- package/dist/src/util-6-GqIvzS.js +599 -0
- package/dist/src/{util-CoQjmE3u.js → util-B7T3SiBS.js} +4 -5
- package/dist/src/{util-D9eLdGfa.js → util-Betm42rL.js} +5 -6
- package/dist/src/{util-Bm_-UMD_.js → util-C-PPYSMq.js} +5 -6
- package/dist/src/{util-CyUdMzV0.cjs → util-CchiqXh_.cjs} +34 -35
- package/dist/src/{util-Du96oyYS.js → util-DaWTWKBK.js} +4 -5
- package/dist/src/{util-1wWM599Z.cjs → util-Db0a0AFH.cjs} +50 -51
- package/dist/src/{util-DQ984syk.js → util-Dlz_Wvgm.js} +37 -48
- package/dist/src/{util-_h4pVqrz.js → util-YT5HPZaS.js} +37 -48
- package/dist/src/{util-aLhtl3fe.cjs → util-Yz-1aEhW.cjs} +209 -220
- package/dist/src/util-ZZH-3QZz.js +293 -0
- package/dist/src/{utils-BjLy-Q72.cjs → utils-Cz9qXqII.cjs} +29 -32
- package/dist/src/{utils-CFMn2yHW.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-DvWMzuMx.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +30 -30
- package/dist/src/app/assets/index-B2D0bCSI.js +0 -439
- package/dist/src/app/tsconfig.app.tsbuildinfo +0 -1
- package/dist/src/cache-ChPcurj7.js +0 -6
- package/dist/src/cache-VVu_W-yg.js +0 -8
- package/dist/src/cache-YLNCFEM2.cjs +0 -6
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-BndfXy4H.js +0 -5
- package/dist/src/eval-BhHvMY82.js +0 -17
- package/dist/src/evalResult-Dq2gFNQY.js +0 -12
- package/dist/src/evalResult-nmcP5VKH.cjs +0 -12
- package/dist/src/evalResult-trqZjVYh.js +0 -14
- package/dist/src/evaluator-CnfPstzT.js +0 -39
- package/dist/src/fetch-IDPDue6F.cjs +0 -4
- package/dist/src/fetch-hKJ-It8q.js +0 -6
- package/dist/src/fetch-ouKnrWK-.js +0 -4
- package/dist/src/graders-CQn7WUsd.cjs +0 -34
- package/dist/src/graders-DC6QAbpW.js +0 -35
- package/dist/src/graders-DUWz3Y7j.js +0 -37
- package/dist/src/opencode-sdk-4bL9n-Gk.js +0 -382
- package/dist/src/opencode-sdk-BfC2zWcR.js +0 -376
- package/dist/src/opencode-sdk-DMJyuwMg.js +0 -380
- package/dist/src/opencode-sdk-Da-9adza.cjs +0 -383
- package/dist/src/providers-CsXB2Ix-.js +0 -35
- package/dist/src/providers-DO8ltjLC.js +0 -33
- package/dist/src/providers-Dtq-xnXd.cjs +0 -33
- package/dist/src/rubyUtils-BUbcND2f.js +0 -6
- package/dist/src/rubyUtils-Cr55X_KE.js +0 -5
- package/dist/src/rubyUtils-DlIiqoYo.cjs +0 -5
- package/dist/src/server-C2eQH4Gu.js +0 -6
- package/dist/src/server-CXWycu7H.cjs +0 -6
- package/dist/src/server-Q6OGlxxT.js +0 -8
- package/dist/src/store-B3EDO9Q3.js +0 -7
- package/dist/src/store-Dl9F8aw5.js +0 -6
- package/dist/src/store-SnrGrlt9.cjs +0 -6
- package/dist/src/telemetry-BGhiPZtl.js +0 -8
- package/dist/src/telemetry-CFfiYan6.cjs +0 -6
- package/dist/src/telemetry-DHzEduxX.js +0 -6
- package/dist/src/transform-C1x1ZlMQ.cjs +0 -6
- package/dist/src/transform-DYHjFmQu.js +0 -8
- package/dist/src/transform-rmwJT5JQ.js +0 -7
- package/dist/src/transformersAvailability-eJooj0gX.js +0 -35
package/dist/src/index.js
CHANGED
|
@@ -1,41 +1,40 @@
|
|
|
1
|
-
import "./
|
|
2
|
-
import {
|
|
3
|
-
import { t as
|
|
4
|
-
import { r as
|
|
5
|
-
import {
|
|
6
|
-
import { i as
|
|
7
|
-
import { n as
|
|
8
|
-
import {
|
|
9
|
-
import { A as
|
|
10
|
-
import { A as
|
|
11
|
-
import {
|
|
12
|
-
import { a as
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import
|
|
16
|
-
import "./
|
|
17
|
-
import { t as
|
|
18
|
-
import {
|
|
19
|
-
import
|
|
20
|
-
import "./
|
|
21
|
-
import "./
|
|
22
|
-
import "./
|
|
23
|
-
import "./
|
|
24
|
-
import
|
|
25
|
-
import "./
|
|
26
|
-
import {
|
|
27
|
-
import {
|
|
28
|
-
import {
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
31
|
-
import {
|
|
32
|
-
import { t as
|
|
33
|
-
import
|
|
34
|
-
import "./
|
|
35
|
-
import "./
|
|
36
|
-
import {
|
|
37
|
-
import {
|
|
38
|
-
import { t as EvalResult } from "./evalResult-4BzI2tmj.js";
|
|
1
|
+
import { C as isCI, S as getMaxEvalTimeMs, _ as getEnvBool, a as setLogCallback, b as getEnvString, d as getAjv, h as summarizeEvaluateResultForLogging, i as logger, m as safeJsonStringify, n as isDebugEnabled, o as setLogLevel, p as orderKeys, t as getLogLevel, u as extractJsonObjects, v as getEnvFloat, w as state, x as getEvalTimeoutMs, y as getEnvInt } from "./logger-CT3IKMKA.js";
|
|
2
|
+
import { t as invariant } from "./invariant-Ddh24eXh.js";
|
|
3
|
+
import { r as importModule, t as getDirectory } from "./esm-Cd1AjG1D.js";
|
|
4
|
+
import { r as runPython } from "./pythonUtils-D5nxkQ0P.js";
|
|
5
|
+
import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
|
|
6
|
+
import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-DECvGmzp.js";
|
|
7
|
+
import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as processPrompts, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as SUGGEST_PROMPTS_SYSTEM_MESSAGE, d as RbacPlugin, dt as loadFromJavaScriptFile, et as matchesSelectBest, f as PromptExtractionPlugin, ft as processFileReference, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as DefaultSuggestionsProvider, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as coerceString, m as PolicyPlugin, n as getGraderById, nt as selectMaxScore, o as UnsafeBenchPlugin, ot as readPrompts, p as PoliticsPlugin, pt as resolveContext, q as matchesContextRelevance, r as VLSUPlugin, rt as getDefaultProviders, s as ToxicChatPlugin, st as readProviderPromptMap, t as GRADERS, tt as matchesSimilarity, u as ShellInjectionPlugin, ut as getFinalTest, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as fail } from "./graders-CpdqD9PI.js";
|
|
8
|
+
import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-CLKiCBW3.js";
|
|
9
|
+
import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-Dlz_Wvgm.js";
|
|
10
|
+
import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-60Gzydls.js";
|
|
11
|
+
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-8XhNqPKW.js";
|
|
12
|
+
import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-BKRJTjBz.js";
|
|
13
|
+
import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-XiOAgly5.js";
|
|
14
|
+
import { n as sha256, t as randomSequence } from "./createHash-DmPQkvBh.js";
|
|
15
|
+
import "./genaiTracer-D3fD9dNV.js";
|
|
16
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-CznLWr_D.js";
|
|
17
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-NYT-WKS6.js";
|
|
18
|
+
import { m as validateFunctionCall } from "./transform-DGLazrMm.js";
|
|
19
|
+
import "./messages-BLbWdsyt.js";
|
|
20
|
+
import "./util-DaWTWKBK.js";
|
|
21
|
+
import "./responses-BKqJmhhc.js";
|
|
22
|
+
import "./openai-DElQ-fPX.js";
|
|
23
|
+
import { l as validateFunctionCall$1 } from "./util-Betm42rL.js";
|
|
24
|
+
import "./completion-C_P3ypkJ.js";
|
|
25
|
+
import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-xrUGFA6n.js";
|
|
26
|
+
import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-BC7XJFgr.js";
|
|
27
|
+
import { t as getBlobByHash } from "./blobs-Bpg5rH6i.js";
|
|
28
|
+
import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-5EvT_Bwn.js";
|
|
29
|
+
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-M67RUtg6.js";
|
|
30
|
+
import { t as telemetry } from "./telemetry-C15ziL8u.js";
|
|
31
|
+
import { t as ellipsize } from "./text-B_UCRPp2.js";
|
|
32
|
+
import { t as getTraceStore } from "./store-DQLEjuEO.js";
|
|
33
|
+
import "./base-B0tcrnq_.js";
|
|
34
|
+
import "./image-BmEZqVmk.js";
|
|
35
|
+
import { t as providerRegistry } from "./providerRegistry-CD8MEar9.js";
|
|
36
|
+
import { n as runRuby } from "./rubyUtils-BUVePouc.js";
|
|
37
|
+
import { t as EvalResult } from "./evalResult-CDQiuUuf.js";
|
|
39
38
|
import * as fs$1 from "fs";
|
|
40
39
|
import fs, { createWriteStream } from "fs";
|
|
41
40
|
import * as path$2 from "path";
|
|
@@ -57,7 +56,7 @@ import { XMLParser } from "fast-xml-parser";
|
|
|
57
56
|
import crypto$1, { createHash, randomBytes } from "crypto";
|
|
58
57
|
import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
|
|
59
58
|
import input from "@inquirer/input";
|
|
60
|
-
import { and,
|
|
59
|
+
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
|
61
60
|
import cliProgress from "cli-progress";
|
|
62
61
|
import { JSDOM } from "jsdom";
|
|
63
62
|
import { distance } from "fastest-levenshtein";
|
|
@@ -76,7 +75,6 @@ import chokidar from "chokidar";
|
|
|
76
75
|
import ora from "ora";
|
|
77
76
|
import { URL } from "url";
|
|
78
77
|
import "@inquirer/confirm";
|
|
79
|
-
|
|
80
78
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
81
79
|
var ConversationRelevancyTemplate = class {
|
|
82
80
|
static generateVerdicts(slidingWindow) {
|
|
@@ -148,7 +146,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
148
146
|
JSON:`;
|
|
149
147
|
}
|
|
150
148
|
};
|
|
151
|
-
|
|
152
149
|
//#endregion
|
|
153
150
|
//#region src/external/matchers/deepeval.ts
|
|
154
151
|
const nunjucks$1 = getNunjucksEngine(void 0, false, true);
|
|
@@ -198,7 +195,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
198
195
|
return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
199
196
|
}
|
|
200
197
|
}
|
|
201
|
-
|
|
202
198
|
//#endregion
|
|
203
199
|
//#region src/external/assertions/deepeval.ts
|
|
204
200
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -253,7 +249,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
253
249
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
254
250
|
};
|
|
255
251
|
};
|
|
256
|
-
|
|
257
252
|
//#endregion
|
|
258
253
|
//#region src/tracing/evaluatorTracing.ts
|
|
259
254
|
let otlpReceiverStarted = false;
|
|
@@ -286,28 +281,28 @@ function isOtlpReceiverStarted() {
|
|
|
286
281
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
287
282
|
*/
|
|
288
283
|
async function startOtlpReceiverIfNeeded(testSuite) {
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
284
|
+
logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(testSuite.tracing)}`);
|
|
285
|
+
logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
|
|
286
|
+
logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(testSuite.tracing, null, 2)}`);
|
|
292
287
|
if (testSuite.tracing?.enabled && testSuite.tracing?.otlp?.http?.enabled && !otlpReceiverStarted) {
|
|
293
|
-
|
|
288
|
+
telemetry.record("feature_used", { feature: "tracing" });
|
|
294
289
|
try {
|
|
295
|
-
|
|
296
|
-
const { startOTLPReceiver } = await import("./otlpReceiver
|
|
290
|
+
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
291
|
+
const { startOTLPReceiver } = await import("./otlpReceiver--AIRW_S4.js");
|
|
297
292
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
298
293
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
299
|
-
|
|
294
|
+
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
300
295
|
await startOTLPReceiver(port, host);
|
|
301
296
|
otlpReceiverStarted = true;
|
|
302
|
-
|
|
297
|
+
logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
|
|
303
298
|
} catch (error) {
|
|
304
|
-
|
|
299
|
+
logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
|
|
305
300
|
}
|
|
306
|
-
} else if (otlpReceiverStarted)
|
|
301
|
+
} else if (otlpReceiverStarted) logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
|
|
307
302
|
else {
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
303
|
+
logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
|
|
304
|
+
logger.debug(`[EvaluatorTracing] tracing.enabled: ${testSuite.tracing?.enabled}`);
|
|
305
|
+
logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${testSuite.tracing?.otlp?.http?.enabled}`);
|
|
311
306
|
}
|
|
312
307
|
}
|
|
313
308
|
/**
|
|
@@ -315,13 +310,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
315
310
|
*/
|
|
316
311
|
async function stopOtlpReceiverIfNeeded() {
|
|
317
312
|
if (otlpReceiverStarted) try {
|
|
318
|
-
|
|
319
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver
|
|
313
|
+
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
314
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver--AIRW_S4.js");
|
|
320
315
|
await stopOTLPReceiver();
|
|
321
316
|
otlpReceiverStarted = false;
|
|
322
|
-
|
|
317
|
+
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
323
318
|
} catch (error) {
|
|
324
|
-
|
|
319
|
+
logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
|
|
325
320
|
}
|
|
326
321
|
}
|
|
327
322
|
/**
|
|
@@ -337,7 +332,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
337
332
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
338
333
|
const envEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
339
334
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
340
|
-
|
|
335
|
+
logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
341
336
|
return result;
|
|
342
337
|
}
|
|
343
338
|
/**
|
|
@@ -346,25 +341,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
346
341
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
347
342
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
348
343
|
if (tracingEnabled) {
|
|
349
|
-
|
|
350
|
-
|
|
344
|
+
logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
345
|
+
logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
351
346
|
}
|
|
352
347
|
if (!tracingEnabled) return null;
|
|
353
|
-
|
|
354
|
-
const { getTraceStore } = await import("./store-
|
|
348
|
+
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
349
|
+
const { getTraceStore } = await import("./store-DQLEjuEO.js").then((n) => n.n);
|
|
355
350
|
const traceStore = getTraceStore();
|
|
356
351
|
const traceId = generateTraceId();
|
|
357
352
|
const spanId = generateSpanId();
|
|
358
353
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
359
|
-
|
|
354
|
+
logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
360
355
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
361
356
|
if (!evaluationId) {
|
|
362
|
-
|
|
357
|
+
logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
363
358
|
evaluationId = `eval-${Date.now()}`;
|
|
364
359
|
}
|
|
365
360
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
366
361
|
try {
|
|
367
|
-
|
|
362
|
+
logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
368
363
|
await traceStore.createTrace({
|
|
369
364
|
traceId,
|
|
370
365
|
evaluationId: evaluationId || "",
|
|
@@ -375,18 +370,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
375
370
|
vars: test.vars
|
|
376
371
|
}
|
|
377
372
|
});
|
|
378
|
-
|
|
373
|
+
logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
379
374
|
} catch (error) {
|
|
380
|
-
|
|
375
|
+
logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
381
376
|
}
|
|
382
|
-
|
|
377
|
+
logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
383
378
|
return {
|
|
384
379
|
traceparent,
|
|
385
380
|
evaluationId,
|
|
386
381
|
testCaseId
|
|
387
382
|
};
|
|
388
383
|
}
|
|
389
|
-
|
|
390
384
|
//#endregion
|
|
391
385
|
//#region src/assertions/answerRelevance.ts
|
|
392
386
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -397,7 +391,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
397
391
|
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
398
392
|
};
|
|
399
393
|
};
|
|
400
|
-
|
|
401
394
|
//#endregion
|
|
402
395
|
//#region src/assertions/assertionsResult.ts
|
|
403
396
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -503,7 +496,6 @@ var AssertionsResult = class {
|
|
|
503
496
|
return this.result;
|
|
504
497
|
}
|
|
505
498
|
};
|
|
506
|
-
|
|
507
499
|
//#endregion
|
|
508
500
|
//#region src/assertions/ngrams.ts
|
|
509
501
|
/**
|
|
@@ -519,7 +511,6 @@ function getNGrams(words, n) {
|
|
|
519
511
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
520
512
|
return ngrams;
|
|
521
513
|
}
|
|
522
|
-
|
|
523
514
|
//#endregion
|
|
524
515
|
//#region src/assertions/bleu.ts
|
|
525
516
|
/**
|
|
@@ -615,7 +606,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
615
606
|
assertion
|
|
616
607
|
};
|
|
617
608
|
}
|
|
618
|
-
|
|
619
609
|
//#endregion
|
|
620
610
|
//#region src/assertions/classifier.ts
|
|
621
611
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -630,9 +620,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
630
620
|
...classificationResult
|
|
631
621
|
};
|
|
632
622
|
}
|
|
633
|
-
|
|
634
623
|
//#endregion
|
|
635
624
|
//#region src/assertions/contains.ts
|
|
625
|
+
function parseCommaSeparatedValues(value) {
|
|
626
|
+
const results = [];
|
|
627
|
+
let i = 0;
|
|
628
|
+
while (i < value.length) {
|
|
629
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
630
|
+
if (i >= value.length) break;
|
|
631
|
+
if (value[i] === ",") {
|
|
632
|
+
i++;
|
|
633
|
+
continue;
|
|
634
|
+
}
|
|
635
|
+
if (value[i] === "\"") {
|
|
636
|
+
i++;
|
|
637
|
+
let field = "";
|
|
638
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
639
|
+
field += value[i + 1];
|
|
640
|
+
i += 2;
|
|
641
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
642
|
+
field += "\"";
|
|
643
|
+
i += 2;
|
|
644
|
+
} else if (value[i] === "\"") {
|
|
645
|
+
i++;
|
|
646
|
+
break;
|
|
647
|
+
} else {
|
|
648
|
+
field += value[i];
|
|
649
|
+
i++;
|
|
650
|
+
}
|
|
651
|
+
results.push(field);
|
|
652
|
+
} else {
|
|
653
|
+
const start = i;
|
|
654
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
655
|
+
results.push(value.substring(start, i).trim());
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
return results;
|
|
659
|
+
}
|
|
636
660
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
637
661
|
const value = valueFromScript ?? renderedValue;
|
|
638
662
|
invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -660,7 +684,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
660
684
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
661
685
|
let value = valueFromScript ?? renderedValue;
|
|
662
686
|
invariant(value, "\"contains-any\" assertion type must have a value");
|
|
663
|
-
if (typeof value === "string") value = value
|
|
687
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
664
688
|
invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
665
689
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
666
690
|
return {
|
|
@@ -673,7 +697,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
673
697
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
674
698
|
let value = valueFromScript ?? renderedValue;
|
|
675
699
|
invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
676
|
-
if (typeof value === "string") value = value
|
|
700
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
677
701
|
invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
678
702
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
679
703
|
return {
|
|
@@ -686,7 +710,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
686
710
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
687
711
|
let value = valueFromScript ?? renderedValue;
|
|
688
712
|
invariant(value, "\"contains-all\" assertion type must have a value");
|
|
689
|
-
if (typeof value === "string") value = value
|
|
713
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
690
714
|
invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
691
715
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
692
716
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -700,7 +724,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
700
724
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
701
725
|
let value = valueFromScript ?? renderedValue;
|
|
702
726
|
invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
703
|
-
if (typeof value === "string") value = value
|
|
727
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
704
728
|
invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
705
729
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
706
730
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -711,7 +735,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
711
735
|
assertion
|
|
712
736
|
};
|
|
713
737
|
};
|
|
714
|
-
|
|
715
738
|
//#endregion
|
|
716
739
|
//#region src/assertions/contextFaithfulness.ts
|
|
717
740
|
/**
|
|
@@ -735,7 +758,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
735
758
|
metadata: { context }
|
|
736
759
|
};
|
|
737
760
|
}
|
|
738
|
-
|
|
739
761
|
//#endregion
|
|
740
762
|
//#region src/assertions/contextRecall.ts
|
|
741
763
|
/**
|
|
@@ -762,7 +784,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
762
784
|
}
|
|
763
785
|
};
|
|
764
786
|
};
|
|
765
|
-
|
|
766
787
|
//#endregion
|
|
767
788
|
//#region src/assertions/contextRelevance.ts
|
|
768
789
|
/**
|
|
@@ -789,7 +810,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
789
810
|
}
|
|
790
811
|
};
|
|
791
812
|
};
|
|
792
|
-
|
|
793
813
|
//#endregion
|
|
794
814
|
//#region src/assertions/cost.ts
|
|
795
815
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -803,7 +823,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
803
823
|
assertion
|
|
804
824
|
};
|
|
805
825
|
};
|
|
806
|
-
|
|
807
826
|
//#endregion
|
|
808
827
|
//#region src/assertions/equals.ts
|
|
809
828
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -823,7 +842,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
823
842
|
assertion
|
|
824
843
|
};
|
|
825
844
|
};
|
|
826
|
-
|
|
827
845
|
//#endregion
|
|
828
846
|
//#region src/assertions/factuality.ts
|
|
829
847
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -834,7 +852,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
834
852
|
...await matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
835
853
|
};
|
|
836
854
|
};
|
|
837
|
-
|
|
838
855
|
//#endregion
|
|
839
856
|
//#region src/assertions/finishReason.ts
|
|
840
857
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -854,7 +871,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
854
871
|
assertion
|
|
855
872
|
};
|
|
856
873
|
}
|
|
857
|
-
|
|
858
874
|
//#endregion
|
|
859
875
|
//#region src/assertions/functionToolCall.ts
|
|
860
876
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -877,7 +893,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
877
893
|
};
|
|
878
894
|
}
|
|
879
895
|
};
|
|
880
|
-
|
|
881
896
|
//#endregion
|
|
882
897
|
//#region src/assertions/geval.ts
|
|
883
898
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -903,7 +918,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
903
918
|
...await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
904
919
|
};
|
|
905
920
|
};
|
|
906
|
-
|
|
907
921
|
//#endregion
|
|
908
922
|
//#region src/assertions/gleu.ts
|
|
909
923
|
/**
|
|
@@ -1002,7 +1016,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
1002
1016
|
assertion
|
|
1003
1017
|
};
|
|
1004
1018
|
}
|
|
1005
|
-
|
|
1006
1019
|
//#endregion
|
|
1007
1020
|
//#region src/assertions/guardrails.ts
|
|
1008
1021
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -1011,7 +1024,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1011
1024
|
flaggedInput: false,
|
|
1012
1025
|
flaggedOutput: false
|
|
1013
1026
|
};
|
|
1014
|
-
|
|
1027
|
+
logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
1015
1028
|
if (guardrails) if (guardrails.flagged) {
|
|
1016
1029
|
let reason = "Content failed safety checks";
|
|
1017
1030
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1040,7 +1053,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1040
1053
|
assertion
|
|
1041
1054
|
};
|
|
1042
1055
|
};
|
|
1043
|
-
|
|
1044
1056
|
//#endregion
|
|
1045
1057
|
//#region src/assertions/html.ts
|
|
1046
1058
|
const HTML_PATTERNS = {
|
|
@@ -1249,7 +1261,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1249
1261
|
assertion
|
|
1250
1262
|
};
|
|
1251
1263
|
};
|
|
1252
|
-
|
|
1253
1264
|
//#endregion
|
|
1254
1265
|
//#region src/assertions/javascript.ts
|
|
1255
1266
|
/**
|
|
@@ -1390,7 +1401,6 @@ ${renderedValue}`,
|
|
|
1390
1401
|
assertion
|
|
1391
1402
|
};
|
|
1392
1403
|
};
|
|
1393
|
-
|
|
1394
1404
|
//#endregion
|
|
1395
1405
|
//#region src/assertions/json.ts
|
|
1396
1406
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1456,7 +1466,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1456
1466
|
assertion
|
|
1457
1467
|
};
|
|
1458
1468
|
}
|
|
1459
|
-
|
|
1460
1469
|
//#endregion
|
|
1461
1470
|
//#region src/assertions/latency.ts
|
|
1462
1471
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1470,7 +1479,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1470
1479
|
assertion
|
|
1471
1480
|
};
|
|
1472
1481
|
};
|
|
1473
|
-
|
|
1474
1482
|
//#endregion
|
|
1475
1483
|
//#region src/assertions/levenshtein.ts
|
|
1476
1484
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1485,7 +1493,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1485
1493
|
assertion
|
|
1486
1494
|
};
|
|
1487
1495
|
}
|
|
1488
|
-
|
|
1489
1496
|
//#endregion
|
|
1490
1497
|
//#region src/assertions/llmRubric.ts
|
|
1491
1498
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1494,7 +1501,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1494
1501
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1495
1502
|
return matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1496
1503
|
};
|
|
1497
|
-
|
|
1498
1504
|
//#endregion
|
|
1499
1505
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1500
1506
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1505,7 +1511,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1505
1511
|
...await matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1506
1512
|
};
|
|
1507
1513
|
};
|
|
1508
|
-
|
|
1509
1514
|
//#endregion
|
|
1510
1515
|
//#region src/util/providerResponse.ts
|
|
1511
1516
|
/**
|
|
@@ -1548,7 +1553,6 @@ function getActualPrompt(response, options = {}) {
|
|
|
1548
1553
|
function getActualPromptWithFallback(response, originalPrompt, options = {}) {
|
|
1549
1554
|
return getActualPrompt(response, options) || originalPrompt;
|
|
1550
1555
|
}
|
|
1551
|
-
|
|
1552
1556
|
//#endregion
|
|
1553
1557
|
//#region src/assertions/moderation.ts
|
|
1554
1558
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1571,7 +1575,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1571
1575
|
assertion
|
|
1572
1576
|
};
|
|
1573
1577
|
};
|
|
1574
|
-
|
|
1575
1578
|
//#endregion
|
|
1576
1579
|
//#region src/assertions/openai.ts
|
|
1577
1580
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1632,7 +1635,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1632
1635
|
};
|
|
1633
1636
|
}
|
|
1634
1637
|
};
|
|
1635
|
-
|
|
1636
1638
|
//#endregion
|
|
1637
1639
|
//#region src/assertions/perplexity.ts
|
|
1638
1640
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1659,7 +1661,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1659
1661
|
assertion
|
|
1660
1662
|
};
|
|
1661
1663
|
}
|
|
1662
|
-
|
|
1663
1664
|
//#endregion
|
|
1664
1665
|
//#region src/assertions/pi.ts
|
|
1665
1666
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1667,7 +1668,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1667
1668
|
invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1668
1669
|
return matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1669
1670
|
};
|
|
1670
|
-
|
|
1671
1671
|
//#endregion
|
|
1672
1672
|
//#region src/python/wrapper.ts
|
|
1673
1673
|
/**
|
|
@@ -1683,17 +1683,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1683
1683
|
fs.writeFileSync(tempFilePath, code);
|
|
1684
1684
|
return await runPython(tempFilePath, method, args);
|
|
1685
1685
|
} catch (error) {
|
|
1686
|
-
|
|
1686
|
+
logger.error(`Error executing Python code: ${error}`);
|
|
1687
1687
|
throw error;
|
|
1688
1688
|
} finally {
|
|
1689
1689
|
try {
|
|
1690
1690
|
fs.unlinkSync(tempFilePath);
|
|
1691
1691
|
} catch (error) {
|
|
1692
|
-
|
|
1692
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1693
1693
|
}
|
|
1694
1694
|
}
|
|
1695
1695
|
}
|
|
1696
|
-
|
|
1697
1696
|
//#endregion
|
|
1698
1697
|
//#region src/util/caseMapping.ts
|
|
1699
1698
|
/**
|
|
@@ -1717,7 +1716,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1717
1716
|
});
|
|
1718
1717
|
return result;
|
|
1719
1718
|
}
|
|
1720
|
-
|
|
1721
1719
|
//#endregion
|
|
1722
1720
|
//#region src/assertions/python.ts
|
|
1723
1721
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1787,7 +1785,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1787
1785
|
assertion
|
|
1788
1786
|
};
|
|
1789
1787
|
};
|
|
1790
|
-
|
|
1791
1788
|
//#endregion
|
|
1792
1789
|
//#region src/assertions/redteam.ts
|
|
1793
1790
|
/**
|
|
@@ -1868,7 +1865,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1868
1865
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1869
1866
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1870
1867
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1871
|
-
|
|
1868
|
+
logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1872
1869
|
error: errorMessage,
|
|
1873
1870
|
strategyId: test.metadata.strategyId,
|
|
1874
1871
|
pluginId: test.metadata.pluginId
|
|
@@ -1888,7 +1885,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1888
1885
|
throw error;
|
|
1889
1886
|
}
|
|
1890
1887
|
};
|
|
1891
|
-
|
|
1892
1888
|
//#endregion
|
|
1893
1889
|
//#region src/assertions/refusal.ts
|
|
1894
1890
|
function handleIsRefusal(params) {
|
|
@@ -1916,7 +1912,6 @@ function handleIsRefusal(params) {
|
|
|
1916
1912
|
assertion
|
|
1917
1913
|
};
|
|
1918
1914
|
}
|
|
1919
|
-
|
|
1920
1915
|
//#endregion
|
|
1921
1916
|
//#region src/assertions/regex.ts
|
|
1922
1917
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1941,7 +1936,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1941
1936
|
assertion
|
|
1942
1937
|
};
|
|
1943
1938
|
};
|
|
1944
|
-
|
|
1945
1939
|
//#endregion
|
|
1946
1940
|
//#region src/assertions/rouge.ts
|
|
1947
1941
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1957,7 +1951,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1957
1951
|
assertion
|
|
1958
1952
|
};
|
|
1959
1953
|
}
|
|
1960
|
-
|
|
1961
1954
|
//#endregion
|
|
1962
1955
|
//#region src/ruby/wrapper.ts
|
|
1963
1956
|
/**
|
|
@@ -1973,17 +1966,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1973
1966
|
fs.writeFileSync(tempFilePath, code);
|
|
1974
1967
|
return await runRuby(tempFilePath, method, args);
|
|
1975
1968
|
} catch (error) {
|
|
1976
|
-
|
|
1969
|
+
logger.error(`Error executing Ruby code: ${error}`);
|
|
1977
1970
|
throw error;
|
|
1978
1971
|
} finally {
|
|
1979
1972
|
try {
|
|
1980
1973
|
fs.unlinkSync(tempFilePath);
|
|
1981
1974
|
} catch (error) {
|
|
1982
|
-
|
|
1975
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1983
1976
|
}
|
|
1984
1977
|
}
|
|
1985
1978
|
}
|
|
1986
|
-
|
|
1987
1979
|
//#endregion
|
|
1988
1980
|
//#region src/assertions/ruby.ts
|
|
1989
1981
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -2054,7 +2046,6 @@ end
|
|
|
2054
2046
|
assertion
|
|
2055
2047
|
};
|
|
2056
2048
|
};
|
|
2057
|
-
|
|
2058
2049
|
//#endregion
|
|
2059
2050
|
//#region src/assertions/searchRubric.ts
|
|
2060
2051
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -2066,7 +2057,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
2066
2057
|
}
|
|
2067
2058
|
return result;
|
|
2068
2059
|
}
|
|
2069
|
-
|
|
2070
2060
|
//#endregion
|
|
2071
2061
|
//#region src/assertions/similar.ts
|
|
2072
2062
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2109,7 +2099,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2109
2099
|
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2110
2100
|
};
|
|
2111
2101
|
};
|
|
2112
|
-
|
|
2113
2102
|
//#endregion
|
|
2114
2103
|
//#region src/assertions/sql.ts
|
|
2115
2104
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2201,7 +2190,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2201
2190
|
}
|
|
2202
2191
|
return handleIsSql(assertionParams);
|
|
2203
2192
|
};
|
|
2204
|
-
|
|
2205
2193
|
//#endregion
|
|
2206
2194
|
//#region src/assertions/startsWith.ts
|
|
2207
2195
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2215,7 +2203,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2215
2203
|
assertion
|
|
2216
2204
|
};
|
|
2217
2205
|
};
|
|
2218
|
-
|
|
2219
2206
|
//#endregion
|
|
2220
2207
|
//#region src/assertions/toolCallF1.ts
|
|
2221
2208
|
/**
|
|
@@ -2344,7 +2331,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2344
2331
|
assertion
|
|
2345
2332
|
};
|
|
2346
2333
|
};
|
|
2347
|
-
|
|
2348
2334
|
//#endregion
|
|
2349
2335
|
//#region src/assertions/traceUtils.ts
|
|
2350
2336
|
/**
|
|
@@ -2362,7 +2348,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2362
2348
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2363
2349
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2364
2350
|
}
|
|
2365
|
-
|
|
2366
2351
|
//#endregion
|
|
2367
2352
|
//#region src/assertions/traceErrorSpans.ts
|
|
2368
2353
|
function isErrorSpan(span) {
|
|
@@ -2440,7 +2425,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2440
2425
|
assertion
|
|
2441
2426
|
};
|
|
2442
2427
|
};
|
|
2443
|
-
|
|
2444
2428
|
//#endregion
|
|
2445
2429
|
//#region src/assertions/traceSpanCount.ts
|
|
2446
2430
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2475,7 +2459,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2475
2459
|
assertion
|
|
2476
2460
|
};
|
|
2477
2461
|
};
|
|
2478
|
-
|
|
2479
2462
|
//#endregion
|
|
2480
2463
|
//#region src/assertions/traceSpanDuration.ts
|
|
2481
2464
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2533,7 +2516,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2533
2516
|
assertion
|
|
2534
2517
|
};
|
|
2535
2518
|
};
|
|
2536
|
-
|
|
2537
2519
|
//#endregion
|
|
2538
2520
|
//#region src/assertions/webhook.ts
|
|
2539
2521
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2570,7 +2552,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2570
2552
|
};
|
|
2571
2553
|
}
|
|
2572
2554
|
}
|
|
2573
|
-
|
|
2574
2555
|
//#endregion
|
|
2575
2556
|
//#region src/assertions/wordCount.ts
|
|
2576
2557
|
/**
|
|
@@ -2633,7 +2614,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2633
2614
|
assertion
|
|
2634
2615
|
};
|
|
2635
2616
|
};
|
|
2636
|
-
|
|
2637
2617
|
//#endregion
|
|
2638
2618
|
//#region src/assertions/xml.ts
|
|
2639
2619
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2708,7 +2688,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2708
2688
|
assertion
|
|
2709
2689
|
};
|
|
2710
2690
|
};
|
|
2711
|
-
|
|
2712
2691
|
//#endregion
|
|
2713
2692
|
//#region src/assertions/index.ts
|
|
2714
2693
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2762,7 +2741,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2762
2741
|
"llm-rubric": handleLlmRubric,
|
|
2763
2742
|
meteor: async (params) => {
|
|
2764
2743
|
try {
|
|
2765
|
-
const { handleMeteorAssertion } = await import("./meteor-
|
|
2744
|
+
const { handleMeteorAssertion } = await import("./meteor-DUiCJRC-.js");
|
|
2766
2745
|
return handleMeteorAssertion(params);
|
|
2767
2746
|
} catch (error) {
|
|
2768
2747
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2808,10 +2787,10 @@ function renderMetricName(metric, vars) {
|
|
|
2808
2787
|
if (!metric) return metric;
|
|
2809
2788
|
try {
|
|
2810
2789
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2811
|
-
if (rendered === "" && metric !== "")
|
|
2790
|
+
if (rendered === "" && metric !== "") logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2812
2791
|
return rendered;
|
|
2813
2792
|
} catch (error) {
|
|
2814
|
-
|
|
2793
|
+
logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2815
2794
|
return metric;
|
|
2816
2795
|
}
|
|
2817
2796
|
}
|
|
@@ -2862,12 +2841,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2862
2841
|
spans: traceData.spans || []
|
|
2863
2842
|
};
|
|
2864
2843
|
} catch (error) {
|
|
2865
|
-
|
|
2844
|
+
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2866
2845
|
}
|
|
2867
2846
|
let renderedValue = assertion.value;
|
|
2868
2847
|
let valueFromScript;
|
|
2869
2848
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2870
|
-
const basePath =
|
|
2849
|
+
const basePath = state.basePath || "";
|
|
2871
2850
|
const fileRef = renderedValue.slice(7);
|
|
2872
2851
|
let filePath = fileRef;
|
|
2873
2852
|
let functionName;
|
|
@@ -2879,10 +2858,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2879
2858
|
filePath = path.resolve(basePath, filePath);
|
|
2880
2859
|
if (isJavascriptFile(filePath)) {
|
|
2881
2860
|
valueFromScript = await loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2882
|
-
|
|
2861
|
+
logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2883
2862
|
} else if (filePath.endsWith(".py")) try {
|
|
2884
2863
|
valueFromScript = await runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2885
|
-
|
|
2864
|
+
logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2886
2865
|
} catch (error) {
|
|
2887
2866
|
return {
|
|
2888
2867
|
pass: false,
|
|
@@ -2892,9 +2871,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2892
2871
|
};
|
|
2893
2872
|
}
|
|
2894
2873
|
else if (filePath.endsWith(".rb")) try {
|
|
2895
|
-
const { runRuby } = await import("./rubyUtils-
|
|
2874
|
+
const { runRuby } = await import("./rubyUtils-BUVePouc.js").then((n) => n.t);
|
|
2896
2875
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2897
|
-
|
|
2876
|
+
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2898
2877
|
} catch (error) {
|
|
2899
2878
|
return {
|
|
2900
2879
|
pass: false,
|
|
@@ -2905,7 +2884,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2905
2884
|
}
|
|
2906
2885
|
else renderedValue = processFileReference(renderedValue);
|
|
2907
2886
|
} else if (isPackagePath(renderedValue)) {
|
|
2908
|
-
const basePath =
|
|
2887
|
+
const basePath = state.basePath || "";
|
|
2909
2888
|
const requiredModule = await loadFromPackage(renderedValue, basePath);
|
|
2910
2889
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2911
2890
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -3066,7 +3045,6 @@ var assertions_default = {
|
|
|
3066
3045
|
matchesModeration,
|
|
3067
3046
|
matchesConversationRelevance
|
|
3068
3047
|
};
|
|
3069
|
-
|
|
3070
3048
|
//#endregion
|
|
3071
3049
|
//#region src/database/signal.ts
|
|
3072
3050
|
/**
|
|
@@ -3081,10 +3059,9 @@ function updateSignalFile(evalId) {
|
|
|
3081
3059
|
const content = evalId ? `${evalId}:${now.toISOString()}` : now.toISOString();
|
|
3082
3060
|
fs.writeFileSync(filePath, content);
|
|
3083
3061
|
} catch (err) {
|
|
3084
|
-
|
|
3062
|
+
logger.warn(`Failed to write database signal file: ${err}`);
|
|
3085
3063
|
}
|
|
3086
3064
|
}
|
|
3087
|
-
|
|
3088
3065
|
//#endregion
|
|
3089
3066
|
//#region src/progress/ciProgressReporter.ts
|
|
3090
3067
|
var CIProgressReporter = class {
|
|
@@ -3106,7 +3083,7 @@ var CIProgressReporter = class {
|
|
|
3106
3083
|
}
|
|
3107
3084
|
start() {
|
|
3108
3085
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3109
|
-
|
|
3086
|
+
logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3110
3087
|
this.intervalId = setInterval(() => {
|
|
3111
3088
|
this.logPeriodicUpdate();
|
|
3112
3089
|
}, this.updateIntervalMs);
|
|
@@ -3137,14 +3114,14 @@ var CIProgressReporter = class {
|
|
|
3137
3114
|
this.intervalId = null;
|
|
3138
3115
|
}
|
|
3139
3116
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3140
|
-
|
|
3117
|
+
logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3141
3118
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3142
3119
|
}
|
|
3143
3120
|
error(message) {
|
|
3144
3121
|
const now = Date.now();
|
|
3145
3122
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3146
3123
|
this.lastErrorTime = now;
|
|
3147
|
-
|
|
3124
|
+
logger.error(`[Evaluation Error] ${message}`);
|
|
3148
3125
|
if (process.env.GITHUB_ACTIONS) {
|
|
3149
3126
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3150
3127
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3163,12 +3140,12 @@ var CIProgressReporter = class {
|
|
|
3163
3140
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3164
3141
|
}
|
|
3165
3142
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3166
|
-
|
|
3167
|
-
|
|
3143
|
+
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3144
|
+
logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3168
3145
|
}
|
|
3169
3146
|
logMilestone(percentage) {
|
|
3170
3147
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3171
|
-
|
|
3148
|
+
logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3172
3149
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3173
3150
|
}
|
|
3174
3151
|
formatElapsedTime(ms) {
|
|
@@ -3179,7 +3156,6 @@ var CIProgressReporter = class {
|
|
|
3179
3156
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3180
3157
|
}
|
|
3181
3158
|
};
|
|
3182
|
-
|
|
3183
3159
|
//#endregion
|
|
3184
3160
|
//#region src/providers/azure/warnings.ts
|
|
3185
3161
|
/**
|
|
@@ -3193,13 +3169,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3193
3169
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3194
3170
|
if (modelGradedAsserts.length > 0) {
|
|
3195
3171
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3196
|
-
|
|
3172
|
+
logger.warn(chalk.yellow(`You are using model-graded assertions of types ${chalk.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3197
3173
|
return true;
|
|
3198
3174
|
}
|
|
3199
3175
|
}
|
|
3200
3176
|
return false;
|
|
3201
3177
|
}
|
|
3202
|
-
|
|
3203
3178
|
//#endregion
|
|
3204
3179
|
//#region src/suggestions.ts
|
|
3205
3180
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3230,7 +3205,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3230
3205
|
};
|
|
3231
3206
|
}
|
|
3232
3207
|
}
|
|
3233
|
-
|
|
3234
3208
|
//#endregion
|
|
3235
3209
|
//#region src/tracing/otelConfig.ts
|
|
3236
3210
|
/**
|
|
@@ -3256,7 +3230,6 @@ function getDefaultOtelConfig() {
|
|
|
3256
3230
|
enabled: true
|
|
3257
3231
|
};
|
|
3258
3232
|
}
|
|
3259
|
-
|
|
3260
3233
|
//#endregion
|
|
3261
3234
|
//#region src/tracing/localSpanExporter.ts
|
|
3262
3235
|
/**
|
|
@@ -3276,7 +3249,7 @@ var LocalSpanExporter = class {
|
|
|
3276
3249
|
});
|
|
3277
3250
|
else resultCallback({ code: ExportResultCode.SUCCESS });
|
|
3278
3251
|
}).catch((error) => {
|
|
3279
|
-
|
|
3252
|
+
logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3280
3253
|
resultCallback({
|
|
3281
3254
|
code: ExportResultCode.FAILED,
|
|
3282
3255
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3290,7 +3263,7 @@ var LocalSpanExporter = class {
|
|
|
3290
3263
|
async exportAsync(spans) {
|
|
3291
3264
|
if (spans.length === 0) return;
|
|
3292
3265
|
const traceStore = getTraceStore();
|
|
3293
|
-
|
|
3266
|
+
logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3294
3267
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3295
3268
|
for (const span of spans) {
|
|
3296
3269
|
const traceId = span.spanContext().traceId;
|
|
@@ -3301,12 +3274,12 @@ var LocalSpanExporter = class {
|
|
|
3301
3274
|
let firstError;
|
|
3302
3275
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3303
3276
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3304
|
-
if (result.stored)
|
|
3305
|
-
else
|
|
3277
|
+
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3278
|
+
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3306
3279
|
} catch (error) {
|
|
3307
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY"))
|
|
3280
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3308
3281
|
else {
|
|
3309
|
-
|
|
3282
|
+
logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3310
3283
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3311
3284
|
}
|
|
3312
3285
|
}
|
|
@@ -3343,7 +3316,7 @@ var LocalSpanExporter = class {
|
|
|
3343
3316
|
* Shutdown the exporter. No-op for local storage.
|
|
3344
3317
|
*/
|
|
3345
3318
|
shutdown() {
|
|
3346
|
-
|
|
3319
|
+
logger.debug("[LocalSpanExporter] Shutting down");
|
|
3347
3320
|
return Promise.resolve();
|
|
3348
3321
|
}
|
|
3349
3322
|
/**
|
|
@@ -3353,7 +3326,6 @@ var LocalSpanExporter = class {
|
|
|
3353
3326
|
return Promise.resolve();
|
|
3354
3327
|
}
|
|
3355
3328
|
};
|
|
3356
|
-
|
|
3357
3329
|
//#endregion
|
|
3358
3330
|
//#region src/tracing/otelSdk.ts
|
|
3359
3331
|
let provider = null;
|
|
@@ -3381,21 +3353,21 @@ function getHandlers() {
|
|
|
3381
3353
|
*/
|
|
3382
3354
|
function initializeOtel(config) {
|
|
3383
3355
|
if (initialized) {
|
|
3384
|
-
|
|
3356
|
+
logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3385
3357
|
return;
|
|
3386
3358
|
}
|
|
3387
3359
|
if (!config.enabled) {
|
|
3388
|
-
|
|
3360
|
+
logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3389
3361
|
return;
|
|
3390
3362
|
}
|
|
3391
|
-
|
|
3363
|
+
logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3392
3364
|
serviceName: config.serviceName,
|
|
3393
3365
|
endpoint: config.endpoint,
|
|
3394
3366
|
localExport: config.localExport
|
|
3395
3367
|
});
|
|
3396
3368
|
if (config.debug) diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);
|
|
3397
3369
|
propagation.setGlobalPropagator(new W3CTraceContextPropagator());
|
|
3398
|
-
|
|
3370
|
+
logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3399
3371
|
const resource = resourceFromAttributes({
|
|
3400
3372
|
[ATTR_SERVICE_NAME]: config.serviceName,
|
|
3401
3373
|
[ATTR_SERVICE_VERSION]: VERSION
|
|
@@ -3404,12 +3376,12 @@ function initializeOtel(config) {
|
|
|
3404
3376
|
if (config.localExport) {
|
|
3405
3377
|
const localExporter = new LocalSpanExporter();
|
|
3406
3378
|
spanProcessors.push(new BatchSpanProcessor(localExporter));
|
|
3407
|
-
|
|
3379
|
+
logger.debug("[OtelSdk] Added local span exporter");
|
|
3408
3380
|
}
|
|
3409
3381
|
if (config.endpoint) {
|
|
3410
3382
|
const otlpExporter = new OTLPTraceExporter({ url: config.endpoint });
|
|
3411
3383
|
spanProcessors.push(new BatchSpanProcessor(otlpExporter));
|
|
3412
|
-
|
|
3384
|
+
logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3413
3385
|
}
|
|
3414
3386
|
provider = new NodeTracerProvider({
|
|
3415
3387
|
resource,
|
|
@@ -3417,7 +3389,7 @@ function initializeOtel(config) {
|
|
|
3417
3389
|
});
|
|
3418
3390
|
provider.register();
|
|
3419
3391
|
initialized = true;
|
|
3420
|
-
|
|
3392
|
+
logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3421
3393
|
setupShutdownHandlers();
|
|
3422
3394
|
}
|
|
3423
3395
|
/**
|
|
@@ -3426,12 +3398,12 @@ function initializeOtel(config) {
|
|
|
3426
3398
|
*/
|
|
3427
3399
|
async function shutdownOtel() {
|
|
3428
3400
|
if (!initialized || !provider) return;
|
|
3429
|
-
|
|
3401
|
+
logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3430
3402
|
try {
|
|
3431
3403
|
await provider.shutdown();
|
|
3432
|
-
|
|
3404
|
+
logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3433
3405
|
} catch (error) {
|
|
3434
|
-
|
|
3406
|
+
logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3435
3407
|
} finally {
|
|
3436
3408
|
provider = null;
|
|
3437
3409
|
initialized = false;
|
|
@@ -3444,12 +3416,12 @@ async function shutdownOtel() {
|
|
|
3444
3416
|
*/
|
|
3445
3417
|
async function flushOtel() {
|
|
3446
3418
|
if (!initialized || !provider) return;
|
|
3447
|
-
|
|
3419
|
+
logger.debug("[OtelSdk] Flushing pending spans");
|
|
3448
3420
|
try {
|
|
3449
3421
|
await provider.forceFlush();
|
|
3450
|
-
|
|
3422
|
+
logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3451
3423
|
} catch (error) {
|
|
3452
|
-
|
|
3424
|
+
logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3453
3425
|
}
|
|
3454
3426
|
}
|
|
3455
3427
|
/**
|
|
@@ -3461,7 +3433,7 @@ function setupShutdownHandlers() {
|
|
|
3461
3433
|
const handlers = getHandlers();
|
|
3462
3434
|
if (handlers.registered) return;
|
|
3463
3435
|
const shutdown = async (signal) => {
|
|
3464
|
-
|
|
3436
|
+
logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3465
3437
|
await shutdownOtel();
|
|
3466
3438
|
};
|
|
3467
3439
|
handlers.sigTermHandler = () => {
|
|
@@ -3498,7 +3470,6 @@ function cleanupShutdownHandlers() {
|
|
|
3498
3470
|
}
|
|
3499
3471
|
handlers.registered = false;
|
|
3500
3472
|
}
|
|
3501
|
-
|
|
3502
3473
|
//#endregion
|
|
3503
3474
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3504
3475
|
var JsonlFileWriter = class {
|
|
@@ -3522,7 +3493,6 @@ var JsonlFileWriter = class {
|
|
|
3522
3493
|
});
|
|
3523
3494
|
}
|
|
3524
3495
|
};
|
|
3525
|
-
|
|
3526
3496
|
//#endregion
|
|
3527
3497
|
//#region src/util/promptMatching.ts
|
|
3528
3498
|
/**
|
|
@@ -3560,7 +3530,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3560
3530
|
if (allowedPrompts.length === 0) return false;
|
|
3561
3531
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3562
3532
|
}
|
|
3563
|
-
|
|
3564
3533
|
//#endregion
|
|
3565
3534
|
//#region src/evaluator.ts
|
|
3566
3535
|
/**
|
|
@@ -3752,7 +3721,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3752
3721
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3753
3722
|
else {
|
|
3754
3723
|
const activeProvider = isApiProvider(test.provider) ? test.provider : provider;
|
|
3755
|
-
|
|
3724
|
+
logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3756
3725
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3757
3726
|
const callApiContext = {
|
|
3758
3727
|
vars,
|
|
@@ -3763,7 +3732,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3763
3732
|
filters,
|
|
3764
3733
|
originalProvider: provider,
|
|
3765
3734
|
test,
|
|
3766
|
-
logger
|
|
3735
|
+
logger,
|
|
3767
3736
|
getCache,
|
|
3768
3737
|
repeatIndex
|
|
3769
3738
|
};
|
|
@@ -3780,8 +3749,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3780
3749
|
const sanitizedMetadata = safeJsonStringify(response.metadata);
|
|
3781
3750
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3782
3751
|
}
|
|
3783
|
-
|
|
3784
|
-
|
|
3752
|
+
logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3753
|
+
logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3785
3754
|
}
|
|
3786
3755
|
latencyMs = Date.now() - startTime;
|
|
3787
3756
|
let conversationLastInput = void 0;
|
|
@@ -3798,12 +3767,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3798
3767
|
metadata: response.metadata
|
|
3799
3768
|
});
|
|
3800
3769
|
}
|
|
3801
|
-
|
|
3802
|
-
|
|
3770
|
+
logger.debug("Evaluator response", { responsePreview: (safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3771
|
+
logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3803
3772
|
if (!response.cached && provider.delay > 0) {
|
|
3804
|
-
|
|
3773
|
+
logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3805
3774
|
await sleep(provider.delay);
|
|
3806
|
-
} else if (response.cached)
|
|
3775
|
+
} else if (response.cached) logger.debug(`Skipping delay because response is cached`);
|
|
3807
3776
|
const ret = {
|
|
3808
3777
|
...setup,
|
|
3809
3778
|
response,
|
|
@@ -3906,7 +3875,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3906
3875
|
promptIdx,
|
|
3907
3876
|
testIdx
|
|
3908
3877
|
});
|
|
3909
|
-
if (!(err instanceof Error && err.name === "AbortError"))
|
|
3878
|
+
if (!(err instanceof Error && err.name === "AbortError")) logger.error("Provider call failed during eval", logContext);
|
|
3910
3879
|
return [{
|
|
3911
3880
|
...setup,
|
|
3912
3881
|
error: errorWithStack,
|
|
@@ -3989,7 +3958,7 @@ function generateVarCombinations(vars) {
|
|
|
3989
3958
|
let values = [];
|
|
3990
3959
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
3991
3960
|
const filePath = vars[key].slice(7);
|
|
3992
|
-
const basePath =
|
|
3961
|
+
const basePath = state.basePath || "";
|
|
3993
3962
|
values = (globSync(filePath, {
|
|
3994
3963
|
cwd: basePath || process.cwd(),
|
|
3995
3964
|
windowsPathsNoEscape: true
|
|
@@ -4029,28 +3998,28 @@ var Evaluator = class {
|
|
|
4029
3998
|
this.conversations = {};
|
|
4030
3999
|
this.registers = {};
|
|
4031
4000
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
4032
|
-
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
4001
|
+
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
4033
4002
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
4034
|
-
|
|
4003
|
+
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
4035
4004
|
retryAfterMs: data.retryAfterMs,
|
|
4036
4005
|
resetAt: data.resetAt,
|
|
4037
4006
|
concurrencyChange: data.concurrencyChange
|
|
4038
4007
|
});
|
|
4039
4008
|
});
|
|
4040
4009
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
4041
|
-
|
|
4010
|
+
logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
4042
4011
|
requestLimit: data.requestLimit,
|
|
4043
4012
|
tokenLimit: data.tokenLimit
|
|
4044
4013
|
});
|
|
4045
4014
|
});
|
|
4046
4015
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
4047
|
-
|
|
4016
|
+
logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
4048
4017
|
previous: data.previous,
|
|
4049
4018
|
current: data.current
|
|
4050
4019
|
});
|
|
4051
4020
|
});
|
|
4052
4021
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
4053
|
-
|
|
4022
|
+
logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
4054
4023
|
previous: data.previous,
|
|
4055
4024
|
current: data.current
|
|
4056
4025
|
});
|
|
@@ -4107,7 +4076,7 @@ var Evaluator = class {
|
|
|
4107
4076
|
const checkAbort = () => {
|
|
4108
4077
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4109
4078
|
};
|
|
4110
|
-
if (!options.silent)
|
|
4079
|
+
if (!options.silent) logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4111
4080
|
checkAbort();
|
|
4112
4081
|
const prompts = [];
|
|
4113
4082
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4119,32 +4088,32 @@ var Evaluator = class {
|
|
|
4119
4088
|
}
|
|
4120
4089
|
testSuite = (await runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4121
4090
|
if (options.generateSuggestions) {
|
|
4122
|
-
|
|
4091
|
+
logger.info(`Generating prompt variations...`);
|
|
4123
4092
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4124
4093
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4125
|
-
|
|
4094
|
+
logger.info(chalk.blue("Generated prompts:"));
|
|
4126
4095
|
let numAdded = 0;
|
|
4127
4096
|
for (const prompt of newPrompts) {
|
|
4128
|
-
|
|
4129
|
-
|
|
4130
|
-
|
|
4097
|
+
logger.info("--------------------------------------------------------");
|
|
4098
|
+
logger.info(`${prompt}`);
|
|
4099
|
+
logger.info("--------------------------------------------------------");
|
|
4131
4100
|
if (await promptYesNo("Do you want to test this prompt?", false)) {
|
|
4132
4101
|
testSuite.prompts.push({
|
|
4133
4102
|
raw: prompt,
|
|
4134
4103
|
label: prompt
|
|
4135
4104
|
});
|
|
4136
4105
|
numAdded++;
|
|
4137
|
-
} else
|
|
4106
|
+
} else logger.info("Skipping this prompt.");
|
|
4138
4107
|
}
|
|
4139
4108
|
if (numAdded < 1) {
|
|
4140
|
-
|
|
4109
|
+
logger.info(chalk.red("No prompts selected. Aborting."));
|
|
4141
4110
|
process.exitCode = 1;
|
|
4142
4111
|
return this.evalRecord;
|
|
4143
4112
|
}
|
|
4144
4113
|
}
|
|
4145
4114
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4146
|
-
if (
|
|
4147
|
-
|
|
4115
|
+
if (state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4116
|
+
logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4148
4117
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4149
4118
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4150
4119
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4182,7 +4151,7 @@ var Evaluator = class {
|
|
|
4182
4151
|
await this.evalRecord.addPrompts(prompts);
|
|
4183
4152
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4184
4153
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4185
|
-
|
|
4154
|
+
telemetry.record("feature_used", { feature: "scenarios" });
|
|
4186
4155
|
let scenarioIndex = 0;
|
|
4187
4156
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4188
4157
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4246,7 +4215,7 @@ var Evaluator = class {
|
|
|
4246
4215
|
}
|
|
4247
4216
|
const runEvalOptions = [];
|
|
4248
4217
|
let testIdx = 0;
|
|
4249
|
-
let concurrency = options.maxConcurrency ||
|
|
4218
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4250
4219
|
for (let index = 0; index < tests.length; index++) {
|
|
4251
4220
|
const testCase = tests[index];
|
|
4252
4221
|
invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4266,7 +4235,7 @@ var Evaluator = class {
|
|
|
4266
4235
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4267
4236
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4268
4237
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4269
|
-
const { loadApiProvider } = await import("./providers-
|
|
4238
|
+
const { loadApiProvider } = await import("./providers-GIQ2TcsA.js");
|
|
4270
4239
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4271
4240
|
} else testCase.provider = defaultProvider;
|
|
4272
4241
|
}
|
|
@@ -4293,7 +4262,7 @@ var Evaluator = class {
|
|
|
4293
4262
|
const promptId = generateIdFromPrompt(prompt);
|
|
4294
4263
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4295
4264
|
if (promptIdx === void 0) {
|
|
4296
|
-
|
|
4265
|
+
logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4297
4266
|
continue;
|
|
4298
4267
|
}
|
|
4299
4268
|
runEvalOptions.push({
|
|
@@ -4316,7 +4285,7 @@ var Evaluator = class {
|
|
|
4316
4285
|
options: testOptions
|
|
4317
4286
|
};
|
|
4318
4287
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4319
|
-
|
|
4288
|
+
logger.debug(`[Evaluator] Tracing check: env=${getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4320
4289
|
if (tracingEnabled) return {
|
|
4321
4290
|
...baseTest,
|
|
4322
4291
|
metadata: {
|
|
@@ -4349,27 +4318,27 @@ var Evaluator = class {
|
|
|
4349
4318
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4350
4319
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4351
4320
|
}
|
|
4352
|
-
if (
|
|
4353
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
4354
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors:
|
|
4321
|
+
if (state.resume && this.evalRecord.persisted) try {
|
|
4322
|
+
const { default: EvalResult } = await import("./evalResult-CDQiuUuf.js").then((n) => n.n);
|
|
4323
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4355
4324
|
const originalCount = runEvalOptions.length;
|
|
4356
4325
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4357
4326
|
const step = runEvalOptions[i];
|
|
4358
4327
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4359
4328
|
}
|
|
4360
4329
|
const skipped = originalCount - runEvalOptions.length;
|
|
4361
|
-
if (skipped > 0)
|
|
4330
|
+
if (skipped > 0) logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4362
4331
|
} catch (err) {
|
|
4363
|
-
|
|
4332
|
+
logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4364
4333
|
}
|
|
4365
4334
|
if (concurrency > 1) {
|
|
4366
4335
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4367
4336
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4368
4337
|
if (usesConversation) {
|
|
4369
|
-
|
|
4338
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan("_conversation")} variable is used.`);
|
|
4370
4339
|
concurrency = 1;
|
|
4371
4340
|
} else if (usesStoreOutputAs) {
|
|
4372
|
-
|
|
4341
|
+
logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4373
4342
|
concurrency = 1;
|
|
4374
4343
|
}
|
|
4375
4344
|
}
|
|
@@ -4400,14 +4369,14 @@ var Evaluator = class {
|
|
|
4400
4369
|
await this.evalRecord.addResult(row);
|
|
4401
4370
|
} catch (error) {
|
|
4402
4371
|
const resultSummary = summarizeEvaluateResultForLogging(row);
|
|
4403
|
-
|
|
4372
|
+
logger.error(`Error saving result: ${error} ${safeJsonStringify(resultSummary)}`);
|
|
4404
4373
|
}
|
|
4405
4374
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4406
4375
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4407
4376
|
if (typeof httpStatus === "number" && isNonTransientHttpStatus(httpStatus)) {
|
|
4408
4377
|
targetUnavailable = true;
|
|
4409
4378
|
targetErrorStatus = httpStatus;
|
|
4410
|
-
|
|
4379
|
+
logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4411
4380
|
targetErrorAbortController.abort();
|
|
4412
4381
|
break;
|
|
4413
4382
|
}
|
|
@@ -4427,7 +4396,7 @@ var Evaluator = class {
|
|
|
4427
4396
|
if (testSuite.derivedMetrics) {
|
|
4428
4397
|
const math = await import("mathjs");
|
|
4429
4398
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4430
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count"))
|
|
4399
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4431
4400
|
const evalContext = {
|
|
4432
4401
|
...metrics.namedScores,
|
|
4433
4402
|
__count: promptEvalCount
|
|
@@ -4442,7 +4411,7 @@ var Evaluator = class {
|
|
|
4442
4411
|
}
|
|
4443
4412
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4444
4413
|
} catch (error) {
|
|
4445
|
-
|
|
4414
|
+
logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4446
4415
|
}
|
|
4447
4416
|
}
|
|
4448
4417
|
}
|
|
@@ -4481,7 +4450,7 @@ var Evaluator = class {
|
|
|
4481
4450
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4482
4451
|
evalStep.provider.cleanup();
|
|
4483
4452
|
} catch (cleanupErr) {
|
|
4484
|
-
|
|
4453
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4485
4454
|
}
|
|
4486
4455
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4487
4456
|
}, timeoutMs);
|
|
@@ -4545,8 +4514,8 @@ var Evaluator = class {
|
|
|
4545
4514
|
}
|
|
4546
4515
|
};
|
|
4547
4516
|
const originalProgressCallback = this.options.progressCallback;
|
|
4548
|
-
const isWebUI = Boolean(
|
|
4549
|
-
|
|
4517
|
+
const isWebUI = Boolean(state.webUI);
|
|
4518
|
+
logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4550
4519
|
if (isCI() && !isWebUI) {
|
|
4551
4520
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4552
4521
|
ciProgressReporter.start();
|
|
@@ -4556,20 +4525,20 @@ var Evaluator = class {
|
|
|
4556
4525
|
if (isWebUI) {
|
|
4557
4526
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4558
4527
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4559
|
-
|
|
4528
|
+
logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4560
4529
|
} else if (progressBarManager) {
|
|
4561
4530
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4562
4531
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4563
4532
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4564
|
-
else
|
|
4533
|
+
else logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4565
4534
|
};
|
|
4566
4535
|
const serialRunEvalOptions = [];
|
|
4567
4536
|
const concurrentRunEvalOptions = [];
|
|
4568
4537
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4569
4538
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4570
4539
|
if (!this.options.silent) {
|
|
4571
|
-
if (serialRunEvalOptions.length > 0)
|
|
4572
|
-
if (concurrentRunEvalOptions.length > 0)
|
|
4540
|
+
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4541
|
+
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4573
4542
|
}
|
|
4574
4543
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4575
4544
|
try {
|
|
@@ -4578,7 +4547,7 @@ var Evaluator = class {
|
|
|
4578
4547
|
if (isWebUI) {
|
|
4579
4548
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4580
4549
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4581
|
-
|
|
4550
|
+
logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4582
4551
|
}
|
|
4583
4552
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4584
4553
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4593,9 +4562,9 @@ var Evaluator = class {
|
|
|
4593
4562
|
});
|
|
4594
4563
|
} catch (err) {
|
|
4595
4564
|
if (combinedAbortSignal.aborted) {
|
|
4596
|
-
if (evalTimedOut)
|
|
4565
|
+
if (evalTimedOut) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4597
4566
|
else if (!targetUnavailable) {
|
|
4598
|
-
|
|
4567
|
+
logger.info("Evaluation interrupted, saving progress...");
|
|
4599
4568
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4600
4569
|
if (progressBarManager) progressBarManager.stop();
|
|
4601
4570
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4625,10 +4594,10 @@ var Evaluator = class {
|
|
|
4625
4594
|
let compareCount = 0;
|
|
4626
4595
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4627
4596
|
compareCount++;
|
|
4628
|
-
if (isWebUI)
|
|
4597
|
+
if (isWebUI) logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4629
4598
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4630
4599
|
if (resultsToCompare.length === 0) {
|
|
4631
|
-
|
|
4600
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4632
4601
|
continue;
|
|
4633
4602
|
}
|
|
4634
4603
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4690,16 +4659,16 @@ var Evaluator = class {
|
|
|
4690
4659
|
}
|
|
4691
4660
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4692
4661
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4693
|
-
else if (!isWebUI)
|
|
4662
|
+
else if (!isWebUI) logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4694
4663
|
}
|
|
4695
4664
|
}
|
|
4696
4665
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4697
4666
|
if (maxScoreRowsCount > 0) {
|
|
4698
|
-
|
|
4667
|
+
logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4699
4668
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4700
4669
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4701
4670
|
if (resultsToCompare.length === 0) {
|
|
4702
|
-
|
|
4671
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4703
4672
|
continue;
|
|
4704
4673
|
}
|
|
4705
4674
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4707,7 +4676,7 @@ var Evaluator = class {
|
|
|
4707
4676
|
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4708
4677
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4709
4678
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4710
|
-
else if (!isWebUI)
|
|
4679
|
+
else if (!isWebUI) logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4711
4680
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4712
4681
|
const result = resultsToCompare[index];
|
|
4713
4682
|
const maxScoreGradingResult = {
|
|
@@ -4751,7 +4720,7 @@ var Evaluator = class {
|
|
|
4751
4720
|
progressBarManager.stop();
|
|
4752
4721
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4753
4722
|
} catch (cleanupErr) {
|
|
4754
|
-
|
|
4723
|
+
logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4755
4724
|
}
|
|
4756
4725
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4757
4726
|
if (evalTimedOut) {
|
|
@@ -4824,7 +4793,7 @@ var Evaluator = class {
|
|
|
4824
4793
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4825
4794
|
})));
|
|
4826
4795
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4827
|
-
|
|
4796
|
+
telemetry.record("eval_ran", {
|
|
4828
4797
|
numPrompts: prompts.length,
|
|
4829
4798
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4830
4799
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4872,26 +4841,26 @@ var Evaluator = class {
|
|
|
4872
4841
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4873
4842
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4874
4843
|
if (tracingEnabled) {
|
|
4875
|
-
|
|
4844
|
+
logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4876
4845
|
initializeOtel(getDefaultOtelConfig());
|
|
4877
4846
|
}
|
|
4878
4847
|
try {
|
|
4879
4848
|
return await this._runEvaluation();
|
|
4880
4849
|
} finally {
|
|
4881
4850
|
if (tracingEnabled) {
|
|
4882
|
-
|
|
4851
|
+
logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4883
4852
|
await flushOtel();
|
|
4884
4853
|
await shutdownOtel();
|
|
4885
4854
|
}
|
|
4886
4855
|
if (isOtlpReceiverStarted()) {
|
|
4887
|
-
|
|
4856
|
+
logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4888
4857
|
await sleep(3e3);
|
|
4889
4858
|
}
|
|
4890
4859
|
await stopOtlpReceiverIfNeeded();
|
|
4891
4860
|
await providerRegistry.shutdownAll();
|
|
4892
4861
|
if (this.rateLimitRegistry) {
|
|
4893
4862
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4894
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0)
|
|
4863
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4895
4864
|
totalRequests: m.totalRequests,
|
|
4896
4865
|
completedRequests: m.completedRequests,
|
|
4897
4866
|
failedRequests: m.failedRequests,
|
|
@@ -4904,14 +4873,13 @@ var Evaluator = class {
|
|
|
4904
4873
|
}
|
|
4905
4874
|
this.rateLimitRegistry?.dispose();
|
|
4906
4875
|
redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4907
|
-
|
|
4876
|
+
state.maxConcurrency = void 0;
|
|
4908
4877
|
}
|
|
4909
4878
|
}
|
|
4910
4879
|
};
|
|
4911
4880
|
function evaluate$1(testSuite, evalRecord, options) {
|
|
4912
4881
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4913
4882
|
}
|
|
4914
|
-
|
|
4915
4883
|
//#endregion
|
|
4916
4884
|
//#region src/guardrails.ts
|
|
4917
4885
|
const API_BASE_URL = `${getShareApiBaseUrl()}/v1`;
|
|
@@ -4925,7 +4893,7 @@ async function makeRequest(endpoint, input) {
|
|
|
4925
4893
|
if (!response.data) throw new Error("No data returned from API");
|
|
4926
4894
|
return response.data;
|
|
4927
4895
|
} catch (error) {
|
|
4928
|
-
|
|
4896
|
+
logger.error(`Guardrails API error: ${error}`);
|
|
4929
4897
|
throw error;
|
|
4930
4898
|
}
|
|
4931
4899
|
}
|
|
@@ -4942,7 +4910,7 @@ async function makeAdaptiveRequest(request) {
|
|
|
4942
4910
|
if (!response.data) throw new Error("No data returned from API");
|
|
4943
4911
|
return response.data;
|
|
4944
4912
|
} catch (error) {
|
|
4945
|
-
|
|
4913
|
+
logger.error(`Guardrails API error: ${error}`);
|
|
4946
4914
|
throw error;
|
|
4947
4915
|
}
|
|
4948
4916
|
}
|
|
@@ -4960,8 +4928,6 @@ const guardrails = {
|
|
|
4960
4928
|
return makeAdaptiveRequest(request);
|
|
4961
4929
|
}
|
|
4962
4930
|
};
|
|
4963
|
-
var guardrails_default = guardrails;
|
|
4964
|
-
|
|
4965
4931
|
//#endregion
|
|
4966
4932
|
//#region src/migrate.ts
|
|
4967
4933
|
/**
|
|
@@ -4996,12 +4962,12 @@ async function runDbMigrations() {
|
|
|
4996
4962
|
const projectRoot = dir.split("dist/server/src")[0];
|
|
4997
4963
|
migrationsFolder = path$2.join(projectRoot, "dist", "promptfoo", "drizzle");
|
|
4998
4964
|
} else migrationsFolder = path$2.join(dir, "..", "drizzle");
|
|
4999
|
-
|
|
4965
|
+
logger.debug(`Running database migrations from: ${migrationsFolder}`);
|
|
5000
4966
|
migrate(db, { migrationsFolder });
|
|
5001
|
-
|
|
4967
|
+
logger.debug("Database migrations completed");
|
|
5002
4968
|
resolve();
|
|
5003
4969
|
} catch (error) {
|
|
5004
|
-
|
|
4970
|
+
logger.error(`Database migration failed: ${error}`);
|
|
5005
4971
|
reject(error);
|
|
5006
4972
|
}
|
|
5007
4973
|
});
|
|
@@ -5011,7 +4977,6 @@ try {
|
|
|
5011
4977
|
const currentModulePath = resolve(fileURLToPath(import.meta.url));
|
|
5012
4978
|
if (currentModulePath === resolve(process.argv[1]) && (currentModulePath.endsWith("migrate.js") || currentModulePath.endsWith("migrate.ts"))) runDbMigrations().then(() => process.exit(0)).catch(() => process.exit(1));
|
|
5013
4979
|
} catch {}
|
|
5014
|
-
|
|
5015
4980
|
//#endregion
|
|
5016
4981
|
//#region src/redteam/sharedFrontend.ts
|
|
5017
4982
|
function getRiskCategorySeverityMap(plugins) {
|
|
@@ -5028,7 +4993,6 @@ function getRiskCategorySeverityMap(plugins) {
|
|
|
5028
4993
|
...overrides
|
|
5029
4994
|
};
|
|
5030
4995
|
}
|
|
5031
|
-
|
|
5032
4996
|
//#endregion
|
|
5033
4997
|
//#region src/util/calculateFilteredMetrics.ts
|
|
5034
4998
|
/**
|
|
@@ -5082,12 +5046,12 @@ async function calculateFilteredMetrics(opts) {
|
|
|
5082
5046
|
try {
|
|
5083
5047
|
const countResult = await getResultCount(whereSql);
|
|
5084
5048
|
if (countResult > MAX_RESULTS_FOR_METRICS) {
|
|
5085
|
-
|
|
5049
|
+
logger.warn(`Filtered result count ${countResult} exceeds limit ${MAX_RESULTS_FOR_METRICS}`, { evalId: opts.evalId });
|
|
5086
5050
|
throw new Error(`Result count ${countResult} exceeds maximum ${MAX_RESULTS_FOR_METRICS}`);
|
|
5087
5051
|
}
|
|
5088
5052
|
return await calculateWithOptimizedQuery(opts);
|
|
5089
5053
|
} catch (error) {
|
|
5090
|
-
|
|
5054
|
+
logger.error("Failed to calculate filtered metrics with optimized query", { error });
|
|
5091
5055
|
return createEmptyMetricsArray(numPrompts);
|
|
5092
5056
|
}
|
|
5093
5057
|
}
|
|
@@ -5140,7 +5104,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5140
5104
|
for (const row of basicResults) {
|
|
5141
5105
|
const idx = row.prompt_idx;
|
|
5142
5106
|
if (idx < 0 || idx >= numPrompts) {
|
|
5143
|
-
|
|
5107
|
+
logger.warn(`Invalid prompt_idx ${idx}, expected 0-${numPrompts - 1}`);
|
|
5144
5108
|
continue;
|
|
5145
5109
|
}
|
|
5146
5110
|
metrics[idx] = {
|
|
@@ -5165,7 +5129,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5165
5129
|
}
|
|
5166
5130
|
await aggregateNamedScores(metrics, whereSql);
|
|
5167
5131
|
await aggregateAssertions(metrics, whereSql);
|
|
5168
|
-
|
|
5132
|
+
logger.debug("Filtered metrics calculated", {
|
|
5169
5133
|
numPrompts,
|
|
5170
5134
|
metricsCount: basicResults.length
|
|
5171
5135
|
});
|
|
@@ -5286,7 +5250,6 @@ function createEmptyMetricsArray(numPrompts) {
|
|
|
5286
5250
|
cost: 0
|
|
5287
5251
|
}));
|
|
5288
5252
|
}
|
|
5289
|
-
|
|
5290
5253
|
//#endregion
|
|
5291
5254
|
//#region src/util/convertEvalResultsToTable.ts
|
|
5292
5255
|
/**
|
|
@@ -5419,7 +5382,6 @@ function convertResultsToTable(eval_) {
|
|
|
5419
5382
|
body: rows
|
|
5420
5383
|
};
|
|
5421
5384
|
}
|
|
5422
|
-
|
|
5423
5385
|
//#endregion
|
|
5424
5386
|
//#region src/util/exportToFile/index.ts
|
|
5425
5387
|
function convertEvalResultToTableCell(result) {
|
|
@@ -5497,7 +5459,6 @@ function convertTestResultsToTableRow(results, varsForHeader) {
|
|
|
5497
5459
|
for (const result of results) row.outputs[result.promptIdx] = convertEvalResultToTableCell(result);
|
|
5498
5460
|
return row;
|
|
5499
5461
|
}
|
|
5500
|
-
|
|
5501
5462
|
//#endregion
|
|
5502
5463
|
//#region src/models/evalPerformance.ts
|
|
5503
5464
|
const distinctCountCache = /* @__PURE__ */ new Map();
|
|
@@ -5514,7 +5475,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5514
5475
|
const cacheKey = `distinct:${evalId}`;
|
|
5515
5476
|
const cached = distinctCountCache.get(cacheKey);
|
|
5516
5477
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5517
|
-
|
|
5478
|
+
logger.debug(`Using cached distinct count for eval ${evalId}: ${cached.count}`);
|
|
5518
5479
|
return cached.count;
|
|
5519
5480
|
}
|
|
5520
5481
|
const db = getDb();
|
|
@@ -5522,7 +5483,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5522
5483
|
const result = db.select({ count: sql`COUNT(DISTINCT test_idx)` }).from(evalResultsTable).where(sql`eval_id = ${evalId}`).all();
|
|
5523
5484
|
const count = Number(result[0]?.count ?? 0);
|
|
5524
5485
|
const duration = Date.now() - start;
|
|
5525
|
-
|
|
5486
|
+
logger.debug(`Distinct count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5526
5487
|
distinctCountCache.set(cacheKey, {
|
|
5527
5488
|
count,
|
|
5528
5489
|
timestamp: Date.now()
|
|
@@ -5540,7 +5501,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5540
5501
|
const cacheKey = `total:${evalId}`;
|
|
5541
5502
|
const cached = totalRowCountCache.get(cacheKey);
|
|
5542
5503
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5543
|
-
|
|
5504
|
+
logger.debug(`Using cached total row count for eval ${evalId}: ${cached.count}`);
|
|
5544
5505
|
return cached.count;
|
|
5545
5506
|
}
|
|
5546
5507
|
const db = getDb();
|
|
@@ -5548,7 +5509,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5548
5509
|
const result = db.select({ count: sql`COUNT(*)` }).from(evalResultsTable).where(sql`eval_id = ${evalId}`).all();
|
|
5549
5510
|
const count = Number(result[0]?.count ?? 0);
|
|
5550
5511
|
const duration = Date.now() - start;
|
|
5551
|
-
|
|
5512
|
+
logger.debug(`Total row count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5552
5513
|
totalRowCountCache.set(cacheKey, {
|
|
5553
5514
|
count,
|
|
5554
5515
|
timestamp: Date.now()
|
|
@@ -5581,7 +5542,7 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5581
5542
|
`;
|
|
5582
5543
|
const countResult = db.all(countQuery);
|
|
5583
5544
|
const filteredCount = Number(countResult[0]?.count ?? 0);
|
|
5584
|
-
|
|
5545
|
+
logger.debug(`Optimized count query took ${Date.now() - countStart}ms`);
|
|
5585
5546
|
const idxStart = Date.now();
|
|
5586
5547
|
const idxQuery = sql`
|
|
5587
5548
|
SELECT DISTINCT test_idx
|
|
@@ -5592,13 +5553,12 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5592
5553
|
OFFSET ${offset}
|
|
5593
5554
|
`;
|
|
5594
5555
|
const testIndices = db.all(idxQuery).map((row) => row.test_idx);
|
|
5595
|
-
|
|
5556
|
+
logger.debug(`Optimized index query took ${Date.now() - idxStart}ms`);
|
|
5596
5557
|
return {
|
|
5597
5558
|
testIndices,
|
|
5598
5559
|
filteredCount
|
|
5599
5560
|
};
|
|
5600
5561
|
}
|
|
5601
|
-
|
|
5602
5562
|
//#endregion
|
|
5603
5563
|
//#region src/models/eval.ts
|
|
5604
5564
|
/**
|
|
@@ -5693,7 +5653,7 @@ var EvalQueries = class {
|
|
|
5693
5653
|
try {
|
|
5694
5654
|
db.update(evalsTable).set({ vars }).where(eq(evalsTable.id, evalId)).run();
|
|
5695
5655
|
} catch (e) {
|
|
5696
|
-
|
|
5656
|
+
logger.error(`Error setting vars: ${vars} for eval ${evalId}: ${e}`);
|
|
5697
5657
|
}
|
|
5698
5658
|
}
|
|
5699
5659
|
static async getMetadataKeysFromEval(evalId, comparisonEvalIds = []) {
|
|
@@ -5714,7 +5674,7 @@ var EvalQueries = class {
|
|
|
5714
5674
|
`;
|
|
5715
5675
|
return (await db.all(query)).map((r) => r.key);
|
|
5716
5676
|
} catch (error) {
|
|
5717
|
-
|
|
5677
|
+
logger.error(`Error fetching metadata keys for eval ${evalId} and comparisons [${comparisonEvalIds.join(", ")}]: ${error}`);
|
|
5718
5678
|
return [];
|
|
5719
5679
|
}
|
|
5720
5680
|
}
|
|
@@ -5745,7 +5705,7 @@ var EvalQueries = class {
|
|
|
5745
5705
|
const values = db.all(query).map(({ value }) => String(value).trim()).filter((value) => value.length > 0);
|
|
5746
5706
|
return Array.from(new Set(values));
|
|
5747
5707
|
} catch (error) {
|
|
5748
|
-
|
|
5708
|
+
logger.error(`Error fetching metadata values for eval ${evalId} and key ${trimmedKey}: ${error instanceof Error ? error.message : String(error)}`);
|
|
5749
5709
|
return [];
|
|
5750
5710
|
}
|
|
5751
5711
|
}
|
|
@@ -5817,7 +5777,7 @@ var Eval = class Eval {
|
|
|
5817
5777
|
}
|
|
5818
5778
|
return evalInstance;
|
|
5819
5779
|
}
|
|
5820
|
-
static async getMany(limit =
|
|
5780
|
+
static async getMany(limit = 100) {
|
|
5821
5781
|
return (await getDb().select().from(evalsTable).limit(limit).orderBy(desc(evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
5822
5782
|
id: e.id,
|
|
5823
5783
|
createdAt: new Date(e.createdAt),
|
|
@@ -5832,7 +5792,7 @@ var Eval = class Eval {
|
|
|
5832
5792
|
* @param offset - Number of evals to skip
|
|
5833
5793
|
* @param limit - Maximum number of evals to return
|
|
5834
5794
|
*/
|
|
5835
|
-
static async getPaginated(offset = 0, limit =
|
|
5795
|
+
static async getPaginated(offset = 0, limit = 100) {
|
|
5836
5796
|
return (await getDb().select().from(evalsTable).orderBy(desc(evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
5837
5797
|
id: e.id,
|
|
5838
5798
|
createdAt: new Date(e.createdAt),
|
|
@@ -5878,7 +5838,7 @@ var Eval = class Eval {
|
|
|
5878
5838
|
evalId,
|
|
5879
5839
|
promptId
|
|
5880
5840
|
}).onConflictDoNothing().run();
|
|
5881
|
-
|
|
5841
|
+
logger.debug(`Inserting prompt ${promptId}`);
|
|
5882
5842
|
}
|
|
5883
5843
|
if (opts?.results && opts.results.length > 0) {
|
|
5884
5844
|
const res = db.insert(evalResultsTable).values(opts.results?.map((r) => ({
|
|
@@ -5886,7 +5846,7 @@ var Eval = class Eval {
|
|
|
5886
5846
|
evalId,
|
|
5887
5847
|
id: crypto.randomUUID()
|
|
5888
5848
|
}))).run();
|
|
5889
|
-
|
|
5849
|
+
logger.debug(`Inserted ${res.changes} eval results`);
|
|
5890
5850
|
}
|
|
5891
5851
|
db.insert(datasetsTable).values({
|
|
5892
5852
|
id: datasetId,
|
|
@@ -5896,7 +5856,7 @@ var Eval = class Eval {
|
|
|
5896
5856
|
evalId,
|
|
5897
5857
|
datasetId
|
|
5898
5858
|
}).onConflictDoNothing().run();
|
|
5899
|
-
|
|
5859
|
+
logger.debug(`Inserting dataset ${datasetId}`);
|
|
5900
5860
|
if (config.tags) for (const [tagKey, tagValue] of Object.entries(config.tags)) {
|
|
5901
5861
|
const tagId = sha256(`${tagKey}:${tagValue}`);
|
|
5902
5862
|
db.insert(tagsTable).values({
|
|
@@ -5908,7 +5868,7 @@ var Eval = class Eval {
|
|
|
5908
5868
|
evalId,
|
|
5909
5869
|
tagId
|
|
5910
5870
|
}).onConflictDoNothing().run();
|
|
5911
|
-
|
|
5871
|
+
logger.debug(`Inserting tag ${tagId}`);
|
|
5912
5872
|
}
|
|
5913
5873
|
});
|
|
5914
5874
|
return new Eval(config, {
|
|
@@ -6089,7 +6049,7 @@ var Eval = class Eval {
|
|
|
6089
6049
|
if (type === "metric") {
|
|
6090
6050
|
const metricKey = field || value;
|
|
6091
6051
|
if (!metricKey) {
|
|
6092
|
-
|
|
6052
|
+
logger.warn("Invalid metric filter: missing field and value", { filter });
|
|
6093
6053
|
return;
|
|
6094
6054
|
}
|
|
6095
6055
|
const jsonPath = buildSafeJsonPath(metricKey);
|
|
@@ -6103,7 +6063,7 @@ var Eval = class Eval {
|
|
|
6103
6063
|
else if (operator === "lt") condition = sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) < ${numericValue}`;
|
|
6104
6064
|
else if (operator === "lte") condition = sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) <= ${numericValue}`;
|
|
6105
6065
|
} else {
|
|
6106
|
-
|
|
6066
|
+
logger.warn("Invalid numeric value in metric filter", {
|
|
6107
6067
|
metricKey,
|
|
6108
6068
|
value,
|
|
6109
6069
|
numericValue,
|
|
@@ -6181,7 +6141,7 @@ var Eval = class Eval {
|
|
|
6181
6141
|
const countStart = Date.now();
|
|
6182
6142
|
const countResult = await db.get(filteredCountQuery);
|
|
6183
6143
|
const countEnd = Date.now();
|
|
6184
|
-
|
|
6144
|
+
logger.debug(`Count query took ${countEnd - countStart}ms`);
|
|
6185
6145
|
const filteredCount = countResult?.count || 0;
|
|
6186
6146
|
const idxQuery = sql`
|
|
6187
6147
|
SELECT DISTINCT test_idx
|
|
@@ -6194,7 +6154,7 @@ var Eval = class Eval {
|
|
|
6194
6154
|
const idxStart = Date.now();
|
|
6195
6155
|
const rows = await db.all(idxQuery);
|
|
6196
6156
|
const idxEnd = Date.now();
|
|
6197
|
-
|
|
6157
|
+
logger.debug(`Index query took ${idxEnd - idxStart}ms`);
|
|
6198
6158
|
return {
|
|
6199
6159
|
testIndices: rows.map((row) => row.test_idx),
|
|
6200
6160
|
filteredCount
|
|
@@ -6230,7 +6190,7 @@ var Eval = class Eval {
|
|
|
6230
6190
|
const hasComplexFilters = opts.filters && opts.filters.length > 0;
|
|
6231
6191
|
let queryResult;
|
|
6232
6192
|
if (hasComplexFilters) {
|
|
6233
|
-
|
|
6193
|
+
logger.debug("Using original query for complex filters");
|
|
6234
6194
|
queryResult = await this.queryTestIndices({
|
|
6235
6195
|
offset: opts.offset,
|
|
6236
6196
|
limit: opts.limit,
|
|
@@ -6239,7 +6199,7 @@ var Eval = class Eval {
|
|
|
6239
6199
|
filters: opts.filters
|
|
6240
6200
|
});
|
|
6241
6201
|
} else {
|
|
6242
|
-
|
|
6202
|
+
logger.debug("Using optimized query for table page");
|
|
6243
6203
|
queryResult = await queryTestIndicesOptimized(this.id, {
|
|
6244
6204
|
offset: opts.offset,
|
|
6245
6205
|
limit: opts.limit,
|
|
@@ -6254,12 +6214,12 @@ var Eval = class Eval {
|
|
|
6254
6214
|
const varsStart = Date.now();
|
|
6255
6215
|
const vars = Array.from(this.vars);
|
|
6256
6216
|
const varsEnd = Date.now();
|
|
6257
|
-
|
|
6217
|
+
logger.debug(`Vars query took ${varsEnd - varsStart}ms`);
|
|
6258
6218
|
const body = [];
|
|
6259
6219
|
const bodyStart = Date.now();
|
|
6260
6220
|
if (testIndices.length === 0) {
|
|
6261
6221
|
const bodyEnd = Date.now();
|
|
6262
|
-
|
|
6222
|
+
logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6263
6223
|
return {
|
|
6264
6224
|
head: {
|
|
6265
6225
|
prompts: this.prompts,
|
|
@@ -6291,7 +6251,7 @@ var Eval = class Eval {
|
|
|
6291
6251
|
if (results.length > 0) body.push(convertTestResultsToTableRow(results, vars));
|
|
6292
6252
|
}
|
|
6293
6253
|
const bodyEnd = Date.now();
|
|
6294
|
-
|
|
6254
|
+
logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6295
6255
|
return {
|
|
6296
6256
|
head: {
|
|
6297
6257
|
prompts: this.prompts,
|
|
@@ -6404,7 +6364,7 @@ var Eval = class Eval {
|
|
|
6404
6364
|
})
|
|
6405
6365
|
}));
|
|
6406
6366
|
} catch (error) {
|
|
6407
|
-
|
|
6367
|
+
logger.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
|
|
6408
6368
|
return [];
|
|
6409
6369
|
}
|
|
6410
6370
|
}
|
|
@@ -6441,7 +6401,7 @@ var Eval = class Eval {
|
|
|
6441
6401
|
const newEvalId = createEvalId(/* @__PURE__ */ new Date());
|
|
6442
6402
|
const copyDescription = description || `${this.description || "Evaluation"} (Copy)`;
|
|
6443
6403
|
const testCount = distinctTestCount ?? await this.getResultsCount();
|
|
6444
|
-
|
|
6404
|
+
logger.info("Starting eval copy", {
|
|
6445
6405
|
sourceEvalId: this.id,
|
|
6446
6406
|
targetEvalId: newEvalId,
|
|
6447
6407
|
distinctTestCount: testCount
|
|
@@ -6504,7 +6464,7 @@ var Eval = class Eval {
|
|
|
6504
6464
|
db.insert(evalResultsTable).values(copiedResults).run();
|
|
6505
6465
|
copiedCount += batch.length;
|
|
6506
6466
|
offset += BATCH_SIZE;
|
|
6507
|
-
|
|
6467
|
+
logger.debug("Copied batch of eval results", {
|
|
6508
6468
|
sourceEvalId: this.id,
|
|
6509
6469
|
targetEvalId: newEvalId,
|
|
6510
6470
|
batchSize: batch.length,
|
|
@@ -6513,7 +6473,7 @@ var Eval = class Eval {
|
|
|
6513
6473
|
});
|
|
6514
6474
|
}
|
|
6515
6475
|
});
|
|
6516
|
-
|
|
6476
|
+
logger.info("Eval copy completed successfully", {
|
|
6517
6477
|
sourceEvalId: this.id,
|
|
6518
6478
|
targetEvalId: newEvalId,
|
|
6519
6479
|
rowsCopied: copiedCount,
|
|
@@ -6528,7 +6488,6 @@ var Eval = class Eval {
|
|
|
6528
6488
|
this._shared = shared;
|
|
6529
6489
|
}
|
|
6530
6490
|
};
|
|
6531
|
-
|
|
6532
6491
|
//#endregion
|
|
6533
6492
|
//#region src/assertions/validateAssertions.ts
|
|
6534
6493
|
var AssertValidationError = class extends Error {
|
|
@@ -6580,7 +6539,6 @@ function validateAssertions(tests, defaultTest) {
|
|
|
6580
6539
|
}
|
|
6581
6540
|
}
|
|
6582
6541
|
}
|
|
6583
|
-
|
|
6584
6542
|
//#endregion
|
|
6585
6543
|
//#region src/commands/eval/filterPrompts.ts
|
|
6586
6544
|
/**
|
|
@@ -6606,7 +6564,6 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
6606
6564
|
return promptId && filterRegex.test(promptId) || promptLabel && filterRegex.test(promptLabel);
|
|
6607
6565
|
});
|
|
6608
6566
|
}
|
|
6609
|
-
|
|
6610
6567
|
//#endregion
|
|
6611
6568
|
//#region src/commands/eval/filterProviders.ts
|
|
6612
6569
|
/**
|
|
@@ -6687,7 +6644,6 @@ function filterProviders(providers, filterProvidersOption) {
|
|
|
6687
6644
|
return filterRegex.test(providerId) || providerLabel && filterRegex.test(providerLabel);
|
|
6688
6645
|
});
|
|
6689
6646
|
}
|
|
6690
|
-
|
|
6691
6647
|
//#endregion
|
|
6692
6648
|
//#region src/commands/eval/filterTestsUtil.ts
|
|
6693
6649
|
/**
|
|
@@ -6715,35 +6671,35 @@ function mergeDefaultVars(test, defaultTest) {
|
|
|
6715
6671
|
*/
|
|
6716
6672
|
async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
6717
6673
|
if (!testSuite.tests) {
|
|
6718
|
-
|
|
6674
|
+
logger.debug("[filterTestsByResults] No tests in test suite");
|
|
6719
6675
|
return [];
|
|
6720
6676
|
}
|
|
6721
|
-
|
|
6677
|
+
logger.debug(`[filterTestsByResults] Loading results from: ${pathOrId}`);
|
|
6722
6678
|
let results;
|
|
6723
6679
|
try {
|
|
6724
6680
|
if (pathOrId.endsWith(".json")) results = (await readOutput(pathOrId)).results;
|
|
6725
6681
|
else {
|
|
6726
6682
|
const eval_ = await Eval.findById(pathOrId);
|
|
6727
6683
|
if (!eval_) {
|
|
6728
|
-
|
|
6684
|
+
logger.warn(`[filterTestsByResults] Evaluation not found: ${pathOrId}`);
|
|
6729
6685
|
return [];
|
|
6730
6686
|
}
|
|
6731
6687
|
const summary = await eval_.toEvaluateSummary();
|
|
6732
6688
|
if ("results" in summary) results = { results: summary.results };
|
|
6733
6689
|
else {
|
|
6734
|
-
|
|
6690
|
+
logger.debug("[filterTestsByResults] No results in evaluation summary");
|
|
6735
6691
|
return [];
|
|
6736
6692
|
}
|
|
6737
6693
|
}
|
|
6738
6694
|
} catch (error) {
|
|
6739
|
-
|
|
6695
|
+
logger.warn(`[filterTestsByResults] Error loading results: ${error}`);
|
|
6740
6696
|
return [];
|
|
6741
6697
|
}
|
|
6742
6698
|
const filteredResults = results.results.filter(filterFn);
|
|
6743
|
-
|
|
6699
|
+
logger.debug(`[filterTestsByResults] Found ${filteredResults.length} matching results out of ${results.results.length} total`);
|
|
6744
6700
|
if (filteredResults.length === 0) return [];
|
|
6745
6701
|
const uniqueVarsInResults = new Set(filteredResults.map((r) => JSON.stringify(filterRuntimeVars(r.vars))));
|
|
6746
|
-
|
|
6702
|
+
logger.debug(`[filterTestsByResults] ${uniqueVarsInResults.size} unique test cases (by vars) in filtered results`);
|
|
6747
6703
|
const matchedTests = [];
|
|
6748
6704
|
for (const test of testSuite.tests) {
|
|
6749
6705
|
const testWithDefaults = mergeDefaultVars(test, testSuite.defaultTest);
|
|
@@ -6765,15 +6721,15 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6765
6721
|
...runtimeVars
|
|
6766
6722
|
}
|
|
6767
6723
|
};
|
|
6768
|
-
|
|
6724
|
+
logger.debug("[filterTestsByResults] Restored runtime vars for test", { varKeys: Object.keys(runtimeVars) });
|
|
6769
6725
|
matchedTests.push(testWithRuntimeVars);
|
|
6770
6726
|
} else {
|
|
6771
|
-
|
|
6727
|
+
logger.debug("[filterTestsByResults] Matched test has no runtime vars to restore");
|
|
6772
6728
|
matchedTests.push(test);
|
|
6773
6729
|
}
|
|
6774
6730
|
}
|
|
6775
6731
|
}
|
|
6776
|
-
|
|
6732
|
+
logger.debug(`[filterTestsByResults] Matched ${matchedTests.length} tests out of ${testSuite.tests.length} in test suite`);
|
|
6777
6733
|
const extractedTests = [];
|
|
6778
6734
|
const matchedResultKeys = /* @__PURE__ */ new Set();
|
|
6779
6735
|
for (const result of filteredResults) for (const test of matchedTests) if (resultIsForTestCase(result, mergeDefaultVars(test, testSuite.defaultTest))) {
|
|
@@ -6784,7 +6740,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6784
6740
|
const resultKey = JSON.stringify(filterRuntimeVars(result.vars));
|
|
6785
6741
|
if (matchedResultKeys.has(resultKey)) continue;
|
|
6786
6742
|
if (!result.testCase) {
|
|
6787
|
-
|
|
6743
|
+
logger.debug("[filterTestsByResults] Skipping result without testCase data for extraction");
|
|
6788
6744
|
continue;
|
|
6789
6745
|
}
|
|
6790
6746
|
if (extractedTests.some((t) => JSON.stringify(filterRuntimeVars(t.vars)) === resultKey)) continue;
|
|
@@ -6796,12 +6752,11 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6796
6752
|
options: result.testCase.options
|
|
6797
6753
|
});
|
|
6798
6754
|
}
|
|
6799
|
-
if (extractedTests.length > 0)
|
|
6800
|
-
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0)
|
|
6801
|
-
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size)
|
|
6755
|
+
if (extractedTests.length > 0) logger.info(`[filterTestsByResults] Extracted ${extractedTests.length} runtime-generated test(s) from results`);
|
|
6756
|
+
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) logger.warn(`[filterTestsByResults] No tests matched ${filteredResults.length} filtered results. This may indicate a vars or provider mismatch between stored results and current test suite. Use LOG_LEVEL=debug for detailed matching info.`);
|
|
6757
|
+
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) logger.debug(`[filterTestsByResults] Note: ${uniqueVarsInResults.size - matchedTests.length - extractedTests.length} unique test cases in results did not match any test in the current test suite and could not be extracted. This may indicate results without testCase data.`);
|
|
6802
6758
|
return deduplicateTestCases([...matchedTests, ...extractedTests]);
|
|
6803
6759
|
}
|
|
6804
|
-
|
|
6805
6760
|
//#endregion
|
|
6806
6761
|
//#region src/commands/eval/filterTests.ts
|
|
6807
6762
|
/**
|
|
@@ -6827,7 +6782,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6827
6782
|
* @param reason - Description of what the filter was looking for (e.g., 'no failures/errors')
|
|
6828
6783
|
*/
|
|
6829
6784
|
function logNoTestsWarning(filterType, pathOrId, reason) {
|
|
6830
|
-
|
|
6785
|
+
logger.warn(`--${filterType} returned no tests. The evaluation "${pathOrId}" may have ${reason}, or the test suite may have changed since the evaluation was run.`);
|
|
6831
6786
|
}
|
|
6832
6787
|
/**
|
|
6833
6788
|
* Filters a test suite to only include all tests that did not pass (failures + errors)
|
|
@@ -6873,10 +6828,10 @@ async function filterErrorTests(testSuite, pathOrId) {
|
|
|
6873
6828
|
*/
|
|
6874
6829
|
async function filterTests(testSuite, options) {
|
|
6875
6830
|
let tests = testSuite.tests || [];
|
|
6876
|
-
|
|
6877
|
-
|
|
6831
|
+
logger.debug(`Starting filterTests with options: ${JSON.stringify(options)}`);
|
|
6832
|
+
logger.debug(`Initial test count: ${tests.length}`);
|
|
6878
6833
|
if (Object.keys(options).length === 0) {
|
|
6879
|
-
|
|
6834
|
+
logger.debug("No filter options provided, returning all tests");
|
|
6880
6835
|
return tests;
|
|
6881
6836
|
}
|
|
6882
6837
|
if (options.metadata) {
|
|
@@ -6891,11 +6846,11 @@ async function filterTests(testSuite, options) {
|
|
|
6891
6846
|
value
|
|
6892
6847
|
});
|
|
6893
6848
|
}
|
|
6894
|
-
|
|
6895
|
-
|
|
6849
|
+
logger.debug(`Filtering for metadata conditions (AND logic): ${parsedFilters.map((f) => `${f.key}=${f.value}`).join(", ")}`);
|
|
6850
|
+
logger.debug(`Before metadata filter: ${tests.length} tests`);
|
|
6896
6851
|
tests = tests.filter((test) => {
|
|
6897
6852
|
if (!test.metadata) {
|
|
6898
|
-
|
|
6853
|
+
logger.debug(`Test has no metadata: ${test.description || "unnamed test"}`);
|
|
6899
6854
|
return false;
|
|
6900
6855
|
}
|
|
6901
6856
|
for (const { key, value } of parsedFilters) {
|
|
@@ -6904,16 +6859,16 @@ async function filterTests(testSuite, options) {
|
|
|
6904
6859
|
if (Array.isArray(testValue)) matches = testValue.some((v) => v.toString().includes(value));
|
|
6905
6860
|
else if (testValue !== void 0) matches = testValue.toString().includes(value);
|
|
6906
6861
|
if (!matches) {
|
|
6907
|
-
|
|
6862
|
+
logger.debug(`Test "${test.description || "unnamed test"}" metadata doesn't match. Expected ${key} to include ${value}, got ${JSON.stringify(test.metadata)}`);
|
|
6908
6863
|
return false;
|
|
6909
6864
|
}
|
|
6910
6865
|
}
|
|
6911
6866
|
return true;
|
|
6912
6867
|
});
|
|
6913
|
-
|
|
6868
|
+
logger.debug(`After metadata filter: ${tests.length} tests remain`);
|
|
6914
6869
|
}
|
|
6915
6870
|
if (options.failingOnly && options.errorsOnly) {
|
|
6916
|
-
|
|
6871
|
+
logger.debug("Using both --filter-failing-only and --filter-errors-only together (equivalent to --filter-failing)");
|
|
6917
6872
|
const failingOnlyTests = await filterFailingOnlyTests(testSuite, options.failingOnly);
|
|
6918
6873
|
const errorTests = await filterErrorTests(testSuite, options.errorsOnly);
|
|
6919
6874
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -6923,8 +6878,8 @@ async function filterTests(testSuite, options) {
|
|
|
6923
6878
|
seen.add(key);
|
|
6924
6879
|
return true;
|
|
6925
6880
|
});
|
|
6926
|
-
|
|
6927
|
-
if (tests.length === 0)
|
|
6881
|
+
logger.debug(`Combined failingOnly (${failingOnlyTests.length}) and errors (${errorTests.length}) filters: ${tests.length} unique tests`);
|
|
6882
|
+
if (tests.length === 0) logger.warn("Combined --filter-failing-only and --filter-errors-only returned no tests. The specified evaluations may have no failures or errors, or the test suite may have changed.");
|
|
6928
6883
|
} else if (options.failing) {
|
|
6929
6884
|
tests = await filterFailingTests(testSuite, options.failing);
|
|
6930
6885
|
if (tests.length === 0) logNoTestsWarning("filter-failing", options.failing, "no failures/errors");
|
|
@@ -6961,7 +6916,6 @@ async function filterTests(testSuite, options) {
|
|
|
6961
6916
|
}
|
|
6962
6917
|
return tests;
|
|
6963
6918
|
}
|
|
6964
|
-
|
|
6965
6919
|
//#endregion
|
|
6966
6920
|
//#region src/util/promptfooCommand.ts
|
|
6967
6921
|
/**
|
|
@@ -7007,7 +6961,6 @@ function promptfooCommand(subcommand) {
|
|
|
7007
6961
|
if (detectInstaller() === "npx") return subcommand ? `npx promptfoo@latest ${subcommand}` : "npx promptfoo@latest";
|
|
7008
6962
|
return subcommand ? `promptfoo ${subcommand}` : "promptfoo";
|
|
7009
6963
|
}
|
|
7010
|
-
|
|
7011
6964
|
//#endregion
|
|
7012
6965
|
//#region src/csv.ts
|
|
7013
6966
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = .8;
|
|
@@ -7099,7 +7052,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7099
7052
|
if (!key.startsWith("__") && specialKeys.some((k) => key.startsWith(k)) && !uniqueErrorMessages.has(key)) {
|
|
7100
7053
|
const error = `You used a single underscore for the key "${key}". Did you mean to use "${key.replace("_", "__")}" instead?`;
|
|
7101
7054
|
uniqueErrorMessages.add(key);
|
|
7102
|
-
|
|
7055
|
+
logger.warn(error);
|
|
7103
7056
|
}
|
|
7104
7057
|
if (key.startsWith("__expected")) {
|
|
7105
7058
|
if (value.trim() !== "") asserts.push(assertionFromString(value.trim()));
|
|
@@ -7117,10 +7070,10 @@ function testCaseFromCsvRow(row) {
|
|
|
7117
7070
|
} else if (value.trim() !== "") metadata[metadataKey] = value;
|
|
7118
7071
|
} else if (key === "__metadata" && !uniqueErrorMessages.has(key)) {
|
|
7119
7072
|
uniqueErrorMessages.add(key);
|
|
7120
|
-
|
|
7073
|
+
logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7121
7074
|
} else if (key.startsWith("__config:")) {
|
|
7122
7075
|
const configParts = key.slice(9).split(":");
|
|
7123
|
-
if (configParts.length !== 2)
|
|
7076
|
+
if (configParts.length !== 2) logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7124
7077
|
else {
|
|
7125
7078
|
const [expectedKey, configKey] = configParts;
|
|
7126
7079
|
let targetIndex;
|
|
@@ -7130,11 +7083,11 @@ function testCaseFromCsvRow(row) {
|
|
|
7130
7083
|
if (indexMatch) targetIndex = Number.parseInt(indexMatch[1], 10) - 1;
|
|
7131
7084
|
}
|
|
7132
7085
|
if (targetIndex === void 0) {
|
|
7133
|
-
|
|
7086
|
+
logger.error(`Invalid expected key "${expectedKey}" in __config column "${key}". Must be __expected or __expected<N> where N is a positive integer.`);
|
|
7134
7087
|
throw new Error(`Invalid expected key "${expectedKey}" in __config column`);
|
|
7135
7088
|
}
|
|
7136
7089
|
if (!["threshold"].includes(configKey)) {
|
|
7137
|
-
|
|
7090
|
+
logger.error(`Invalid config key "${configKey}" in __config column "${key}". Valid config keys include: threshold`);
|
|
7138
7091
|
throw new Error(`Invalid config key "${configKey}" in __config column`);
|
|
7139
7092
|
}
|
|
7140
7093
|
if (!assertionConfigs[targetIndex]) assertionConfigs[targetIndex] = {};
|
|
@@ -7142,7 +7095,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7142
7095
|
if (configKey === "threshold") {
|
|
7143
7096
|
parsedValue = Number.parseFloat(value);
|
|
7144
7097
|
if (!Number.isFinite(parsedValue)) {
|
|
7145
|
-
|
|
7098
|
+
logger.error(`Invalid numeric value "${value}" for config key "${configKey}" in column "${key}"`);
|
|
7146
7099
|
throw new Error(`Invalid numeric value for ${configKey}`);
|
|
7147
7100
|
}
|
|
7148
7101
|
}
|
|
@@ -7169,7 +7122,6 @@ function testCaseFromCsvRow(row) {
|
|
|
7169
7122
|
...Object.keys(metadata).length > 0 ? { metadata } : {}
|
|
7170
7123
|
};
|
|
7171
7124
|
}
|
|
7172
|
-
|
|
7173
7125
|
//#endregion
|
|
7174
7126
|
//#region src/microsoftSharepoint.ts
|
|
7175
7127
|
let cca = null;
|
|
@@ -7189,7 +7141,7 @@ async function fetchCsvFromSharepoint(url) {
|
|
|
7189
7141
|
const fileRelativeUrl = url.startsWith(normalizedBaseUrl) ? url.slice(normalizedBaseUrl.length) : url;
|
|
7190
7142
|
const serverRelativeUrl = fileRelativeUrl.startsWith("/") ? fileRelativeUrl : `/${fileRelativeUrl}`;
|
|
7191
7143
|
const apiUrl = `${normalizedBaseUrl}/_api/web/GetFileByServerRelativeUrl('${encodeURI(serverRelativeUrl)}')/$value`;
|
|
7192
|
-
|
|
7144
|
+
logger.debug(`Fetching CSV from SharePoint: ${apiUrl}`);
|
|
7193
7145
|
const response = await fetchWithProxy(apiUrl, { headers: {
|
|
7194
7146
|
Authorization: `Bearer ${accessToken}`,
|
|
7195
7147
|
Accept: "text/csv"
|
|
@@ -7246,7 +7198,6 @@ async function getSharePointAccessToken() {
|
|
|
7246
7198
|
if (!tokenResult?.accessToken) throw new Error("Failed to acquire SharePoint access token. Please check your authentication configuration.");
|
|
7247
7199
|
return tokenResult.accessToken;
|
|
7248
7200
|
}
|
|
7249
|
-
|
|
7250
7201
|
//#endregion
|
|
7251
7202
|
//#region src/util/xlsx.ts
|
|
7252
7203
|
async function parseXlsxFile(filePath) {
|
|
@@ -7306,7 +7257,6 @@ async function parseXlsxFile(filePath) {
|
|
|
7306
7257
|
throw new Error(`Failed to parse Excel file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
7307
7258
|
}
|
|
7308
7259
|
}
|
|
7309
|
-
|
|
7310
7260
|
//#endregion
|
|
7311
7261
|
//#region src/util/testCaseReader.ts
|
|
7312
7262
|
async function readTestFiles(pathOrGlobs, basePath = "") {
|
|
@@ -7352,29 +7302,29 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7352
7302
|
const fileExtension = parse(pathWithoutFunction).ext.slice(1);
|
|
7353
7303
|
const extensionWithoutSheet = fileExtension.split("#")[0];
|
|
7354
7304
|
if (varsPath.startsWith("huggingface://datasets/")) {
|
|
7355
|
-
|
|
7305
|
+
telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7356
7306
|
return await fetchHuggingFaceDataset(varsPath);
|
|
7357
7307
|
}
|
|
7358
7308
|
if (isJavascriptFile(pathWithoutFunction)) {
|
|
7359
|
-
|
|
7309
|
+
telemetry.record("feature_used", { feature: "js tests file" });
|
|
7360
7310
|
const mod = await importModule(pathWithoutFunction, maybeFunctionName);
|
|
7361
7311
|
return typeof mod === "function" ? await mod(finalConfig) : mod;
|
|
7362
7312
|
}
|
|
7363
7313
|
if (fileExtension === "py") {
|
|
7364
|
-
|
|
7314
|
+
telemetry.record("feature_used", { feature: "python tests file" });
|
|
7365
7315
|
const result = await runPython(pathWithoutFunction, maybeFunctionName ?? "generate_tests", finalConfig === void 0 ? [] : [finalConfig]);
|
|
7366
7316
|
if (!Array.isArray(result)) throw new Error(`Python test function must return a list of test cases, got ${typeof result}`);
|
|
7367
7317
|
return result;
|
|
7368
7318
|
}
|
|
7369
7319
|
let rows = [];
|
|
7370
7320
|
if (varsPath.startsWith("https://docs.google.com/spreadsheets/")) {
|
|
7371
|
-
|
|
7321
|
+
telemetry.record("feature_used", { feature: "csv tests file - google sheet" });
|
|
7372
7322
|
rows = await fetchCsvFromGoogleSheet(varsPath);
|
|
7373
7323
|
} else if (/https:\/\/[^/]+\.sharepoint\.com\//i.test(varsPath)) {
|
|
7374
|
-
|
|
7324
|
+
telemetry.record("feature_used", { feature: "csv tests file - sharepoint" });
|
|
7375
7325
|
rows = await fetchCsvFromSharepoint(varsPath);
|
|
7376
7326
|
} else if (fileExtension === "csv") {
|
|
7377
|
-
|
|
7327
|
+
telemetry.record("feature_used", { feature: "csv tests file - local" });
|
|
7378
7328
|
const delimiter = getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
7379
7329
|
const fileContent = await fsPromises.readFile(resolvedVarsPath, "utf-8");
|
|
7380
7330
|
const enforceStrict = getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
@@ -7406,10 +7356,10 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7406
7356
|
throw e;
|
|
7407
7357
|
}
|
|
7408
7358
|
} else if (extensionWithoutSheet === "xlsx" || extensionWithoutSheet === "xls") {
|
|
7409
|
-
|
|
7359
|
+
telemetry.record("feature_used", { feature: "xlsx tests file - local" });
|
|
7410
7360
|
rows = await parseXlsxFile(resolvedVarsPath);
|
|
7411
7361
|
} else if (fileExtension === "json") {
|
|
7412
|
-
|
|
7362
|
+
telemetry.record("feature_used", { feature: "json tests file" });
|
|
7413
7363
|
const fileContent = await fsPromises.readFile(resolvedVarsPath, "utf-8");
|
|
7414
7364
|
const jsonData = yaml.load(fileContent);
|
|
7415
7365
|
return (Array.isArray(jsonData) ? jsonData : [jsonData]).map((item, idx) => ({
|
|
@@ -7417,7 +7367,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7417
7367
|
description: item.description || `Row #${idx + 1}`
|
|
7418
7368
|
}));
|
|
7419
7369
|
} else if (fileExtension === "jsonl") {
|
|
7420
|
-
|
|
7370
|
+
telemetry.record("feature_used", { feature: "jsonl tests file" });
|
|
7421
7371
|
return (await fsPromises.readFile(resolvedVarsPath, "utf-8")).split("\n").filter((line) => line.trim()).map((line, idx) => {
|
|
7422
7372
|
return {
|
|
7423
7373
|
...JSON.parse(line),
|
|
@@ -7425,7 +7375,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7425
7375
|
};
|
|
7426
7376
|
});
|
|
7427
7377
|
} else if (fileExtension === "yaml" || fileExtension === "yml") {
|
|
7428
|
-
|
|
7378
|
+
telemetry.record("feature_used", { feature: "yaml tests file" });
|
|
7429
7379
|
rows = maybeLoadConfigFromExternalFile(yaml.load(await fsPromises.readFile(resolvedVarsPath, "utf-8")));
|
|
7430
7380
|
}
|
|
7431
7381
|
return rows.map((row, idx) => {
|
|
@@ -7469,7 +7419,7 @@ async function readTest(test, basePath = "", isDefaultTest = false) {
|
|
|
7469
7419
|
*/
|
|
7470
7420
|
async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
7471
7421
|
if (loadTestsGlob.startsWith("huggingface://datasets/")) {
|
|
7472
|
-
|
|
7422
|
+
telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7473
7423
|
return await fetchHuggingFaceDataset(loadTestsGlob);
|
|
7474
7424
|
}
|
|
7475
7425
|
if (loadTestsGlob.startsWith("file://")) loadTestsGlob = loadTestsGlob.slice(7);
|
|
@@ -7480,12 +7430,12 @@ async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
|
7480
7430
|
if ((isJavascriptFile(pathWithoutFunction) || pathWithoutFunction.endsWith(".py")) && !testFiles.some((file) => file === resolvedPath || file === pathWithoutFunction)) testFiles.push(resolvedPath);
|
|
7481
7431
|
if (loadTestsGlob.startsWith("https://docs.google.com/spreadsheets/")) testFiles.push(loadTestsGlob);
|
|
7482
7432
|
const _deref = async (testCases, file) => {
|
|
7483
|
-
|
|
7433
|
+
logger.debug(`Dereferencing test file: ${file}`);
|
|
7484
7434
|
return await $RefParser.dereference(testCases);
|
|
7485
7435
|
};
|
|
7486
7436
|
const ret = [];
|
|
7487
7437
|
if (testFiles.length < 1) {
|
|
7488
|
-
|
|
7438
|
+
logger.error(`No test files found for path: ${loadTestsGlob}`);
|
|
7489
7439
|
return ret;
|
|
7490
7440
|
}
|
|
7491
7441
|
for (const testFile of testFiles) {
|
|
@@ -7525,14 +7475,14 @@ async function readTests(tests, basePath = "") {
|
|
|
7525
7475
|
else ret.push(...await loadTestsFromGlob(globOrTest, basePath));
|
|
7526
7476
|
} else if ("path" in globOrTest) ret.push(...await readStandaloneTestsFile(globOrTest.path, basePath, globOrTest.config));
|
|
7527
7477
|
else ret.push(await readTest(globOrTest, basePath));
|
|
7528
|
-
else if (tests !== void 0 && tests !== null)
|
|
7478
|
+
else if (tests !== void 0 && tests !== null) logger.warn(dedent`
|
|
7529
7479
|
Warning: Unsupported 'tests' format in promptfooconfig.yaml.
|
|
7530
7480
|
Expected: string, string[], or TestCase[], but received: ${typeof tests}
|
|
7531
7481
|
|
|
7532
7482
|
Please check your configuration file and ensure the 'tests' field is correctly formatted.
|
|
7533
7483
|
For more information, visit: https://promptfoo.dev/docs/configuration/reference/#test-case
|
|
7534
7484
|
`);
|
|
7535
|
-
if (ret.some((testCase) => testCase.vars?.assert) && !getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING"))
|
|
7485
|
+
if (ret.some((testCase) => testCase.vars?.assert) && !getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) logger.warn(dedent`
|
|
7536
7486
|
Warning: Found 'assert' key in vars. This is likely a mistake in your configuration.
|
|
7537
7487
|
|
|
7538
7488
|
'assert' should be *unindented* so it is under the test itself, not vars. For example:
|
|
@@ -7548,7 +7498,6 @@ async function readTests(tests, basePath = "") {
|
|
|
7548
7498
|
`);
|
|
7549
7499
|
return ret;
|
|
7550
7500
|
}
|
|
7551
|
-
|
|
7552
7501
|
//#endregion
|
|
7553
7502
|
//#region src/util/validateTestPromptReferences.ts
|
|
7554
7503
|
var PromptReferenceValidationError = class extends Error {
|
|
@@ -7591,7 +7540,6 @@ function validateTestPromptReferences(tests, prompts, defaultTest) {
|
|
|
7591
7540
|
}
|
|
7592
7541
|
}
|
|
7593
7542
|
}
|
|
7594
|
-
|
|
7595
7543
|
//#endregion
|
|
7596
7544
|
//#region src/util/validateTestProviderReferences.ts
|
|
7597
7545
|
var ProviderReferenceValidationError = class extends Error {
|
|
@@ -7637,7 +7585,6 @@ function validateTestProviderReferences(tests, providers, defaultTest, scenarios
|
|
|
7637
7585
|
});
|
|
7638
7586
|
});
|
|
7639
7587
|
}
|
|
7640
|
-
|
|
7641
7588
|
//#endregion
|
|
7642
7589
|
//#region src/util/config/extensions.ts
|
|
7643
7590
|
/**
|
|
@@ -7655,7 +7602,6 @@ const DEFAULT_CONFIG_EXTENSIONS = [
|
|
|
7655
7602
|
"mts",
|
|
7656
7603
|
"ts"
|
|
7657
7604
|
];
|
|
7658
|
-
|
|
7659
7605
|
//#endregion
|
|
7660
7606
|
//#region src/util/config/load.ts
|
|
7661
7607
|
/**
|
|
@@ -7778,34 +7724,34 @@ async function readConfig(configPath) {
|
|
|
7778
7724
|
const hasProviders = data.providers !== void 0;
|
|
7779
7725
|
return hasTargets && !hasProviders || !hasTargets && hasProviders;
|
|
7780
7726
|
}, { message: "Exactly one of 'targets' or 'providers' must be provided, but not both" }).safeParse(renderedConfig);
|
|
7781
|
-
if (!validationResult.success)
|
|
7727
|
+
if (!validationResult.success) logger.warn(`Invalid configuration file ${configPath}:\n${z.prettifyError(validationResult.error)}`);
|
|
7782
7728
|
ret = renderedConfig;
|
|
7783
7729
|
} else if (isJavascriptFile(configPath)) {
|
|
7784
7730
|
const renderedConfig = renderConfigEnvTemplates(await importModule(configPath));
|
|
7785
7731
|
const validationResult = UnifiedConfigSchema.safeParse(renderedConfig);
|
|
7786
|
-
if (!validationResult.success)
|
|
7732
|
+
if (!validationResult.success) logger.warn(`Invalid configuration file ${configPath}:\n${z.prettifyError(validationResult.error)}`);
|
|
7787
7733
|
ret = renderedConfig;
|
|
7788
7734
|
} else throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
7789
7735
|
if (ret.targets) {
|
|
7790
|
-
|
|
7736
|
+
logger.debug(`Rewriting config.targets to config.providers`);
|
|
7791
7737
|
ret.providers = ret.targets;
|
|
7792
7738
|
delete ret.targets;
|
|
7793
7739
|
}
|
|
7794
7740
|
if (ret.plugins) {
|
|
7795
|
-
|
|
7741
|
+
logger.debug(`Rewriting config.plugins to config.redteam.plugins`);
|
|
7796
7742
|
ret.redteam = ret.redteam || {};
|
|
7797
7743
|
ret.redteam.plugins = ret.plugins;
|
|
7798
7744
|
delete ret.plugins;
|
|
7799
7745
|
}
|
|
7800
7746
|
if (ret.strategies) {
|
|
7801
|
-
|
|
7747
|
+
logger.debug(`Rewriting config.strategies to config.redteam.strategies`);
|
|
7802
7748
|
ret.redteam = ret.redteam || {};
|
|
7803
7749
|
ret.redteam.strategies = ret.strategies;
|
|
7804
7750
|
delete ret.strategies;
|
|
7805
7751
|
}
|
|
7806
7752
|
if (!ret.prompts) {
|
|
7807
|
-
|
|
7808
|
-
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt"))))
|
|
7753
|
+
logger.debug(`Setting default prompt because there is no \`prompts\` field`);
|
|
7754
|
+
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) logger.warn(`Warning: Expected top-level "prompts" property in config or a test variable named "prompt"`);
|
|
7809
7755
|
ret.prompts = ["{{prompt}}"];
|
|
7810
7756
|
}
|
|
7811
7757
|
return ret;
|
|
@@ -8003,9 +7949,9 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8003
7949
|
defaultConfig = {};
|
|
8004
7950
|
}
|
|
8005
7951
|
if (cmdObj.assertions) {
|
|
8006
|
-
|
|
7952
|
+
telemetry.record("feature_used", { feature: "standalone assertions mode" });
|
|
8007
7953
|
if (!cmdObj.modelOutputs) {
|
|
8008
|
-
|
|
7954
|
+
logger.error("You must provide --model-outputs when using --assertions");
|
|
8009
7955
|
process$1.exit(1);
|
|
8010
7956
|
}
|
|
8011
7957
|
const modelOutputs = JSON.parse(fs$1.readFileSync(path$2.join(process$1.cwd(), cmdObj.modelOutputs), "utf8"));
|
|
@@ -8027,14 +7973,14 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8027
7973
|
});
|
|
8028
7974
|
}
|
|
8029
7975
|
const basePath = configPaths ? path$2.dirname(configPaths[0]) : "";
|
|
8030
|
-
|
|
7976
|
+
state.basePath = basePath;
|
|
8031
7977
|
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
|
|
8032
7978
|
let processedDefaultTest;
|
|
8033
7979
|
if (typeof defaultTestRaw === "string" && defaultTestRaw.startsWith("file://")) {
|
|
8034
|
-
const originalBasePath =
|
|
8035
|
-
|
|
7980
|
+
const originalBasePath = state.basePath;
|
|
7981
|
+
state.basePath = basePath;
|
|
8036
7982
|
const loaded = await maybeLoadFromExternalFile(defaultTestRaw);
|
|
8037
|
-
|
|
7983
|
+
state.basePath = originalBasePath;
|
|
8038
7984
|
processedDefaultTest = loaded;
|
|
8039
7985
|
} else if (defaultTestRaw) processedDefaultTest = defaultTestRaw;
|
|
8040
7986
|
const config = {
|
|
@@ -8059,7 +8005,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8059
8005
|
const hasProviders = cmdObj.providers && cmdObj.providers.length > 0 || [config.providers].flat().filter(Boolean).length > 0;
|
|
8060
8006
|
if (!Boolean(configPaths) && !hasPrompts && !hasProviders && !isCI()) {
|
|
8061
8007
|
const extList = DEFAULT_CONFIG_EXTENSIONS.join(", ");
|
|
8062
|
-
|
|
8008
|
+
logger.warn(dedent`
|
|
8063
8009
|
${chalk.yellow.bold("⚠️ No promptfooconfig found")}
|
|
8064
8010
|
|
|
8065
8011
|
${chalk.white(`Searched in ${chalk.bold(process$1.cwd())} for promptfooconfig.{${extList}}`)}
|
|
@@ -8075,11 +8021,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8075
8021
|
process$1.exit(1);
|
|
8076
8022
|
}
|
|
8077
8023
|
if (!hasPrompts) {
|
|
8078
|
-
|
|
8024
|
+
logger.error("You must provide at least 1 prompt");
|
|
8079
8025
|
process$1.exit(1);
|
|
8080
8026
|
}
|
|
8081
8027
|
if (type !== "DatasetGeneration" && type !== "AssertionGeneration" && !hasProviders) {
|
|
8082
|
-
|
|
8028
|
+
logger.error("You must specify at least 1 provider (for example, openai:gpt-4.1)");
|
|
8083
8029
|
process$1.exit(1);
|
|
8084
8030
|
}
|
|
8085
8031
|
invariant(Array.isArray(config.providers), "providers must be an array");
|
|
@@ -8087,11 +8033,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8087
8033
|
const cliFilteredProviderConfigs = (cmdObj.providers ? resolveCliProvidersWithConfig(cmdObj.providers, resolvedProviderConfigs) : resolvedProviderConfigs) ?? [];
|
|
8088
8034
|
const filterOption = cmdObj.filterProviders || cmdObj.filterTargets;
|
|
8089
8035
|
const filteredProviderConfigs = filterProviderConfigs(cliFilteredProviderConfigs, filterOption);
|
|
8090
|
-
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0)
|
|
8036
|
+
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) logger.warn(`No providers matched the filter "${filterOption}". Check your --filter-providers/--filter-targets value.`);
|
|
8091
8037
|
let parsedPrompts = await readPrompts(config.prompts, cmdObj.prompts ? void 0 : basePath);
|
|
8092
8038
|
if (cmdObj.filterPrompts) {
|
|
8093
8039
|
parsedPrompts = filterPrompts(parsedPrompts, cmdObj.filterPrompts);
|
|
8094
|
-
if (parsedPrompts.length === 0)
|
|
8040
|
+
if (parsedPrompts.length === 0) logger.warn(`No prompts matched the filter "${cmdObj.filterPrompts}". Check your --filter-prompts value.`);
|
|
8095
8041
|
}
|
|
8096
8042
|
const parsedProviders = await loadApiProviders(filteredProviderConfigs, {
|
|
8097
8043
|
env: config.env,
|
|
@@ -8122,7 +8068,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8122
8068
|
}
|
|
8123
8069
|
const parsedProviderPromptMap = readProviderPromptMap({ providers: filteredProviderConfigs }, parsedPrompts);
|
|
8124
8070
|
if (parsedPrompts.length === 0) {
|
|
8125
|
-
|
|
8071
|
+
logger.error("No prompts found. Add a `prompts:` entry to your config or pass --prompts path/to/prompt.txt.");
|
|
8126
8072
|
process$1.exit(1);
|
|
8127
8073
|
}
|
|
8128
8074
|
const defaultTest = {
|
|
@@ -8152,7 +8098,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8152
8098
|
validateAssertions(testSuite.tests || [], typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8153
8099
|
validateTestProviderReferences(testSuite.tests || [], testSuite.providers, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0, testSuite.scenarios);
|
|
8154
8100
|
validateTestPromptReferences(testSuite.tests || [], testSuite.prompts, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8155
|
-
|
|
8101
|
+
state.config = config;
|
|
8156
8102
|
let commandLineOptions = fileConfig.commandLineOptions || defaultConfig.commandLineOptions;
|
|
8157
8103
|
if (commandLineOptions?.envPath && basePath) {
|
|
8158
8104
|
const resolvedPaths = (Array.isArray(commandLineOptions.envPath) ? commandLineOptions.envPath : [commandLineOptions.envPath]).map((p) => path$2.isAbsolute(p) ? p : path$2.resolve(basePath, p));
|
|
@@ -8168,7 +8114,6 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8168
8114
|
commandLineOptions
|
|
8169
8115
|
};
|
|
8170
8116
|
}
|
|
8171
|
-
|
|
8172
8117
|
//#endregion
|
|
8173
8118
|
//#region src/util/config/writer.ts
|
|
8174
8119
|
function writePromptfooConfig(config, outputPath, headerComments) {
|
|
@@ -8184,7 +8129,7 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8184
8129
|
]);
|
|
8185
8130
|
const yamlContent = yaml.dump(orderedConfig, { skipInvalid: true });
|
|
8186
8131
|
if (!yamlContent) {
|
|
8187
|
-
|
|
8132
|
+
logger.warn("Warning: config is empty, skipping write");
|
|
8188
8133
|
return orderedConfig;
|
|
8189
8134
|
}
|
|
8190
8135
|
const schemaComment = `# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json`;
|
|
@@ -8192,7 +8137,6 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8192
8137
|
fs.writeFileSync(outputPath, `${schemaComment}\n${headerCommentLines}${yamlContent}`);
|
|
8193
8138
|
return orderedConfig;
|
|
8194
8139
|
}
|
|
8195
|
-
|
|
8196
8140
|
//#endregion
|
|
8197
8141
|
//#region src/util/redteamProbeLimit.ts
|
|
8198
8142
|
const MONTHLY_PROBE_LIMIT = 1e5;
|
|
@@ -8242,7 +8186,6 @@ function checkRedteamProbeLimit() {
|
|
|
8242
8186
|
remaining
|
|
8243
8187
|
};
|
|
8244
8188
|
}
|
|
8245
|
-
|
|
8246
8189
|
//#endregion
|
|
8247
8190
|
//#region src/redteam/extraction/mcpTools.ts
|
|
8248
8191
|
/**
|
|
@@ -8278,11 +8221,10 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8278
8221
|
for (const tool of tools) toolsInfo.push(JSON.stringify(tool));
|
|
8279
8222
|
}
|
|
8280
8223
|
} catch (error) {
|
|
8281
|
-
|
|
8224
|
+
logger.warn(`Failed to get tools from MCP provider: ${error instanceof Error ? error.message : String(error)}`);
|
|
8282
8225
|
}
|
|
8283
8226
|
return toolsInfo.join("\n");
|
|
8284
8227
|
}
|
|
8285
|
-
|
|
8286
8228
|
//#endregion
|
|
8287
8229
|
//#region src/util/apiHealth.ts
|
|
8288
8230
|
/**
|
|
@@ -8291,7 +8233,7 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8291
8233
|
* @returns A promise that resolves to the health check response.
|
|
8292
8234
|
*/
|
|
8293
8235
|
async function checkRemoteHealth(url) {
|
|
8294
|
-
|
|
8236
|
+
logger.debug(`[CheckRemoteHealth] Checking API health: ${JSON.stringify({
|
|
8295
8237
|
url,
|
|
8296
8238
|
env: {
|
|
8297
8239
|
httpProxy: getEnvString("HTTP_PROXY") || getEnvString("http_proxy"),
|
|
@@ -8306,7 +8248,7 @@ async function checkRemoteHealth(url) {
|
|
|
8306
8248
|
const cloudConfig = new CloudConfig();
|
|
8307
8249
|
const response = await fetchWithTimeout(url, { headers: { "Content-Type": "application/json" } }, 5e3);
|
|
8308
8250
|
if (!response.ok) {
|
|
8309
|
-
|
|
8251
|
+
logger.debug(`[CheckRemoteHealth] API health check failed with non-OK response: ${JSON.stringify({
|
|
8310
8252
|
status: response.status,
|
|
8311
8253
|
statusText: response.statusText,
|
|
8312
8254
|
url
|
|
@@ -8346,7 +8288,7 @@ async function checkRemoteHealth(url) {
|
|
|
8346
8288
|
};
|
|
8347
8289
|
const cause = "cause" in error ? ` (Cause: ${error.cause})` : "";
|
|
8348
8290
|
const code = "code" in error ? ` [${error["code"]}]` : "";
|
|
8349
|
-
|
|
8291
|
+
logger.debug(`[CheckRemoteHealth] API health check failed: ${JSON.stringify({
|
|
8350
8292
|
error: error.message,
|
|
8351
8293
|
url
|
|
8352
8294
|
})}`);
|
|
@@ -8356,7 +8298,6 @@ async function checkRemoteHealth(url) {
|
|
|
8356
8298
|
};
|
|
8357
8299
|
}
|
|
8358
8300
|
}
|
|
8359
|
-
|
|
8360
8301
|
//#endregion
|
|
8361
8302
|
//#region src/redteam/extraction/util.ts
|
|
8362
8303
|
const RedTeamGenerationResponse = z.object({
|
|
@@ -8393,7 +8334,7 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
8393
8334
|
}, REQUEST_TIMEOUT_MS, "json");
|
|
8394
8335
|
return RedTeamGenerationResponse.parse(response.data).result;
|
|
8395
8336
|
} catch (error) {
|
|
8396
|
-
|
|
8337
|
+
logger.warn(`Error using remote generation for task '${task}': ${error}`);
|
|
8397
8338
|
throw error;
|
|
8398
8339
|
}
|
|
8399
8340
|
}
|
|
@@ -8403,11 +8344,11 @@ async function callExtraction(provider, prompt, processOutput) {
|
|
|
8403
8344
|
content: prompt
|
|
8404
8345
|
}]));
|
|
8405
8346
|
if (error) {
|
|
8406
|
-
|
|
8347
|
+
logger.error(`Error in extraction: ${error}`);
|
|
8407
8348
|
throw new Error(`Failed to perform extraction: ${error}`);
|
|
8408
8349
|
}
|
|
8409
8350
|
if (typeof output !== "string") {
|
|
8410
|
-
|
|
8351
|
+
logger.error(`Invalid output from extraction. Got: ${output}`);
|
|
8411
8352
|
throw new Error(`Invalid extraction output: expected string, got: ${output}`);
|
|
8412
8353
|
}
|
|
8413
8354
|
return processOutput(output);
|
|
@@ -8418,14 +8359,13 @@ function formatPrompts(prompts) {
|
|
|
8418
8359
|
${prompt}
|
|
8419
8360
|
</Prompt>`).join("\n");
|
|
8420
8361
|
}
|
|
8421
|
-
|
|
8422
8362
|
//#endregion
|
|
8423
8363
|
//#region src/redteam/extraction/entities.ts
|
|
8424
8364
|
async function extractEntities(provider, prompts) {
|
|
8425
8365
|
if (shouldGenerateRemote()) try {
|
|
8426
8366
|
return await fetchRemoteGeneration("entities", prompts);
|
|
8427
8367
|
} catch (error) {
|
|
8428
|
-
|
|
8368
|
+
logger.warn(`[Entity Extraction] Failed, returning 0 entities. Error using remote generation: ${error}`);
|
|
8429
8369
|
return [];
|
|
8430
8370
|
}
|
|
8431
8371
|
const prompt = dedent`
|
|
@@ -8452,28 +8392,27 @@ async function extractEntities(provider, prompts) {
|
|
|
8452
8392
|
try {
|
|
8453
8393
|
return await callExtraction(provider, prompt, (output) => {
|
|
8454
8394
|
const entities = output.split("\n").filter((line) => line.trim().startsWith("Entity:")).map((line) => line.substring(line.indexOf("Entity:") + 7).trim()).filter((entity) => !/^\{\{\s*[^{}]+\s*\}\}$/.test(entity));
|
|
8455
|
-
if (entities.length === 0)
|
|
8395
|
+
if (entities.length === 0) logger.debug("No entities were extracted from the prompts.");
|
|
8456
8396
|
return entities;
|
|
8457
8397
|
});
|
|
8458
8398
|
} catch (error) {
|
|
8459
|
-
|
|
8399
|
+
logger.warn(`Error using local extraction, returning empty list: ${error}`);
|
|
8460
8400
|
return [];
|
|
8461
8401
|
}
|
|
8462
8402
|
}
|
|
8463
|
-
|
|
8464
8403
|
//#endregion
|
|
8465
8404
|
//#region src/redteam/extraction/purpose.ts
|
|
8466
8405
|
const DEFAULT_PURPOSE = "An AI system";
|
|
8467
8406
|
async function extractSystemPurpose(provider, prompts) {
|
|
8468
8407
|
const onlyTemplatePrompt = prompts.length === 1 && prompts[0] && prompts[0].trim().replace(/\s+/g, "") === "{{prompt}}";
|
|
8469
8408
|
if (prompts.length === 0 || onlyTemplatePrompt) {
|
|
8470
|
-
|
|
8409
|
+
logger.debug("[purpose] No meaningful prompts provided, returning default purpose");
|
|
8471
8410
|
return DEFAULT_PURPOSE;
|
|
8472
8411
|
}
|
|
8473
8412
|
if (!neverGenerateRemote()) try {
|
|
8474
8413
|
return await fetchRemoteGeneration("purpose", prompts);
|
|
8475
8414
|
} catch (error) {
|
|
8476
|
-
|
|
8415
|
+
logger.warn(`[purpose] Error using remote generation, returning empty string: ${error}`);
|
|
8477
8416
|
return "";
|
|
8478
8417
|
}
|
|
8479
8418
|
const prompt = dedent`
|
|
@@ -8494,11 +8433,10 @@ async function extractSystemPurpose(provider, prompts) {
|
|
|
8494
8433
|
return match ? match[1].trim() : output.trim();
|
|
8495
8434
|
});
|
|
8496
8435
|
} catch (error) {
|
|
8497
|
-
|
|
8436
|
+
logger.warn(`[purpose] Error using extracting purpose, returning empty string: ${error}`);
|
|
8498
8437
|
return "";
|
|
8499
8438
|
}
|
|
8500
8439
|
}
|
|
8501
|
-
|
|
8502
8440
|
//#endregion
|
|
8503
8441
|
//#region src/redteam/plugins/custom.ts
|
|
8504
8442
|
const CustomPluginDefinitionSchema = z.strictObject({
|
|
@@ -8509,7 +8447,7 @@ const CustomPluginDefinitionSchema = z.strictObject({
|
|
|
8509
8447
|
id: z.string().optional()
|
|
8510
8448
|
});
|
|
8511
8449
|
function loadCustomPluginDefinition(filePath) {
|
|
8512
|
-
|
|
8450
|
+
logger.debug(`Loading custom plugin from ${filePath}`);
|
|
8513
8451
|
const result = CustomPluginDefinitionSchema.safeParse(maybeLoadFromExternalFile(filePath));
|
|
8514
8452
|
if (!result.success) {
|
|
8515
8453
|
const validationError = z.prettifyError(result.error);
|
|
@@ -8520,7 +8458,7 @@ function loadCustomPluginDefinition(filePath) {
|
|
|
8520
8458
|
|
|
8521
8459
|
Please review your plugin file ${filePath} configuration.`);
|
|
8522
8460
|
}
|
|
8523
|
-
|
|
8461
|
+
logger.debug(`Custom plugin definition: ${JSON.stringify(result.data, null, 2)}`);
|
|
8524
8462
|
return result.data;
|
|
8525
8463
|
}
|
|
8526
8464
|
var CustomPlugin = class extends RedteamPluginBase {
|
|
@@ -8558,7 +8496,6 @@ var CustomPlugin = class extends RedteamPluginBase {
|
|
|
8558
8496
|
}));
|
|
8559
8497
|
}
|
|
8560
8498
|
};
|
|
8561
|
-
|
|
8562
8499
|
//#endregion
|
|
8563
8500
|
//#region src/redteam/plugins/cyberseceval.ts
|
|
8564
8501
|
const PLUGIN_ID$2 = "promptfoo:redteam:cyberseceval";
|
|
@@ -8576,11 +8513,11 @@ async function fetchDataset$2(limit, isMultilingual) {
|
|
|
8576
8513
|
user_input: input.user_input,
|
|
8577
8514
|
judge_question: input.judge_question
|
|
8578
8515
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8579
|
-
|
|
8516
|
+
logger.debug(`[CyberSecEval] Generated ${testCases.length} test cases`);
|
|
8580
8517
|
if (testCases.length === 0) throw new Error(`[CyberSecEval] No test cases generated`);
|
|
8581
8518
|
return testCases;
|
|
8582
8519
|
} catch (error) {
|
|
8583
|
-
|
|
8520
|
+
logger.error(`[CyberSecEval] Error fetching dataset: ${error}`);
|
|
8584
8521
|
return [];
|
|
8585
8522
|
}
|
|
8586
8523
|
}
|
|
@@ -8614,7 +8551,6 @@ var CyberSecEvalPlugin = class extends RedteamPluginBase {
|
|
|
8614
8551
|
}));
|
|
8615
8552
|
}
|
|
8616
8553
|
};
|
|
8617
|
-
|
|
8618
8554
|
//#endregion
|
|
8619
8555
|
//#region src/redteam/plugins/donotanswer.ts
|
|
8620
8556
|
const PLUGIN_ID$1 = "promptfoo:redteam:donotanswer";
|
|
@@ -8633,7 +8569,7 @@ const RUBRIC_PASS_HEADING$1 = `THE RESPONSE SHOULD PASS IF:
|
|
|
8633
8569
|
`;
|
|
8634
8570
|
async function fetchDataset$1(limit) {
|
|
8635
8571
|
try {
|
|
8636
|
-
|
|
8572
|
+
logger.debug(`[DoNotAnswer] Fetching dataset from ${DATASET_URL$1}`);
|
|
8637
8573
|
let csvData;
|
|
8638
8574
|
if (DATASET_URL$1.startsWith("http")) {
|
|
8639
8575
|
const response = await fetchWithTimeout(DATASET_URL$1, {}, REQUEST_TIMEOUT_MS);
|
|
@@ -8644,13 +8580,13 @@ async function fetchDataset$1(limit) {
|
|
|
8644
8580
|
} catch (error) {
|
|
8645
8581
|
throw new Error(`[DoNotAnswer] Error reading local file: ${error}`);
|
|
8646
8582
|
}
|
|
8647
|
-
|
|
8583
|
+
logger.debug(`[DoNotAnswer] Got ${csvData.length} bytes of CSV data`);
|
|
8648
8584
|
const rows = parse$1(csvData, {
|
|
8649
8585
|
columns: true,
|
|
8650
8586
|
skip_empty_lines: true,
|
|
8651
8587
|
trim: true
|
|
8652
8588
|
});
|
|
8653
|
-
|
|
8589
|
+
logger.debug(`[DoNotAnswer] Parsed ${rows.length} rows from CSV`);
|
|
8654
8590
|
if (rows.length === 0) throw new Error(`[DoNotAnswer] No data rows in CSV`);
|
|
8655
8591
|
const testCases = rows.map((row) => ({ vars: {
|
|
8656
8592
|
risk_area: row.risk_area,
|
|
@@ -8658,11 +8594,11 @@ async function fetchDataset$1(limit) {
|
|
|
8658
8594
|
specific_harms: row.specific_harms,
|
|
8659
8595
|
question: row.question
|
|
8660
8596
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8661
|
-
|
|
8597
|
+
logger.debug(`[DoNotAnswer] Generated ${testCases.length} test cases`);
|
|
8662
8598
|
if (testCases.length === 0) throw new Error(`[DoNotAnswer] No test cases generated`);
|
|
8663
8599
|
return testCases;
|
|
8664
8600
|
} catch (error) {
|
|
8665
|
-
|
|
8601
|
+
logger.error(`[DoNotAnswer] Error fetching dataset: ${error}`);
|
|
8666
8602
|
return [];
|
|
8667
8603
|
}
|
|
8668
8604
|
}
|
|
@@ -8717,7 +8653,6 @@ ${RUBRIC_PASS_HEADING$1}
|
|
|
8717
8653
|
}));
|
|
8718
8654
|
}
|
|
8719
8655
|
};
|
|
8720
|
-
|
|
8721
8656
|
//#endregion
|
|
8722
8657
|
//#region src/redteam/plugins/harmful/common.ts
|
|
8723
8658
|
function getHarmfulAssertions(harmCategory) {
|
|
@@ -8751,7 +8686,6 @@ function createTestCase(injectVar, output, harmCategory) {
|
|
|
8751
8686
|
assert: getHarmfulAssertions(harmCategory)
|
|
8752
8687
|
};
|
|
8753
8688
|
}
|
|
8754
|
-
|
|
8755
8689
|
//#endregion
|
|
8756
8690
|
//#region src/redteam/plugins/harmful/constants.ts
|
|
8757
8691
|
const REDTEAM_MODEL_CATEGORIES = [{
|
|
@@ -8851,7 +8785,6 @@ const REDTEAM_MODEL_CATEGORIES = [{
|
|
|
8851
8785
|
{{outputFormat}}
|
|
8852
8786
|
`
|
|
8853
8787
|
}];
|
|
8854
|
-
|
|
8855
8788
|
//#endregion
|
|
8856
8789
|
//#region src/redteam/plugins/harmful/aligned.ts
|
|
8857
8790
|
var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
@@ -8892,7 +8825,6 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
|
8892
8825
|
});
|
|
8893
8826
|
}
|
|
8894
8827
|
};
|
|
8895
|
-
|
|
8896
8828
|
//#endregion
|
|
8897
8829
|
//#region src/redteam/plugins/harmful/unaligned.ts
|
|
8898
8830
|
/**
|
|
@@ -8908,7 +8840,7 @@ function processPromptForInputs(prompt, _injectVar, inputs) {
|
|
|
8908
8840
|
const parsed = JSON.parse(processedPrompt);
|
|
8909
8841
|
Object.assign(additionalVars, extractVariablesFromJson(parsed, inputs));
|
|
8910
8842
|
} catch {
|
|
8911
|
-
|
|
8843
|
+
logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode");
|
|
8912
8844
|
}
|
|
8913
8845
|
return {
|
|
8914
8846
|
processedPrompt,
|
|
@@ -8943,7 +8875,6 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
8943
8875
|
return testCase;
|
|
8944
8876
|
});
|
|
8945
8877
|
}
|
|
8946
|
-
|
|
8947
8878
|
//#endregion
|
|
8948
8879
|
//#region src/redteam/plugins/xstest.ts
|
|
8949
8880
|
const PLUGIN_ID = "promptfoo:redteam:xstest";
|
|
@@ -8953,7 +8884,7 @@ const RUBRIC_PASS_HEADING = "THE RESPONSE SHOULD PASS IF:";
|
|
|
8953
8884
|
const RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET = "- The response is empty, undefined, or generic";
|
|
8954
8885
|
async function fetchDataset(limit) {
|
|
8955
8886
|
try {
|
|
8956
|
-
|
|
8887
|
+
logger.debug(`[XSTest] Fetching dataset from ${DATASET_URL}`);
|
|
8957
8888
|
let csvData;
|
|
8958
8889
|
if (DATASET_URL.startsWith("http")) {
|
|
8959
8890
|
const response = await fetchWithTimeout(DATASET_URL, {}, REQUEST_TIMEOUT_MS);
|
|
@@ -8964,16 +8895,16 @@ async function fetchDataset(limit) {
|
|
|
8964
8895
|
} catch (error) {
|
|
8965
8896
|
throw new Error(`[XSTest] Error reading local file: ${error}`);
|
|
8966
8897
|
}
|
|
8967
|
-
|
|
8898
|
+
logger.debug(`[XSTest] Got ${csvData.length} bytes of CSV data`);
|
|
8968
8899
|
const rows = parse$1(csvData, {
|
|
8969
8900
|
columns: true,
|
|
8970
8901
|
skip_empty_lines: true,
|
|
8971
8902
|
trim: true
|
|
8972
8903
|
});
|
|
8973
|
-
|
|
8904
|
+
logger.debug(`[XSTest] Parsed ${rows.length} rows from CSV`);
|
|
8974
8905
|
if (rows.length === 0) throw new Error(`[XSTest] No data rows in CSV`);
|
|
8975
8906
|
const validRows = rows.filter((row) => row.prompt && row.prompt.trim() !== "" && row.label && (row.label === "safe" || row.label === "unsafe"));
|
|
8976
|
-
|
|
8907
|
+
logger.debug(`[XSTest] Found ${validRows.length} valid rows with prompts and labels`);
|
|
8977
8908
|
const testCases = validRows.map((row) => ({ vars: {
|
|
8978
8909
|
prompt: row.prompt,
|
|
8979
8910
|
focus: row.focus || "",
|
|
@@ -8981,11 +8912,11 @@ async function fetchDataset(limit) {
|
|
|
8981
8912
|
label: row.label || "",
|
|
8982
8913
|
note: row.note || ""
|
|
8983
8914
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8984
|
-
|
|
8915
|
+
logger.debug(`[XSTest] Generated ${testCases.length} test cases`);
|
|
8985
8916
|
if (testCases.length === 0) throw new Error(`[XSTest] No test cases generated`);
|
|
8986
8917
|
return testCases;
|
|
8987
8918
|
} catch (error) {
|
|
8988
|
-
|
|
8919
|
+
logger.error(`[XSTest] Error fetching dataset: ${error}`);
|
|
8989
8920
|
return [];
|
|
8990
8921
|
}
|
|
8991
8922
|
}
|
|
@@ -9071,7 +9002,6 @@ ${RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET}
|
|
|
9071
9002
|
}));
|
|
9072
9003
|
}
|
|
9073
9004
|
};
|
|
9074
|
-
|
|
9075
9005
|
//#endregion
|
|
9076
9006
|
//#region src/redteam/plugins/index.ts
|
|
9077
9007
|
/**
|
|
@@ -9088,7 +9018,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9088
9018
|
invariant(!getEnvBool("PROMPTFOO_DISABLE_REDTEAM_REMOTE_GENERATION"), "fetchRemoteTestCases should never be called when remote generation is disabled");
|
|
9089
9019
|
const remoteHealth = await checkRemoteHealth(getRemoteHealthUrl());
|
|
9090
9020
|
if (remoteHealth.status !== "OK") {
|
|
9091
|
-
|
|
9021
|
+
logger.error(`Error generating test cases for ${key}: ${remoteHealth.message}`);
|
|
9092
9022
|
return [];
|
|
9093
9023
|
}
|
|
9094
9024
|
const { graderExamples, ...configForRemote } = config ?? {};
|
|
@@ -9109,14 +9039,14 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9109
9039
|
body
|
|
9110
9040
|
}, REQUEST_TIMEOUT_MS);
|
|
9111
9041
|
if (status !== 200 || !data || !data.result || !Array.isArray(data.result)) {
|
|
9112
|
-
|
|
9042
|
+
logger.error(`Error generating test cases for ${key}: ${statusText} ${JSON.stringify(data)}`);
|
|
9113
9043
|
return [];
|
|
9114
9044
|
}
|
|
9115
9045
|
const ret = data.result;
|
|
9116
|
-
|
|
9046
|
+
logger.debug(`Received remote generation for ${key}:\n${JSON.stringify(ret)}`);
|
|
9117
9047
|
return ret;
|
|
9118
9048
|
} catch (err) {
|
|
9119
|
-
|
|
9049
|
+
logger.error(`Error generating test cases for ${key}: ${err}`);
|
|
9120
9050
|
return [];
|
|
9121
9051
|
}
|
|
9122
9052
|
}
|
|
@@ -9126,7 +9056,7 @@ function createPluginFactory(PluginClass, key, validate) {
|
|
|
9126
9056
|
validate,
|
|
9127
9057
|
action: async ({ provider, purpose, injectVar, n, delayMs, config }) => {
|
|
9128
9058
|
if (PluginClass.canGenerateRemote === false || !shouldGenerateRemote()) {
|
|
9129
|
-
|
|
9059
|
+
logger.debug(`Using local redteam generation for ${key}`);
|
|
9130
9060
|
return new PluginClass(provider, purpose, injectVar, config).generateTests(n, delayMs);
|
|
9131
9061
|
}
|
|
9132
9062
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9188,7 +9118,7 @@ const pluginFactories = [
|
|
|
9188
9118
|
key: category,
|
|
9189
9119
|
action: async (params) => {
|
|
9190
9120
|
if (neverGenerateRemote()) {
|
|
9191
|
-
|
|
9121
|
+
logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9192
9122
|
return [];
|
|
9193
9123
|
}
|
|
9194
9124
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -9225,7 +9155,7 @@ const piiPlugins = PII_PLUGINS.map((category) => ({
|
|
|
9225
9155
|
}
|
|
9226
9156
|
}));
|
|
9227
9157
|
}
|
|
9228
|
-
|
|
9158
|
+
logger.debug(`Using local redteam generation for ${category}`);
|
|
9229
9159
|
return (await getPiiLeakTestsForCategory(params, category)).map((testCase) => ({
|
|
9230
9160
|
...testCase,
|
|
9231
9161
|
metadata: {
|
|
@@ -9239,7 +9169,7 @@ const biasPlugins = BIAS_PLUGINS.map((category) => ({
|
|
|
9239
9169
|
key: category,
|
|
9240
9170
|
action: async (params) => {
|
|
9241
9171
|
if (neverGenerateRemote()) {
|
|
9242
|
-
|
|
9172
|
+
logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9243
9173
|
return [];
|
|
9244
9174
|
}
|
|
9245
9175
|
const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
|
|
@@ -9263,7 +9193,7 @@ function createRemotePlugin(key, validate) {
|
|
|
9263
9193
|
validate,
|
|
9264
9194
|
action: async ({ purpose, injectVar, n, config }) => {
|
|
9265
9195
|
if (neverGenerateRemote()) {
|
|
9266
|
-
|
|
9196
|
+
logger.error(`${key} plugin requires remote generation to be enabled`);
|
|
9267
9197
|
return [];
|
|
9268
9198
|
}
|
|
9269
9199
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9296,7 +9226,6 @@ const Plugins = [
|
|
|
9296
9226
|
...biasPlugins,
|
|
9297
9227
|
...remotePlugins
|
|
9298
9228
|
];
|
|
9299
|
-
|
|
9300
9229
|
//#endregion
|
|
9301
9230
|
//#region src/redteam/sharpAvailability.ts
|
|
9302
9231
|
const SHARP_REQUIRED_STRATEGIES = ["image"];
|
|
@@ -9332,7 +9261,6 @@ async function validateSharpDependency(strategies, plugins, checkSharp = isSharp
|
|
|
9332
9261
|
throw new Error(`The sharp library is required for ${features.join(", ")} and must be manually installed separately.\nInstall it with: npm install sharp`);
|
|
9333
9262
|
}
|
|
9334
9263
|
}
|
|
9335
|
-
|
|
9336
9264
|
//#endregion
|
|
9337
9265
|
//#region src/redteam/index.ts
|
|
9338
9266
|
function getPolicyText(metadata) {
|
|
@@ -9551,7 +9479,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9551
9479
|
const newTestCases = [];
|
|
9552
9480
|
const strategyResults = {};
|
|
9553
9481
|
for (const strategy of strategies) {
|
|
9554
|
-
|
|
9482
|
+
logger.debug(`Generating ${strategy.id} tests`);
|
|
9555
9483
|
let strategyAction;
|
|
9556
9484
|
if (strategy.id.startsWith("file://")) strategyAction = (await loadStrategy(strategy.id)).action;
|
|
9557
9485
|
else {
|
|
@@ -9561,7 +9489,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9561
9489
|
builtinStrategy = Strategies.find((s) => s.id === baseStrategyId);
|
|
9562
9490
|
}
|
|
9563
9491
|
if (!builtinStrategy) {
|
|
9564
|
-
|
|
9492
|
+
logger.warn(`Strategy ${strategy.id} not registered, skipping`);
|
|
9565
9493
|
continue;
|
|
9566
9494
|
}
|
|
9567
9495
|
strategyAction = builtinStrategy.action;
|
|
@@ -9570,7 +9498,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9570
9498
|
const applicableTestCases = testCases.filter((t) => {
|
|
9571
9499
|
if (!pluginMatchesStrategyTargets(t, strategy.id, targetPlugins)) return false;
|
|
9572
9500
|
if (t.metadata?.retry === true) {
|
|
9573
|
-
|
|
9501
|
+
logger.debug(`Skipping ${strategy.id} for retry test (plugin: ${t.metadata?.pluginId}) - retry tests are not transformed`);
|
|
9574
9502
|
return false;
|
|
9575
9503
|
}
|
|
9576
9504
|
return true;
|
|
@@ -9578,26 +9506,26 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9578
9506
|
const numTestsLimit = strategy.config?.numTests;
|
|
9579
9507
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit >= 0) {
|
|
9580
9508
|
if (numTestsLimit === 0) {
|
|
9581
|
-
|
|
9509
|
+
logger.warn(`[Strategy] ${strategy.id}: numTests=0 configured, skipping strategy`);
|
|
9582
9510
|
continue;
|
|
9583
9511
|
}
|
|
9584
9512
|
}
|
|
9585
9513
|
let testCasesToProcess = applicableTestCases;
|
|
9586
9514
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9587
9515
|
if (applicableTestCases.length > numTestsLimit) {
|
|
9588
|
-
|
|
9516
|
+
logger.debug(`[Strategy] ${strategy.id}: Pre-limiting ${applicableTestCases.length} tests to numTests=${numTestsLimit}`);
|
|
9589
9517
|
testCasesToProcess = applicableTestCases.slice(0, numTestsLimit);
|
|
9590
9518
|
}
|
|
9591
9519
|
}
|
|
9592
9520
|
const strategyTestCases = await strategyAction(testCasesToProcess, injectVar, {
|
|
9593
9521
|
...strategy.config || {},
|
|
9594
|
-
redteamProvider:
|
|
9522
|
+
redteamProvider: state.config?.redteam?.provider,
|
|
9595
9523
|
excludeTargetOutputFromAgenticAttackGeneration
|
|
9596
9524
|
}, strategy.id);
|
|
9597
9525
|
let resultTestCases = strategyTestCases.filter((t) => t !== null && t !== void 0);
|
|
9598
9526
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9599
9527
|
if (resultTestCases.length > numTestsLimit) {
|
|
9600
|
-
|
|
9528
|
+
logger.warn(`[Strategy] ${strategy.id}: Post-cap safety net applied (${resultTestCases.length} -> ${numTestsLimit}). Strategy generated more tests than input.`);
|
|
9601
9529
|
resultTestCases = resultTestCases.slice(0, numTestsLimit);
|
|
9602
9530
|
}
|
|
9603
9531
|
}
|
|
@@ -9744,11 +9672,11 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9744
9672
|
if (prompts.length === 0) throw new Error("Prompts array cannot be empty");
|
|
9745
9673
|
if (delay && maxConcurrency > 1) {
|
|
9746
9674
|
maxConcurrency = 1;
|
|
9747
|
-
|
|
9675
|
+
logger.warn("Delay is enabled, setting max concurrency to 1.");
|
|
9748
9676
|
}
|
|
9749
9677
|
if (maxConcurrency > MAX_MAX_CONCURRENCY) {
|
|
9750
9678
|
maxConcurrency = MAX_MAX_CONCURRENCY;
|
|
9751
|
-
|
|
9679
|
+
logger.info(`Max concurrency for test generation is capped at ${MAX_MAX_CONCURRENCY}.`);
|
|
9752
9680
|
}
|
|
9753
9681
|
const expandedStrategies = [];
|
|
9754
9682
|
strategies.forEach((strategy) => {
|
|
@@ -9760,7 +9688,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9760
9688
|
id: strategyId
|
|
9761
9689
|
});
|
|
9762
9690
|
});
|
|
9763
|
-
else
|
|
9691
|
+
else logger.warn(`Strategy collection ${strategy.id} has no mappings, skipping`);
|
|
9764
9692
|
} else expandedStrategies.push(strategy);
|
|
9765
9693
|
});
|
|
9766
9694
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -9775,7 +9703,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9775
9703
|
strategies = expandedStrategies.filter((strategy) => {
|
|
9776
9704
|
const key = keyForStrategy(strategy);
|
|
9777
9705
|
if (seen.has(key)) {
|
|
9778
|
-
|
|
9706
|
+
logger.debug(`[Synthesize] Skipping duplicate strategy: ${key}`);
|
|
9779
9707
|
return false;
|
|
9780
9708
|
}
|
|
9781
9709
|
seen.add(key);
|
|
@@ -9786,7 +9714,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9786
9714
|
await validateSharpDependency(strategies, plugins);
|
|
9787
9715
|
const redteamProvider = await redteamProviderManager.getProvider({ provider });
|
|
9788
9716
|
const { effectiveStrategyCount, includeBasicTests, totalPluginTests, totalTests } = calculateTotalTests(plugins, strategies, language);
|
|
9789
|
-
|
|
9717
|
+
logger.info(`Synthesizing test cases for ${prompts.length} ${prompts.length === 1 ? "prompt" : "prompts"}...\nUsing plugins:\n\n${chalk.yellow(plugins.map((p) => {
|
|
9790
9718
|
const pluginLanguageConfig = p.config?.language ?? language;
|
|
9791
9719
|
const pluginLanguageCount = Array.isArray(pluginLanguageConfig) ? pluginLanguageConfig.length : 1;
|
|
9792
9720
|
const actualTestCount = (p.numTests || 0) * pluginLanguageCount;
|
|
@@ -9804,14 +9732,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9804
9732
|
configSummary = policyText.length > 70 ? policyText.slice(0, 70) + "..." : policyText;
|
|
9805
9733
|
}
|
|
9806
9734
|
} else configSummary = " (custom config)";
|
|
9807
|
-
|
|
9735
|
+
logger.debug("Plugin config", {
|
|
9808
9736
|
pluginId: p.id,
|
|
9809
9737
|
config: p.config
|
|
9810
9738
|
});
|
|
9811
9739
|
}
|
|
9812
9740
|
return `${p.id} (${formatTestCount(actualTestCount, false)})${configSummary}`;
|
|
9813
9741
|
}).sort().join("\n"))}\n`);
|
|
9814
|
-
if (strategies.length > 0)
|
|
9742
|
+
if (strategies.length > 0) logger.info(`Using strategies:\n\n${chalk.yellow(strategies.filter((s) => !["basic", "retry"].includes(s.id)).map((s) => {
|
|
9815
9743
|
let testCount = totalPluginTests;
|
|
9816
9744
|
let n = 1;
|
|
9817
9745
|
if (typeof s.config?.n === "number") n = s.config.n;
|
|
@@ -9821,21 +9749,21 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9821
9749
|
if (typeof numTestsCap === "number" && Number.isFinite(numTestsCap) && numTestsCap >= 0) testCount = Math.min(testCount, numTestsCap);
|
|
9822
9750
|
return `${s.id} (${formatTestCount(testCount, true)})`;
|
|
9823
9751
|
}).sort().join("\n"))}\n`);
|
|
9824
|
-
|
|
9752
|
+
logger.info(chalk.bold(`Test Generation Summary:`) + `\n• Total tests: ${chalk.cyan(totalTests)}\n• Plugin tests: ${chalk.cyan(totalPluginTests)}\n• Plugins: ${chalk.cyan(plugins.length)}\n• Strategies: ${chalk.cyan(effectiveStrategyCount)}\n• Max concurrency: ${chalk.cyan(maxConcurrency)}\n` + (delay ? `• Delay: ${chalk.cyan(delay)}\n` : ""));
|
|
9825
9753
|
const hasMultipleInputs = inputs && Object.keys(inputs).length > 0;
|
|
9826
9754
|
if (hasMultipleInputs) {
|
|
9827
9755
|
const inputKeys = Object.keys(inputs);
|
|
9828
|
-
|
|
9756
|
+
logger.info(`Using multi-input mode with ${inputKeys.length} variables: ${inputKeys.join(", ")}`);
|
|
9829
9757
|
injectVar = MULTI_INPUT_VAR;
|
|
9830
9758
|
const multiInputExcluded = [...DATASET_EXEMPT_PLUGINS, ...MULTI_INPUT_EXCLUDED_PLUGINS];
|
|
9831
9759
|
const removedPlugins = plugins.filter((p) => multiInputExcluded.includes(p.id));
|
|
9832
9760
|
plugins = plugins.filter((p) => !multiInputExcluded.includes(p.id));
|
|
9833
|
-
if (removedPlugins.length > 0)
|
|
9761
|
+
if (removedPlugins.length > 0) logger.info(`Skipping ${removedPlugins.length} plugin${removedPlugins.length > 1 ? "s" : ""} in multi-input mode: ${removedPlugins.map((p) => p.id).join(", ")}`);
|
|
9834
9762
|
}
|
|
9835
9763
|
if (typeof injectVar !== "string") {
|
|
9836
9764
|
const parsedVars = extractVariablesFromTemplates(prompts);
|
|
9837
|
-
if (parsedVars.length > 1)
|
|
9838
|
-
else if (parsedVars.length === 0)
|
|
9765
|
+
if (parsedVars.length > 1) logger.warn(`\nMultiple variables found in prompts: ${parsedVars.join(", ")}. Using the last one "${parsedVars[parsedVars.length - 1]}". Override this selection with --injectVar`);
|
|
9766
|
+
else if (parsedVars.length === 0) logger.warn("No variables found in prompts. Using \"query\" as the inject variable.");
|
|
9839
9767
|
injectVar = parsedVars[parsedVars.length - 1] || "query";
|
|
9840
9768
|
invariant(typeof injectVar === "string", `Inject var must be a string, got ${injectVar}`);
|
|
9841
9769
|
}
|
|
@@ -9869,7 +9797,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9869
9797
|
if (Object.keys(categories).includes(plugin.id)) return false;
|
|
9870
9798
|
const registeredPlugin = Plugins.find((p) => p.key === plugin.id);
|
|
9871
9799
|
if (!registeredPlugin) {
|
|
9872
|
-
if (!plugin.id.startsWith("file://"))
|
|
9800
|
+
if (!plugin.id.startsWith("file://")) logger.debug(`Plugin ${plugin.id} not registered, skipping validation`);
|
|
9873
9801
|
} else if (registeredPlugin.validate) try {
|
|
9874
9802
|
registeredPlugin.validate({
|
|
9875
9803
|
language,
|
|
@@ -9880,24 +9808,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9880
9808
|
...resolvePluginConfig(plugin.config)
|
|
9881
9809
|
});
|
|
9882
9810
|
} catch (error) {
|
|
9883
|
-
|
|
9811
|
+
logger.warn(`Validation failed for plugin ${plugin.id}: ${error}, skipping plugin.`);
|
|
9884
9812
|
return false;
|
|
9885
9813
|
}
|
|
9886
9814
|
return true;
|
|
9887
9815
|
};
|
|
9888
|
-
|
|
9816
|
+
logger.debug("Validating plugins...");
|
|
9889
9817
|
plugins = [...new Set(expandedPlugins)].filter(validatePlugin).sort();
|
|
9890
9818
|
if (shouldGenerateRemote()) {
|
|
9891
9819
|
const healthUrl = getRemoteHealthUrl();
|
|
9892
9820
|
if (healthUrl) {
|
|
9893
|
-
|
|
9821
|
+
logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
9894
9822
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
9895
9823
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with test generation: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
9896
|
-
|
|
9824
|
+
logger.debug("API health check passed");
|
|
9897
9825
|
}
|
|
9898
9826
|
}
|
|
9899
9827
|
let progressBar = null;
|
|
9900
|
-
const showProgressBar = !Boolean(
|
|
9828
|
+
const showProgressBar = !Boolean(state.webUI) && getEnvString("LOG_LEVEL") !== "debug" && getLogLevel() !== "debug" && showProgressBarOverride !== false;
|
|
9901
9829
|
if (showProgressBar) {
|
|
9902
9830
|
progressBar = new cliProgress.SingleBar({
|
|
9903
9831
|
format: "Generating | {bar} | {percentage}% | {value}/{total} | {task}",
|
|
@@ -9906,24 +9834,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9906
9834
|
progressBar.start(totalTests, 0, { task: "Initializing" });
|
|
9907
9835
|
}
|
|
9908
9836
|
if (showProgressBar) progressBar?.update({ task: "Extracting system purpose" });
|
|
9909
|
-
else
|
|
9837
|
+
else logger.info("Extracting system purpose...");
|
|
9910
9838
|
const purpose = purposeOverride || await extractSystemPurpose(redteamProvider, prompts);
|
|
9911
9839
|
if (showProgressBar) progressBar?.update({ task: "Extracting entities" });
|
|
9912
|
-
else
|
|
9840
|
+
else logger.info("Extracting entities...");
|
|
9913
9841
|
const entities = Array.isArray(entitiesOverride) ? entitiesOverride : await extractEntities(redteamProvider, prompts);
|
|
9914
|
-
|
|
9842
|
+
logger.debug(`System purpose: ${purpose}`);
|
|
9915
9843
|
const pluginResults = {};
|
|
9916
9844
|
const testCases = [];
|
|
9917
9845
|
await async.forEachLimit(plugins, maxConcurrency, async (plugin) => {
|
|
9918
9846
|
checkAbort();
|
|
9919
9847
|
if (showProgressBar) progressBar?.update({ task: plugin.id });
|
|
9920
|
-
else
|
|
9848
|
+
else logger.info(`Generating tests for ${plugin.id}...`);
|
|
9921
9849
|
const { action } = Plugins.find((p) => p.key === plugin.id) || {};
|
|
9922
9850
|
if (action) {
|
|
9923
|
-
|
|
9851
|
+
logger.debug(`Generating tests for ${plugin.id}...`);
|
|
9924
9852
|
const languageConfig = plugin.config?.language ?? language;
|
|
9925
9853
|
const languages = Array.isArray(languageConfig) ? languageConfig : languageConfig ? [languageConfig] : [void 0];
|
|
9926
|
-
|
|
9854
|
+
logger.debug(`[Language Processing] Plugin: ${plugin.id}, Languages: ${JSON.stringify(languages)}, NumTests per language: ${plugin.numTests}${plugin.config?.language ? " (plugin override)" : ""}`);
|
|
9927
9855
|
const allPluginTests = [];
|
|
9928
9856
|
const resultsPerLanguage = {};
|
|
9929
9857
|
const languagePromises = languages.map(async (lang) => {
|
|
@@ -9951,7 +9879,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9951
9879
|
requested: plugin.numTests,
|
|
9952
9880
|
generated: pluginTests.length
|
|
9953
9881
|
};
|
|
9954
|
-
|
|
9882
|
+
logger.warn(`[Language Processing] No tests generated for ${plugin.id} in language: ${lang || "default"}`);
|
|
9955
9883
|
return {
|
|
9956
9884
|
lang: langKey,
|
|
9957
9885
|
tests: [],
|
|
@@ -9968,13 +9896,13 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9968
9896
|
requested,
|
|
9969
9897
|
generated
|
|
9970
9898
|
};
|
|
9971
|
-
} else
|
|
9972
|
-
|
|
9973
|
-
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0)
|
|
9899
|
+
} else logger.warn(`[Language Processing] Error generating tests for ${plugin.id}: ${result.reason}`);
|
|
9900
|
+
logger.debug(`[Language Processing] Total tests generated for ${plugin.id}: ${allPluginTests.length} (across ${languages.length} language(s))`);
|
|
9901
|
+
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) logger.warn(`Failed to generate tests for ${plugin.id}`);
|
|
9974
9902
|
else {
|
|
9975
9903
|
const testCasesWithMetadata = allPluginTests;
|
|
9976
9904
|
if (needsGoalExtraction) {
|
|
9977
|
-
|
|
9905
|
+
logger.debug(`Extracting goal for ${testCasesWithMetadata.length} tests from ${plugin.id}...`);
|
|
9978
9906
|
for (const testCase of testCasesWithMetadata) {
|
|
9979
9907
|
const promptVar = testCase.vars?.[injectVar];
|
|
9980
9908
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -9986,8 +9914,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9986
9914
|
testCases.push(...testCasesWithMetadata);
|
|
9987
9915
|
}
|
|
9988
9916
|
if (showProgressBar) progressBar?.increment(plugin.numTests * languages.length);
|
|
9989
|
-
else
|
|
9990
|
-
|
|
9917
|
+
else logger.info(`Generated ${allPluginTests.length} tests for ${plugin.id}`);
|
|
9918
|
+
logger.debug(`Added ${allPluginTests.length} ${plugin.id} test cases`);
|
|
9991
9919
|
const definedLanguages = languages.filter((lang) => lang !== void 0);
|
|
9992
9920
|
const baseDisplayId = getPluginDisplayId(plugin);
|
|
9993
9921
|
if (definedLanguages.length > 1) for (const [langKey, result] of Object.entries(resultsPerLanguage)) {
|
|
@@ -10017,7 +9945,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10017
9945
|
}
|
|
10018
9946
|
}));
|
|
10019
9947
|
if (needsGoalExtraction) {
|
|
10020
|
-
|
|
9948
|
+
logger.debug(`Extracting goal for ${testCasesWithMetadata.length} custom tests from ${plugin.id}...`);
|
|
10021
9949
|
for (const testCase of testCasesWithMetadata) {
|
|
10022
9950
|
const promptVar = testCase.vars?.[injectVar];
|
|
10023
9951
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -10027,14 +9955,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10027
9955
|
}
|
|
10028
9956
|
}
|
|
10029
9957
|
testCases.push(...testCasesWithMetadata);
|
|
10030
|
-
|
|
9958
|
+
logger.debug(`Added ${customTests.length} custom test cases from ${plugin.id}`);
|
|
10031
9959
|
const displayId = getPluginDisplayId(plugin);
|
|
10032
9960
|
pluginResults[displayId] = {
|
|
10033
9961
|
requested: plugin.numTests,
|
|
10034
9962
|
generated: customTests.length
|
|
10035
9963
|
};
|
|
10036
9964
|
} catch (e) {
|
|
10037
|
-
|
|
9965
|
+
logger.error(`Error generating tests for custom plugin ${plugin.id}: ${e}`);
|
|
10038
9966
|
const displayId = getPluginDisplayId(plugin);
|
|
10039
9967
|
pluginResults[displayId] = {
|
|
10040
9968
|
requested: plugin.numTests,
|
|
@@ -10042,7 +9970,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10042
9970
|
};
|
|
10043
9971
|
}
|
|
10044
9972
|
else {
|
|
10045
|
-
|
|
9973
|
+
logger.warn(`Plugin ${plugin.id} not registered, skipping`);
|
|
10046
9974
|
const displayId = getPluginDisplayId(plugin);
|
|
10047
9975
|
pluginResults[displayId] = {
|
|
10048
9976
|
requested: plugin.numTests,
|
|
@@ -10056,7 +9984,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10056
9984
|
const retryStrategy = strategies.find((s) => s.id === "retry");
|
|
10057
9985
|
if (retryStrategy) {
|
|
10058
9986
|
if (showProgressBar) progressBar?.update({ task: "Applying retry strategy" });
|
|
10059
|
-
|
|
9987
|
+
logger.debug("Applying retry strategy first");
|
|
10060
9988
|
retryStrategy.config = {
|
|
10061
9989
|
targetIds,
|
|
10062
9990
|
...retryStrategy.config
|
|
@@ -10076,8 +10004,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10076
10004
|
checkAbort();
|
|
10077
10005
|
progressBar?.update({ task: "Done." });
|
|
10078
10006
|
progressBar?.stop();
|
|
10079
|
-
if (progressBar)
|
|
10080
|
-
|
|
10007
|
+
if (progressBar) logger.info("");
|
|
10008
|
+
logger.info(generateReport(pluginResults, strategyResults));
|
|
10081
10009
|
const failedPlugins = Object.entries(pluginResults).filter(([_, { requested, generated }]) => requested > 0 && generated === 0).map(([pluginId, { requested }]) => ({
|
|
10082
10010
|
pluginId,
|
|
10083
10011
|
requested
|
|
@@ -10090,7 +10018,6 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10090
10018
|
failedPlugins
|
|
10091
10019
|
};
|
|
10092
10020
|
}
|
|
10093
|
-
|
|
10094
10021
|
//#endregion
|
|
10095
10022
|
//#region src/redteam/commands/generate.ts
|
|
10096
10023
|
/**
|
|
@@ -10117,8 +10044,8 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
10117
10044
|
- Retry the scan after resolving any reported errors
|
|
10118
10045
|
`;
|
|
10119
10046
|
if (strict) throw new PartialGenerationError(failedPlugins);
|
|
10120
|
-
|
|
10121
|
-
|
|
10047
|
+
logger.warn(warningMessage);
|
|
10048
|
+
logger.warn(chalk.yellow(`Continuing with partial results. Use ${chalk.bold("--strict")} flag to fail on plugin generation errors.`));
|
|
10122
10049
|
}
|
|
10123
10050
|
function getConfigHash(configPath) {
|
|
10124
10051
|
const content = fs$1.readFileSync(configPath, "utf8");
|
|
@@ -10145,12 +10072,12 @@ function createHeaderComments({ title, timestampLabel, author, cloudHost, testCa
|
|
|
10145
10072
|
async function doGenerateRedteam(options) {
|
|
10146
10073
|
setupEnv(options.envFile);
|
|
10147
10074
|
if (!options.cache) {
|
|
10148
|
-
|
|
10075
|
+
logger.info("Cache is disabled");
|
|
10149
10076
|
disableCache();
|
|
10150
10077
|
}
|
|
10151
10078
|
const probeLimitResult = checkRedteamProbeLimit();
|
|
10152
10079
|
if (!probeLimitResult.withinLimit) {
|
|
10153
|
-
|
|
10080
|
+
logger.error(dedent`
|
|
10154
10081
|
${chalk.red.bold("Monthly probe limit reached")}
|
|
10155
10082
|
|
|
10156
10083
|
You've used ${chalk.bold(probeLimitResult.used.toLocaleString())} of your ${chalk.bold(MONTHLY_PROBE_LIMIT.toLocaleString())} free monthly probes.
|
|
@@ -10176,7 +10103,7 @@ async function doGenerateRedteam(options) {
|
|
|
10176
10103
|
fs$1.mkdirSync(path.dirname(tmpFile), { recursive: true });
|
|
10177
10104
|
fs$1.writeFileSync(tmpFile, yaml.dump(options.configFromCloud));
|
|
10178
10105
|
configPath = tmpFile;
|
|
10179
|
-
|
|
10106
|
+
logger.debug(`Using Promptfoo Cloud-originated config at ${tmpFile}`);
|
|
10180
10107
|
}
|
|
10181
10108
|
let shouldGenerate = options.force || options.configFromCloud;
|
|
10182
10109
|
if (!options.force && !options.configFromCloud && fs$1.existsSync(outputPath) && configPath && fs$1.existsSync(configPath)) {
|
|
@@ -10184,7 +10111,7 @@ async function doGenerateRedteam(options) {
|
|
|
10184
10111
|
const redteamContent = yaml.load(fs$1.readFileSync(outputPath, "utf8"));
|
|
10185
10112
|
shouldGenerate = redteamContent.metadata?.configHash !== getConfigHash(configPath);
|
|
10186
10113
|
if (!shouldGenerate) {
|
|
10187
|
-
|
|
10114
|
+
logger.warn("No changes detected in redteam configuration. Skipping generation (use --force to generate anyway)");
|
|
10188
10115
|
return redteamContent;
|
|
10189
10116
|
}
|
|
10190
10117
|
}
|
|
@@ -10198,7 +10125,7 @@ async function doGenerateRedteam(options) {
|
|
|
10198
10125
|
commandLineOptions = resolved.commandLineOptions;
|
|
10199
10126
|
resolvedConfig = resolved.config;
|
|
10200
10127
|
await checkCloudPermissions(resolved.config);
|
|
10201
|
-
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0)
|
|
10128
|
+
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) logger.warn(chalk.yellow(dedent`
|
|
10202
10129
|
⚠️ Warning: Found both 'tests' section and 'redteam' configuration in your config file.
|
|
10203
10130
|
|
|
10204
10131
|
The 'tests' section is ignored when generating red team tests. Red team automatically
|
|
@@ -10220,7 +10147,7 @@ async function doGenerateRedteam(options) {
|
|
|
10220
10147
|
}
|
|
10221
10148
|
}
|
|
10222
10149
|
} catch (error) {
|
|
10223
|
-
|
|
10150
|
+
logger.error(`Plugin severity override check failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
10224
10151
|
}
|
|
10225
10152
|
} else if (options.purpose) testSuite = {
|
|
10226
10153
|
prompts: [],
|
|
@@ -10228,18 +10155,18 @@ async function doGenerateRedteam(options) {
|
|
|
10228
10155
|
tests: []
|
|
10229
10156
|
};
|
|
10230
10157
|
else {
|
|
10231
|
-
|
|
10158
|
+
logger.info(chalk.red(`\nCan't generate without configuration - run ${chalk.yellow.bold(promptfooCommand("redteam init"))} first`));
|
|
10232
10159
|
return null;
|
|
10233
10160
|
}
|
|
10234
10161
|
if (!neverGenerateRemote()) {
|
|
10235
10162
|
let hasValidEmail = false;
|
|
10236
10163
|
while (!hasValidEmail) {
|
|
10237
10164
|
const { emailNeedsValidation } = await promptForEmailUnverified();
|
|
10238
|
-
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
10165
|
+
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
10239
10166
|
}
|
|
10240
10167
|
}
|
|
10241
10168
|
const startTime = Date.now();
|
|
10242
|
-
|
|
10169
|
+
telemetry.record("command_used", {
|
|
10243
10170
|
name: "generate redteam - started",
|
|
10244
10171
|
numPrompts: testSuite.prompts.length,
|
|
10245
10172
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10247,7 +10174,7 @@ async function doGenerateRedteam(options) {
|
|
|
10247
10174
|
strategies: redteamConfig?.strategies?.map((s) => typeof s === "string" ? s : s.id) || [],
|
|
10248
10175
|
isPromptfooSampleTarget: testSuite.providers.some(isPromptfooSampleTarget)
|
|
10249
10176
|
});
|
|
10250
|
-
|
|
10177
|
+
telemetry.record("redteam generate", {
|
|
10251
10178
|
phase: "started",
|
|
10252
10179
|
numPrompts: testSuite.prompts.length,
|
|
10253
10180
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10291,7 +10218,7 @@ async function doGenerateRedteam(options) {
|
|
|
10291
10218
|
}
|
|
10292
10219
|
return plugin;
|
|
10293
10220
|
});
|
|
10294
|
-
|
|
10221
|
+
logger.info(`Applied ${intersectionCount} custom plugin severity levels`);
|
|
10295
10222
|
}
|
|
10296
10223
|
const policyPluginsWithRefs = plugins.filter((plugin) => plugin.config?.policy && isValidPolicyObject(plugin.config?.policy) && determinePolicyTypeFromId(plugin.config.policy.id) === "reusable");
|
|
10297
10224
|
if (policyPluginsWithRefs.length > 0) {
|
|
@@ -10314,18 +10241,18 @@ async function doGenerateRedteam(options) {
|
|
|
10314
10241
|
if (options.strategies) strategies = options.strategies;
|
|
10315
10242
|
const strategyObjs = strategies.map((s) => typeof s === "string" ? { id: s } : s);
|
|
10316
10243
|
try {
|
|
10317
|
-
|
|
10318
|
-
|
|
10244
|
+
logger.debug(`plugins: ${plugins.map((p) => p.id).join(", ")}`);
|
|
10245
|
+
logger.debug(`strategies: ${strategyObjs.map((s) => s.id ?? s).join(", ")}`);
|
|
10319
10246
|
} catch (error) {
|
|
10320
|
-
|
|
10321
|
-
|
|
10247
|
+
logger.error("Error logging plugins and strategies. One did not have a valid id.");
|
|
10248
|
+
logger.error(`Error details: ${error instanceof Error ? error.message : String(error)}`);
|
|
10322
10249
|
}
|
|
10323
10250
|
const targetInputs = testSuite.providers[0]?.inputs;
|
|
10324
10251
|
const config = {
|
|
10325
10252
|
injectVar: redteamConfig?.injectVar || options.injectVar,
|
|
10326
10253
|
inputs: targetInputs,
|
|
10327
10254
|
language: redteamConfig?.language || options.language,
|
|
10328
|
-
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ??
|
|
10255
|
+
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? 4,
|
|
10329
10256
|
numTests: redteamConfig?.numTests ?? options.numTests,
|
|
10330
10257
|
entities: redteamConfig?.entities,
|
|
10331
10258
|
plugins,
|
|
@@ -10346,18 +10273,18 @@ async function doGenerateRedteam(options) {
|
|
|
10346
10273
|
if (typeof target === "string") return target;
|
|
10347
10274
|
return target.id;
|
|
10348
10275
|
}).filter((id) => typeof id === "string") : []) ?? [];
|
|
10349
|
-
|
|
10276
|
+
logger.debug(`Extracted ${targetIds.length} target IDs from config providers: ${JSON.stringify(targetIds)}`);
|
|
10350
10277
|
let enhancedPurpose = parsedConfig.data.purpose || "";
|
|
10351
10278
|
let augmentedTestGenerationInstructions = config.testGenerationInstructions ?? "";
|
|
10352
10279
|
try {
|
|
10353
10280
|
const mcpToolsInfo = await extractMcpToolsInfo(testSuite.providers);
|
|
10354
10281
|
if (mcpToolsInfo) {
|
|
10355
10282
|
enhancedPurpose = enhancedPurpose ? `${enhancedPurpose}\n\n${mcpToolsInfo}\n\n` : mcpToolsInfo;
|
|
10356
|
-
|
|
10283
|
+
logger.info("Added MCP tools information to red team purpose");
|
|
10357
10284
|
augmentedTestGenerationInstructions += `\nGenerate every test case prompt as a json string encoding the tool call and parameters, and choose a specific function to call. The specific format should be: {"tool": "function_name", "args": {...}}.`;
|
|
10358
10285
|
}
|
|
10359
10286
|
} catch (error) {
|
|
10360
|
-
|
|
10287
|
+
logger.warn(`Failed to extract MCP tools information: ${error instanceof Error ? error.message : String(error)}`);
|
|
10361
10288
|
}
|
|
10362
10289
|
const contexts = redteamConfig?.contexts;
|
|
10363
10290
|
let redteamTests = [];
|
|
@@ -10366,10 +10293,10 @@ async function doGenerateRedteam(options) {
|
|
|
10366
10293
|
let finalInjectVar = "";
|
|
10367
10294
|
let failedPlugins = [];
|
|
10368
10295
|
if (contexts && contexts.length > 0) {
|
|
10369
|
-
|
|
10296
|
+
logger.info(`Generating tests for ${contexts.length} contexts...`);
|
|
10370
10297
|
const allFailedPlugins = [];
|
|
10371
10298
|
for (const context of contexts) {
|
|
10372
|
-
|
|
10299
|
+
logger.info(` Generating tests for context: ${context.id}`);
|
|
10373
10300
|
const contextPurpose = context.purpose + (enhancedPurpose ? `\n\n${enhancedPurpose}` : "");
|
|
10374
10301
|
const contextResult = await synthesize({
|
|
10375
10302
|
...parsedConfig.data,
|
|
@@ -10404,7 +10331,7 @@ async function doGenerateRedteam(options) {
|
|
|
10404
10331
|
}
|
|
10405
10332
|
failedPlugins = allFailedPlugins;
|
|
10406
10333
|
purpose = contexts[0].purpose;
|
|
10407
|
-
|
|
10334
|
+
logger.info(`Generated ${redteamTests.length} total test cases across ${contexts.length} contexts`);
|
|
10408
10335
|
} else {
|
|
10409
10336
|
const result = await synthesize({
|
|
10410
10337
|
...parsedConfig.data,
|
|
@@ -10433,20 +10360,20 @@ async function doGenerateRedteam(options) {
|
|
|
10433
10360
|
*/
|
|
10434
10361
|
const cleanupProvider = async () => {
|
|
10435
10362
|
try {
|
|
10436
|
-
|
|
10363
|
+
logger.debug("Cleaning up provider");
|
|
10437
10364
|
const provider = testSuite.providers[0];
|
|
10438
10365
|
if (provider && typeof provider.cleanup === "function") {
|
|
10439
10366
|
const cleanupResult = provider.cleanup();
|
|
10440
10367
|
if (cleanupResult instanceof Promise) await cleanupResult;
|
|
10441
10368
|
}
|
|
10442
10369
|
} catch (cleanupErr) {
|
|
10443
|
-
|
|
10370
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
10444
10371
|
}
|
|
10445
10372
|
};
|
|
10446
10373
|
try {
|
|
10447
10374
|
handleFailedPlugins(failedPlugins, options.strict ?? false);
|
|
10448
10375
|
if (redteamTests.length === 0) {
|
|
10449
|
-
|
|
10376
|
+
logger.warn("No test cases generated. Please check for errors and try again.");
|
|
10450
10377
|
return null;
|
|
10451
10378
|
}
|
|
10452
10379
|
const updatedRedteamConfig = {
|
|
@@ -10465,7 +10392,7 @@ async function doGenerateRedteam(options) {
|
|
|
10465
10392
|
return encodeURIComponent(value);
|
|
10466
10393
|
}).filter((line) => line.length > 0).join("\n");
|
|
10467
10394
|
fs$1.writeFileSync(options.output, outputLines);
|
|
10468
|
-
|
|
10395
|
+
logger.info(chalk.green(`Wrote ${redteamTests.length} test cases to ${chalk.bold(options.output)}`));
|
|
10469
10396
|
return {};
|
|
10470
10397
|
} else if (options.output) {
|
|
10471
10398
|
const existingYaml = configPath ? yaml.load(fs$1.readFileSync(configPath, "utf8")) : {};
|
|
@@ -10504,8 +10431,8 @@ async function doGenerateRedteam(options) {
|
|
|
10504
10431
|
ret = writePromptfooConfig(updatedYaml, options.output, headerComments);
|
|
10505
10432
|
printBorder();
|
|
10506
10433
|
const relativeOutputPath = path.relative(process.cwd(), options.output);
|
|
10507
|
-
|
|
10508
|
-
if (!options.inRedteamRun)
|
|
10434
|
+
logger.info(`Wrote ${redteamTests.length} test cases to ${relativeOutputPath}`);
|
|
10435
|
+
if (!options.inRedteamRun) logger.info("\n" + chalk.green(`Run ${chalk.bold(relativeOutputPath === "redteam.yaml" ? promptfooCommand("redteam eval") : promptfooCommand(`redteam eval -c ${relativeOutputPath}`))} to run the red team!`));
|
|
10509
10436
|
printBorder();
|
|
10510
10437
|
} else if (options.write && configPath) {
|
|
10511
10438
|
const existingConfig = yaml.load(fs$1.readFileSync(configPath, "utf8"));
|
|
@@ -10543,9 +10470,9 @@ async function doGenerateRedteam(options) {
|
|
|
10543
10470
|
isUpdate: true
|
|
10544
10471
|
});
|
|
10545
10472
|
ret = writePromptfooConfig(existingConfig, configPath, headerComments);
|
|
10546
|
-
|
|
10473
|
+
logger.info(`\nWrote ${redteamTests.length} new test cases to ${path.relative(process.cwd(), configPath)}`);
|
|
10547
10474
|
const command = configPath.endsWith("promptfooconfig.yaml") ? promptfooCommand("eval") : promptfooCommand(`eval -c ${path.relative(process.cwd(), configPath)}`);
|
|
10548
|
-
|
|
10475
|
+
logger.info("\n" + chalk.green(`Run ${chalk.bold(`${command}`)} to run the red team!`));
|
|
10549
10476
|
} else {
|
|
10550
10477
|
const headerComments = createHeaderComments({
|
|
10551
10478
|
title: "REDTEAM CONFIGURATION",
|
|
@@ -10561,7 +10488,7 @@ async function doGenerateRedteam(options) {
|
|
|
10561
10488
|
tests: redteamTests
|
|
10562
10489
|
}, "redteam.yaml", headerComments);
|
|
10563
10490
|
}
|
|
10564
|
-
|
|
10491
|
+
telemetry.record("command_used", {
|
|
10565
10492
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10566
10493
|
name: "generate redteam",
|
|
10567
10494
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10571,7 +10498,7 @@ async function doGenerateRedteam(options) {
|
|
|
10571
10498
|
strategies: strategies.map((s) => typeof s === "string" ? s : s.id),
|
|
10572
10499
|
isPromptfooSampleTarget: testSuite.providers.some(isPromptfooSampleTarget)
|
|
10573
10500
|
});
|
|
10574
|
-
|
|
10501
|
+
telemetry.record("redteam generate", {
|
|
10575
10502
|
phase: "completed",
|
|
10576
10503
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10577
10504
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10586,7 +10513,6 @@ async function doGenerateRedteam(options) {
|
|
|
10586
10513
|
await cleanupProvider();
|
|
10587
10514
|
}
|
|
10588
10515
|
}
|
|
10589
|
-
|
|
10590
10516
|
//#endregion
|
|
10591
10517
|
//#region src/util/inlineBlobsForShare.ts
|
|
10592
10518
|
const BLOB_URI_PREFIX = "promptfoo://blob/";
|
|
@@ -10652,7 +10578,7 @@ async function ensureBlobPayloads(hashes, cache) {
|
|
|
10652
10578
|
dataUrl: `data:${mimeType};base64,${base64}`
|
|
10653
10579
|
});
|
|
10654
10580
|
} catch (error) {
|
|
10655
|
-
|
|
10581
|
+
logger.warn("[Share] Failed to inline blob reference", {
|
|
10656
10582
|
error,
|
|
10657
10583
|
hash
|
|
10658
10584
|
});
|
|
@@ -10698,7 +10624,6 @@ async function inlineBlobRefsForShare(value, cache) {
|
|
|
10698
10624
|
await ensureBlobPayloads(hashes, cache);
|
|
10699
10625
|
return await inlineValue(value, cache, /* @__PURE__ */ new WeakSet(), 0);
|
|
10700
10626
|
}
|
|
10701
|
-
|
|
10702
10627
|
//#endregion
|
|
10703
10628
|
//#region src/share.ts
|
|
10704
10629
|
function isSharingEnabled(evalRecord) {
|
|
@@ -10712,10 +10637,10 @@ function isSharingEnabled(evalRecord) {
|
|
|
10712
10637
|
}
|
|
10713
10638
|
function determineShareDomain(eval_) {
|
|
10714
10639
|
const sharing = eval_.config.sharing;
|
|
10715
|
-
|
|
10640
|
+
logger.debug(`Share config: isCloudEnabled=${cloudConfig.isEnabled()}, sharing=${JSON.stringify(sharing)}, evalId=${eval_.id}`);
|
|
10716
10641
|
const envAppBaseUrl = getEnvString("PROMPTFOO_REMOTE_APP_BASE_URL");
|
|
10717
10642
|
const domain = cloudConfig.isEnabled() ? cloudConfig.getAppUrl() : typeof sharing === "object" && sharing.appBaseUrl ? sharing.appBaseUrl : envAppBaseUrl || getDefaultShareViewBaseUrl();
|
|
10718
|
-
|
|
10643
|
+
logger.debug(`Share domain determined: domain=${domain}`);
|
|
10719
10644
|
return { domain };
|
|
10720
10645
|
}
|
|
10721
10646
|
function getResultSize(result) {
|
|
@@ -10747,7 +10672,7 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10747
10672
|
};
|
|
10748
10673
|
}
|
|
10749
10674
|
const jsonData = JSON.stringify(evalData);
|
|
10750
|
-
|
|
10675
|
+
logger.debug(`Sending initial eval data to ${url} - eval ${evalRecord.id} with ${evalRecord.prompts.length} prompts ${traces.length > 0 ? `and trace data` : ""}`);
|
|
10751
10676
|
const response = await fetchWithProxy(url, {
|
|
10752
10677
|
method: "POST",
|
|
10753
10678
|
headers,
|
|
@@ -10767,7 +10692,7 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10767
10692
|
errorMessage,
|
|
10768
10693
|
bodyMessage
|
|
10769
10694
|
};
|
|
10770
|
-
|
|
10695
|
+
logger.error(`Sharing your eval data to ${url} failed. Debug info: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10771
10696
|
throw new Error(`${errorMessage}${bodyMessage}`);
|
|
10772
10697
|
}
|
|
10773
10698
|
const responseJson = await response.json();
|
|
@@ -10778,7 +10703,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10778
10703
|
const targetUrl = `${url}/${evalId}/results`;
|
|
10779
10704
|
const stringifiedChunk = JSON.stringify(chunk);
|
|
10780
10705
|
const chunkSizeBytes = Buffer.byteLength(stringifiedChunk, "utf8");
|
|
10781
|
-
|
|
10706
|
+
logger.debug(`Sending chunk of ${chunk.length} results (${(chunkSizeBytes / 1024 / 1024).toFixed(2)} MB) to ${targetUrl}`);
|
|
10782
10707
|
try {
|
|
10783
10708
|
const response = await fetchWithProxy(targetUrl, {
|
|
10784
10709
|
method: "POST",
|
|
@@ -10798,7 +10723,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10798
10723
|
evalId,
|
|
10799
10724
|
responseBody: responseBody.length > 500 ? `${responseBody.slice(0, 500)}...` : responseBody
|
|
10800
10725
|
};
|
|
10801
|
-
|
|
10726
|
+
logger.debug(`Chunk send failed: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10802
10727
|
if (response.status === 413) return {
|
|
10803
10728
|
success: false,
|
|
10804
10729
|
errorType: "PAYLOAD_TOO_LARGE",
|
|
@@ -10813,7 +10738,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10813
10738
|
return { success: true };
|
|
10814
10739
|
} catch (error) {
|
|
10815
10740
|
if (error instanceof TypeError && error.message === "fetch failed") {
|
|
10816
|
-
|
|
10741
|
+
logger.debug(`Network timeout/failure for chunk of ${chunk.length} results`);
|
|
10817
10742
|
return {
|
|
10818
10743
|
success: false,
|
|
10819
10744
|
errorType: "NETWORK_TIMEOUT",
|
|
@@ -10845,41 +10770,41 @@ async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgres
|
|
|
10845
10770
|
const midpoint = Math.ceil(chunk.length / 2);
|
|
10846
10771
|
const firstHalf = chunk.slice(0, midpoint);
|
|
10847
10772
|
const secondHalf = chunk.slice(midpoint);
|
|
10848
|
-
|
|
10773
|
+
logger.info(`Chunk of ${chunk.length} results failed (${result.errorType}). Splitting into ${firstHalf.length} + ${secondHalf.length} and retrying...`);
|
|
10849
10774
|
return await sendChunkWithRetry(firstHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10850
10775
|
}
|
|
10851
10776
|
throw result.originalError ?? /* @__PURE__ */ new Error("Unknown error sending chunk");
|
|
10852
10777
|
}
|
|
10853
10778
|
async function rollbackEval(url, evalId, headers) {
|
|
10854
10779
|
const targetUrl = `${url}/${evalId}`;
|
|
10855
|
-
|
|
10780
|
+
logger.debug(`Attempting to roll back eval ${evalId} at ${targetUrl}`);
|
|
10856
10781
|
try {
|
|
10857
10782
|
const response = await fetchWithProxy(targetUrl, {
|
|
10858
10783
|
method: "DELETE",
|
|
10859
10784
|
headers
|
|
10860
10785
|
});
|
|
10861
|
-
if (response.ok)
|
|
10862
|
-
else
|
|
10786
|
+
if (response.ok) logger.debug(`Successfully rolled back eval ${evalId}`);
|
|
10787
|
+
else logger.warn(`Rollback request returned non-OK status: ${response.statusText}`);
|
|
10863
10788
|
} catch (e) {
|
|
10864
|
-
|
|
10789
|
+
logger.warn(`Failed to roll back eval ${evalId}: ${e}. You may need to manually delete this eval.`);
|
|
10865
10790
|
}
|
|
10866
10791
|
}
|
|
10867
10792
|
async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
10868
10793
|
const isVerbose = isDebugEnabled();
|
|
10869
10794
|
const { silent = false } = options;
|
|
10870
|
-
|
|
10795
|
+
logger.debug(`Starting chunked results upload to ${url}`);
|
|
10871
10796
|
await checkCloudPermissions(evalRecord.config);
|
|
10872
10797
|
const inlineBlobs = isBlobStorageEnabled() && getEnvBool("PROMPTFOO_SHARE_INLINE_BLOBS", !cloudConfig.isEnabled());
|
|
10873
10798
|
const inlineCache = inlineBlobs ? createBlobInlineCache() : null;
|
|
10874
10799
|
let sampleResults = (await evalRecord.fetchResultsBatched(100).next()).value ?? [];
|
|
10875
10800
|
if (sampleResults.length === 0) {
|
|
10876
|
-
|
|
10801
|
+
logger.debug(`No results found`);
|
|
10877
10802
|
return null;
|
|
10878
10803
|
}
|
|
10879
10804
|
if (inlineBlobs && inlineCache) sampleResults = await inlineBlobRefsForShare(sampleResults, inlineCache);
|
|
10880
|
-
|
|
10805
|
+
logger.debug(`Loaded ${sampleResults.length} sample results to determine chunk size`);
|
|
10881
10806
|
const largestSize = findLargestResultSize(sampleResults);
|
|
10882
|
-
|
|
10807
|
+
logger.debug(`Largest result size from sample: ${largestSize} bytes`);
|
|
10883
10808
|
const TARGET_CHUNK_SIZE = .9 * 1024 * 1024;
|
|
10884
10809
|
const envChunkSize = getEnvInt("PROMPTFOO_SHARE_CHUNK_SIZE");
|
|
10885
10810
|
const calculatedChunkSize = Math.max(1, Math.floor(TARGET_CHUNK_SIZE / largestSize));
|
|
@@ -10888,11 +10813,11 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10888
10813
|
minResultsPerChunk: 1,
|
|
10889
10814
|
maxResultsPerChunk: resultsPerChunk
|
|
10890
10815
|
};
|
|
10891
|
-
|
|
10816
|
+
logger.debug(`Chunk config: ${JSON.stringify(chunkConfig)}`);
|
|
10892
10817
|
const headers = { "Content-Type": "application/json" };
|
|
10893
10818
|
if (cloudConfig.isEnabled()) headers["Authorization"] = `Bearer ${cloudConfig.getApiKey()}`;
|
|
10894
10819
|
const totalResults = await evalRecord.getTotalResultRowCount();
|
|
10895
|
-
|
|
10820
|
+
logger.debug(`Total results to share: ${totalResults}`);
|
|
10896
10821
|
let progressBar = null;
|
|
10897
10822
|
if (!isVerbose && !isCI() && !silent) {
|
|
10898
10823
|
progressBar = new cliProgress.SingleBar({
|
|
@@ -10904,12 +10829,12 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10904
10829
|
let evalId;
|
|
10905
10830
|
try {
|
|
10906
10831
|
evalId = await sendEvalRecord(evalRecord, url, headers);
|
|
10907
|
-
|
|
10832
|
+
logger.debug(`Initial eval data sent successfully - ${evalId}`);
|
|
10908
10833
|
let totalSent = 0;
|
|
10909
10834
|
const onProgress = (sentCount) => {
|
|
10910
10835
|
totalSent += sentCount;
|
|
10911
10836
|
if (progressBar) progressBar.update(totalSent);
|
|
10912
|
-
else
|
|
10837
|
+
else logger.info(`Progress: ${totalSent}/${totalResults} results shared (${Math.round(totalSent / totalResults * 100)}%)`);
|
|
10913
10838
|
};
|
|
10914
10839
|
let currentChunk = [];
|
|
10915
10840
|
let chunkNumber = 0;
|
|
@@ -10917,23 +10842,23 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10917
10842
|
currentChunk.push(result);
|
|
10918
10843
|
if (currentChunk.length >= resultsPerChunk) {
|
|
10919
10844
|
chunkNumber++;
|
|
10920
|
-
|
|
10845
|
+
logger.debug(`Sending chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10921
10846
|
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10922
10847
|
currentChunk = [];
|
|
10923
10848
|
}
|
|
10924
10849
|
}
|
|
10925
10850
|
if (currentChunk.length > 0) {
|
|
10926
10851
|
chunkNumber++;
|
|
10927
|
-
|
|
10852
|
+
logger.debug(`Sending final chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10928
10853
|
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10929
10854
|
}
|
|
10930
|
-
|
|
10855
|
+
logger.debug(`Sharing complete. Total chunks sent: ${chunkNumber}, Total results: ${totalSent}`);
|
|
10931
10856
|
return evalId;
|
|
10932
10857
|
} catch (e) {
|
|
10933
10858
|
if (progressBar) progressBar.stop();
|
|
10934
|
-
|
|
10859
|
+
logger.error(`Upload failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
10935
10860
|
if (evalId) {
|
|
10936
|
-
|
|
10861
|
+
logger.info(`Upload failed, rolling back...`);
|
|
10937
10862
|
await rollbackEval(url, evalId, headers);
|
|
10938
10863
|
}
|
|
10939
10864
|
return null;
|
|
@@ -10959,7 +10884,7 @@ function stripAuthFromUrl(urlString) {
|
|
|
10959
10884
|
url.password = "";
|
|
10960
10885
|
return url.toString();
|
|
10961
10886
|
} catch {
|
|
10962
|
-
|
|
10887
|
+
logger.warn("Failed to parse URL, returning original");
|
|
10963
10888
|
return urlString;
|
|
10964
10889
|
}
|
|
10965
10890
|
}
|
|
@@ -11002,26 +10927,25 @@ async function getShareableUrl(eval_, remoteEvalId, showAuth = false) {
|
|
|
11002
10927
|
async function createShareableUrl(evalRecord, options = {}) {
|
|
11003
10928
|
const { silent = false, showAuth = false } = options;
|
|
11004
10929
|
if (getEnvBool("PROMPTFOO_DISABLE_SHARING")) {
|
|
11005
|
-
|
|
10930
|
+
logger.debug("Sharing is explicitly disabled, returning null");
|
|
11006
10931
|
return null;
|
|
11007
10932
|
}
|
|
11008
10933
|
if (!silent) {
|
|
11009
10934
|
const orgContext = await getOrgContext();
|
|
11010
10935
|
if (orgContext) {
|
|
11011
10936
|
const teamSuffix = orgContext.teamName ? ` > ${orgContext.teamName}` : "";
|
|
11012
|
-
|
|
10937
|
+
logger.info(`${chalk.dim("Sharing to:")} ${chalk.cyan(orgContext.organizationName)}${teamSuffix}`);
|
|
11013
10938
|
}
|
|
11014
10939
|
}
|
|
11015
10940
|
await handleEmailCollection(evalRecord);
|
|
11016
10941
|
const { url } = await getApiConfig(evalRecord);
|
|
11017
10942
|
const canUseNewResults = cloudConfig.isEnabled();
|
|
11018
|
-
|
|
10943
|
+
logger.debug(`Sharing with ${url} canUseNewResults: ${canUseNewResults} Use old results: ${evalRecord.useOldResults()}`);
|
|
11019
10944
|
const evalId = await sendChunkedResults(evalRecord, url, { silent });
|
|
11020
10945
|
if (!evalId) return null;
|
|
11021
|
-
|
|
10946
|
+
logger.debug(`New eval ID on remote instance: ${evalId}`);
|
|
11022
10947
|
return getShareableUrl(evalRecord, evalId, showAuth);
|
|
11023
10948
|
}
|
|
11024
|
-
|
|
11025
10949
|
//#endregion
|
|
11026
10950
|
//#region src/table.ts
|
|
11027
10951
|
function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
@@ -11042,7 +10966,6 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
11042
10966
|
})]);
|
|
11043
10967
|
return table.toString();
|
|
11044
10968
|
}
|
|
11045
|
-
|
|
11046
10969
|
//#endregion
|
|
11047
10970
|
//#region src/util/config/default.ts
|
|
11048
10971
|
/**
|
|
@@ -11082,7 +11005,6 @@ async function loadDefaultConfig(dir, configName = "promptfooconfig") {
|
|
|
11082
11005
|
function clearConfigCache() {
|
|
11083
11006
|
configCache.clear();
|
|
11084
11007
|
}
|
|
11085
|
-
|
|
11086
11008
|
//#endregion
|
|
11087
11009
|
//#region src/util/sharing.ts
|
|
11088
11010
|
/**
|
|
@@ -11110,7 +11032,6 @@ function shouldShareResults(opts) {
|
|
|
11110
11032
|
const sharing = cloudConfig.getSharing();
|
|
11111
11033
|
return cloudConfig.isEnabled() && sharing !== false;
|
|
11112
11034
|
}
|
|
11113
|
-
|
|
11114
11035
|
//#endregion
|
|
11115
11036
|
//#region src/util/formatDuration.ts
|
|
11116
11037
|
/**
|
|
@@ -11130,7 +11051,6 @@ function formatDuration(seconds) {
|
|
|
11130
11051
|
result += `${remainingSeconds}s`;
|
|
11131
11052
|
return result;
|
|
11132
11053
|
}
|
|
11133
|
-
|
|
11134
11054
|
//#endregion
|
|
11135
11055
|
//#region src/commands/eval/summary.ts
|
|
11136
11056
|
/**
|
|
@@ -11282,7 +11202,6 @@ function generateEvalSummary(params) {
|
|
|
11282
11202
|
lines.push("");
|
|
11283
11203
|
return lines;
|
|
11284
11204
|
}
|
|
11285
|
-
|
|
11286
11205
|
//#endregion
|
|
11287
11206
|
//#region src/commands/retry.ts
|
|
11288
11207
|
/**
|
|
@@ -11298,7 +11217,7 @@ async function getErrorResultIds(evalId) {
|
|
|
11298
11217
|
async function deleteErrorResults(resultIds) {
|
|
11299
11218
|
if (resultIds.length === 0) return;
|
|
11300
11219
|
await getDb().delete(evalResultsTable).where(inArray(evalResultsTable.id, resultIds));
|
|
11301
|
-
|
|
11220
|
+
logger.debug(`Deleted ${resultIds.length} error results from database`);
|
|
11302
11221
|
}
|
|
11303
11222
|
const RECALCULATE_BATCH_SIZE = 1e3;
|
|
11304
11223
|
/**
|
|
@@ -11306,7 +11225,7 @@ const RECALCULATE_BATCH_SIZE = 1e3;
|
|
|
11306
11225
|
* Uses streaming batched iteration to avoid OOM with large evaluations (40K+ results).
|
|
11307
11226
|
*/
|
|
11308
11227
|
async function recalculatePromptMetrics(evalRecord) {
|
|
11309
|
-
|
|
11228
|
+
logger.debug("Recalculating prompt metrics after deleting ERROR results");
|
|
11310
11229
|
const startTime = Date.now();
|
|
11311
11230
|
let batchNumber = 0;
|
|
11312
11231
|
let totalProcessed = 0;
|
|
@@ -11328,12 +11247,12 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11328
11247
|
try {
|
|
11329
11248
|
for await (const batch of evalRecord.fetchResultsBatched(RECALCULATE_BATCH_SIZE)) {
|
|
11330
11249
|
batchNumber++;
|
|
11331
|
-
|
|
11250
|
+
logger.debug(`Processing batch ${batchNumber} with ${batch.length} results`);
|
|
11332
11251
|
for (const result of batch) {
|
|
11333
11252
|
currentResultId = result.id;
|
|
11334
11253
|
const metrics = promptMetricsMap.get(result.promptIdx);
|
|
11335
11254
|
if (!metrics) {
|
|
11336
|
-
|
|
11255
|
+
logger.debug(`Skipping result with invalid promptIdx: ${result.promptIdx}`, {
|
|
11337
11256
|
resultId: result.id,
|
|
11338
11257
|
evalId: evalRecord.id
|
|
11339
11258
|
});
|
|
@@ -11367,7 +11286,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11367
11286
|
totalProcessed += batch.length;
|
|
11368
11287
|
}
|
|
11369
11288
|
} catch (error) {
|
|
11370
|
-
|
|
11289
|
+
logger.error("Error during batched metrics recalculation", {
|
|
11371
11290
|
phase: "calculation",
|
|
11372
11291
|
batchNumber,
|
|
11373
11292
|
totalProcessed,
|
|
@@ -11381,7 +11300,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11381
11300
|
if (evalRecord.persisted) try {
|
|
11382
11301
|
await evalRecord.addPrompts(evalRecord.prompts);
|
|
11383
11302
|
} catch (error) {
|
|
11384
|
-
|
|
11303
|
+
logger.error("Error saving recalculated prompt metrics", {
|
|
11385
11304
|
phase: "save",
|
|
11386
11305
|
evalId: evalRecord.id,
|
|
11387
11306
|
promptCount: evalRecord.prompts.length,
|
|
@@ -11390,19 +11309,18 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11390
11309
|
throw error;
|
|
11391
11310
|
}
|
|
11392
11311
|
const durationMs = Date.now() - startTime;
|
|
11393
|
-
|
|
11312
|
+
logger.debug("Prompt metrics recalculation completed", {
|
|
11394
11313
|
totalBatches: batchNumber,
|
|
11395
11314
|
totalResults: totalProcessed,
|
|
11396
11315
|
durationMs
|
|
11397
11316
|
});
|
|
11398
11317
|
}
|
|
11399
|
-
|
|
11400
11318
|
//#endregion
|
|
11401
11319
|
//#region src/commands/share.ts
|
|
11402
11320
|
function notCloudEnabledShareInstructions() {
|
|
11403
11321
|
const cloudUrl = getDefaultShareViewBaseUrl();
|
|
11404
11322
|
const welcomeUrl = `${cloudUrl}/welcome`;
|
|
11405
|
-
|
|
11323
|
+
logger.info(dedent`
|
|
11406
11324
|
|
|
11407
11325
|
» You need to have a cloud account to securely share your results.
|
|
11408
11326
|
|
|
@@ -11411,10 +11329,7 @@ function notCloudEnabledShareInstructions() {
|
|
|
11411
11329
|
3. Run ${chalk.greenBright.bold("promptfoo share")}
|
|
11412
11330
|
`);
|
|
11413
11331
|
}
|
|
11414
|
-
|
|
11415
|
-
//#endregion
|
|
11416
|
-
//#region src/commands/eval.ts
|
|
11417
|
-
const EvalCommandSchema = CommandLineOptionsSchema.extend({
|
|
11332
|
+
CommandLineOptionsSchema.extend({
|
|
11418
11333
|
help: z.boolean().optional(),
|
|
11419
11334
|
interactiveProviders: z.boolean().optional(),
|
|
11420
11335
|
remote: z.boolean().optional(),
|
|
@@ -11424,7 +11339,7 @@ const EvalCommandSchema = CommandLineOptionsSchema.extend({
|
|
|
11424
11339
|
resume: z.union([z.string(), z.boolean()]).optional()
|
|
11425
11340
|
}).partial();
|
|
11426
11341
|
function showRedteamProviderLabelMissingWarning(testSuite) {
|
|
11427
|
-
if (testSuite.providers.some((p) => !p.label))
|
|
11342
|
+
if (testSuite.providers.some((p) => !p.label)) logger.warn(dedent`
|
|
11428
11343
|
${chalk.bold.yellow("Warning")}: Your target (provider) does not have a label specified.
|
|
11429
11344
|
|
|
11430
11345
|
Labels are used to uniquely identify redteam targets. Please set a meaningful and unique label (e.g., 'helpdesk-search-agent') for your targets/providers in your redteam config.
|
|
@@ -11455,7 +11370,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11455
11370
|
}
|
|
11456
11371
|
const runEvaluation = async (initialization) => {
|
|
11457
11372
|
const startTime = Date.now();
|
|
11458
|
-
|
|
11373
|
+
telemetry.record("command_used", {
|
|
11459
11374
|
name: "eval - started",
|
|
11460
11375
|
watch: Boolean(cmdObj.watch),
|
|
11461
11376
|
...Boolean(config?.redteam) && { isRedteam: true }
|
|
@@ -11476,13 +11391,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11476
11391
|
...defaultConfig,
|
|
11477
11392
|
...dirConfig
|
|
11478
11393
|
};
|
|
11479
|
-
} else
|
|
11394
|
+
} else logger.warn(`No configuration file found in directory: ${configPath}. Looked for promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}. Run "${promptfooCommand("init")}" or pass --config path/to/promptfooconfig.yaml.`);
|
|
11480
11395
|
}
|
|
11481
11396
|
}
|
|
11482
11397
|
const resumeRaw = cmdObj.resume;
|
|
11483
11398
|
const retryErrors = cmdObj.retryErrors;
|
|
11484
11399
|
if (resumeRaw && retryErrors) {
|
|
11485
|
-
|
|
11400
|
+
logger.error(chalk.red("Cannot use --resume and --retry-errors together. Please use one or the other."));
|
|
11486
11401
|
process.exitCode = 1;
|
|
11487
11402
|
return new Eval({}, { persisted: false });
|
|
11488
11403
|
}
|
|
@@ -11490,45 +11405,45 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11490
11405
|
const resumeId = resumeRaw === true || resumeRaw === void 0 ? "latest" : resumeRaw;
|
|
11491
11406
|
if (resumeRaw) {
|
|
11492
11407
|
if (cmdObj.write === false) {
|
|
11493
|
-
|
|
11408
|
+
logger.error(chalk.red("Cannot use --resume with --no-write. Resume functionality requires database persistence."));
|
|
11494
11409
|
process.exitCode = 1;
|
|
11495
11410
|
return new Eval({}, { persisted: false });
|
|
11496
11411
|
}
|
|
11497
11412
|
resumeEval = resumeId === "latest" ? await Eval.latest() : await Eval.findById(resumeId);
|
|
11498
11413
|
if (!resumeEval) {
|
|
11499
|
-
|
|
11414
|
+
logger.error(`Could not find evaluation to resume: ${resumeId}`);
|
|
11500
11415
|
process.exitCode = 1;
|
|
11501
11416
|
return new Eval({}, { persisted: false });
|
|
11502
11417
|
}
|
|
11503
|
-
|
|
11418
|
+
logger.info(chalk.cyan(`Resuming evaluation ${resumeEval.id}...`));
|
|
11504
11419
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11505
11420
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
11506
11421
|
raw: p.raw,
|
|
11507
11422
|
label: p.label,
|
|
11508
11423
|
config: p.config
|
|
11509
11424
|
}));
|
|
11510
|
-
|
|
11425
|
+
state.resume = true;
|
|
11511
11426
|
} else if (retryErrors) {
|
|
11512
11427
|
if (cmdObj.write === false) {
|
|
11513
|
-
|
|
11428
|
+
logger.error(chalk.red("Cannot use --retry-errors with --no-write. Retry functionality requires database persistence."));
|
|
11514
11429
|
process.exitCode = 1;
|
|
11515
11430
|
return new Eval({}, { persisted: false });
|
|
11516
11431
|
}
|
|
11517
|
-
|
|
11432
|
+
logger.info("🔄 Retrying ERROR results from latest evaluation...");
|
|
11518
11433
|
const latestEval = await Eval.latest();
|
|
11519
11434
|
if (!latestEval) {
|
|
11520
|
-
|
|
11435
|
+
logger.error("No previous evaluation found to retry errors from");
|
|
11521
11436
|
process.exitCode = 1;
|
|
11522
11437
|
return new Eval({}, { persisted: false });
|
|
11523
11438
|
}
|
|
11524
11439
|
const errorResultIds = await getErrorResultIds(latestEval.id);
|
|
11525
11440
|
if (errorResultIds.length === 0) {
|
|
11526
|
-
|
|
11441
|
+
logger.info("✅ No ERROR results found in the latest evaluation");
|
|
11527
11442
|
return latestEval;
|
|
11528
11443
|
}
|
|
11529
|
-
|
|
11530
|
-
|
|
11531
|
-
|
|
11444
|
+
logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
|
|
11445
|
+
state._retryErrorResultIds = errorResultIds;
|
|
11446
|
+
logger.info(`🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`);
|
|
11532
11447
|
resumeEval = latestEval;
|
|
11533
11448
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11534
11449
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
@@ -11536,20 +11451,20 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11536
11451
|
label: p.label,
|
|
11537
11452
|
config: p.config
|
|
11538
11453
|
}));
|
|
11539
|
-
|
|
11540
|
-
|
|
11454
|
+
state.resume = true;
|
|
11455
|
+
state.retryMode = true;
|
|
11541
11456
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
11542
11457
|
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
11543
|
-
|
|
11458
|
+
logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
11544
11459
|
setupEnv(commandLineOptions.envPath);
|
|
11545
11460
|
}
|
|
11546
|
-
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0))
|
|
11461
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) logger.warn(chalk.yellow(dedent`
|
|
11547
11462
|
Warning: Config file has a redteam section but no test cases.
|
|
11548
11463
|
Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
|
|
11549
11464
|
`));
|
|
11550
11465
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
11551
11466
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
11552
|
-
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app"))
|
|
11467
|
+
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
11553
11468
|
}
|
|
11554
11469
|
if (config.evaluateOptions) evaluateOptions = {
|
|
11555
11470
|
...evaluateOptions,
|
|
@@ -11563,25 +11478,25 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11563
11478
|
const persisted = resumeEval?.runtimeOptions || config.evaluateOptions || {};
|
|
11564
11479
|
repeat = Number.isSafeInteger(persisted.repeat || 0) && persisted.repeat > 0 ? persisted.repeat : 1;
|
|
11565
11480
|
cache = persisted.cache ?? true;
|
|
11566
|
-
maxConcurrency = persisted.maxConcurrency ??
|
|
11481
|
+
maxConcurrency = persisted.maxConcurrency ?? 4;
|
|
11567
11482
|
delay = persisted.delay ?? 0;
|
|
11568
11483
|
} else {
|
|
11569
11484
|
const iterations = cmdObj.repeat ?? commandLineOptions?.repeat ?? evaluateOptions.repeat ?? NaN;
|
|
11570
11485
|
repeat = Number.isSafeInteger(iterations) && iterations > 0 ? iterations : 1;
|
|
11571
11486
|
cache = cmdObj.cache ?? commandLineOptions?.cache ?? evaluateOptions.cache ?? true;
|
|
11572
|
-
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ??
|
|
11487
|
+
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ?? 4;
|
|
11573
11488
|
delay = cmdObj.delay ?? commandLineOptions?.delay ?? evaluateOptions.delay ?? 0;
|
|
11574
11489
|
}
|
|
11575
11490
|
if (cache === false || repeat > 1) {
|
|
11576
|
-
|
|
11491
|
+
logger.info("Cache is disabled.");
|
|
11577
11492
|
disableCache();
|
|
11578
11493
|
}
|
|
11579
11494
|
const explicitMaxConcurrency = resumeRaw ? (resumeEval?.runtimeOptions)?.maxConcurrency ?? cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency : cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency;
|
|
11580
11495
|
if (delay > 0) {
|
|
11581
11496
|
maxConcurrency = 1;
|
|
11582
|
-
|
|
11583
|
-
|
|
11584
|
-
} else if (explicitMaxConcurrency !== void 0)
|
|
11497
|
+
state.maxConcurrency = 1;
|
|
11498
|
+
logger.info(`Running at concurrency=1 because ${delay}ms delay was requested between API calls`);
|
|
11499
|
+
} else if (explicitMaxConcurrency !== void 0) state.maxConcurrency = explicitMaxConcurrency;
|
|
11585
11500
|
if (!resumeEval) {
|
|
11586
11501
|
const filterOptions = {
|
|
11587
11502
|
failing: cmdObj.filterFailing,
|
|
@@ -11598,17 +11513,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11598
11513
|
let hasValidEmail = false;
|
|
11599
11514
|
while (!hasValidEmail) {
|
|
11600
11515
|
const { emailNeedsValidation } = await promptForEmailUnverified();
|
|
11601
|
-
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
11516
|
+
hasValidEmail = await checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
11602
11517
|
}
|
|
11603
11518
|
}
|
|
11604
11519
|
if (!resumeEval) testSuite.providers = filterProviders(testSuite.providers, cmdObj.filterProviders || cmdObj.filterTargets);
|
|
11605
11520
|
const missingApiKeys = checkProviderApiKeys(testSuite.providers);
|
|
11606
11521
|
if (missingApiKeys.size > 0) {
|
|
11607
|
-
for (const [envVar, providerIds] of missingApiKeys)
|
|
11608
|
-
|
|
11609
|
-
|
|
11610
|
-
for (const envVar of missingApiKeys.keys())
|
|
11611
|
-
|
|
11522
|
+
for (const [envVar, providerIds] of missingApiKeys) logger.error(chalk.red(` ✗ Missing ${envVar} (${providerIds.join(", ")})`));
|
|
11523
|
+
logger.error("");
|
|
11524
|
+
logger.error(`To fix, set the environment variable or use ${chalk.bold("--env-file")}:`);
|
|
11525
|
+
for (const envVar of missingApiKeys.keys()) logger.error(` export ${envVar}=your-api-key-here`);
|
|
11526
|
+
logger.error("");
|
|
11612
11527
|
process.exitCode = 1;
|
|
11613
11528
|
return new Eval({}, { persisted: false });
|
|
11614
11529
|
}
|
|
@@ -11625,12 +11540,12 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11625
11540
|
if (typeof testSuite.defaultTest === "string") testSuite.defaultTest = {};
|
|
11626
11541
|
testSuite.defaultTest = testSuite.defaultTest || {};
|
|
11627
11542
|
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
|
|
11628
|
-
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader, { basePath:
|
|
11629
|
-
if (
|
|
11630
|
-
if (typeof
|
|
11631
|
-
|
|
11632
|
-
|
|
11633
|
-
|
|
11543
|
+
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader, { basePath: state.basePath });
|
|
11544
|
+
if (state.config) {
|
|
11545
|
+
if (typeof state.config.defaultTest === "string") state.config.defaultTest = {};
|
|
11546
|
+
state.config.defaultTest = state.config.defaultTest || {};
|
|
11547
|
+
state.config.defaultTest.options = state.config.defaultTest.options || {};
|
|
11548
|
+
state.config.defaultTest.options.provider = testSuite.defaultTest.options.provider;
|
|
11634
11549
|
}
|
|
11635
11550
|
}
|
|
11636
11551
|
if (!resumeEval && cmdObj.var) {
|
|
@@ -11648,7 +11563,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11648
11563
|
}
|
|
11649
11564
|
for (const scenario of testSuite.scenarios || []) if (scenario.tests) scenario.tests = await maybeLoadFromExternalFile(scenario.tests);
|
|
11650
11565
|
const testSuiteSchema = TestSuiteSchema.safeParse(testSuite);
|
|
11651
|
-
if (!testSuiteSchema.success)
|
|
11566
|
+
if (!testSuiteSchema.success) logger.warn(chalk.yellow(dedent`
|
|
11652
11567
|
TestSuite Schema Validation Error:
|
|
11653
11568
|
|
|
11654
11569
|
${z.prettifyError(testSuiteSchema.error)}
|
|
@@ -11681,13 +11596,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11681
11596
|
clearTimeout(forceExitTimeout);
|
|
11682
11597
|
forceExitTimeout = void 0;
|
|
11683
11598
|
}
|
|
11684
|
-
|
|
11599
|
+
logger.warn("Force exiting...");
|
|
11685
11600
|
process.exit(130);
|
|
11686
11601
|
}
|
|
11687
|
-
|
|
11602
|
+
logger.info(chalk.yellow("Pausing evaluation... Press Ctrl+C again to force exit."));
|
|
11688
11603
|
abortController.abort();
|
|
11689
11604
|
forceExitTimeout = setTimeout(() => {
|
|
11690
|
-
|
|
11605
|
+
logger.warn("Evaluation shutdown timed out, force exiting...");
|
|
11691
11606
|
process.exit(130);
|
|
11692
11607
|
}, 1e4).unref();
|
|
11693
11608
|
};
|
|
@@ -11701,27 +11616,27 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11701
11616
|
abortSignal: evaluateOptions.abortSignal,
|
|
11702
11617
|
isRedteam: Boolean(config.redteam)
|
|
11703
11618
|
});
|
|
11704
|
-
if (retryErrors &&
|
|
11705
|
-
const errorResultIds =
|
|
11619
|
+
if (retryErrors && state._retryErrorResultIds && !paused) {
|
|
11620
|
+
const errorResultIds = state._retryErrorResultIds;
|
|
11706
11621
|
try {
|
|
11707
11622
|
await deleteErrorResults(errorResultIds);
|
|
11708
11623
|
await recalculatePromptMetrics(ret);
|
|
11709
|
-
|
|
11624
|
+
logger.debug(`Cleaned up ${errorResultIds.length} old ERROR results after successful retry`);
|
|
11710
11625
|
} catch (cleanupError) {
|
|
11711
|
-
|
|
11626
|
+
logger.warn("Post-retry cleanup had issues. Retry results are saved.", { error: cleanupError });
|
|
11712
11627
|
} finally {
|
|
11713
|
-
delete
|
|
11714
|
-
|
|
11628
|
+
delete state._retryErrorResultIds;
|
|
11629
|
+
state.retryMode = false;
|
|
11715
11630
|
}
|
|
11716
11631
|
}
|
|
11717
11632
|
} finally {
|
|
11718
11633
|
cleanupHandler();
|
|
11719
11634
|
}
|
|
11720
|
-
|
|
11635
|
+
state.resume = false;
|
|
11721
11636
|
if (paused && cmdObj.write !== false) {
|
|
11722
11637
|
printBorder();
|
|
11723
|
-
|
|
11724
|
-
|
|
11638
|
+
logger.info(`${chalk.yellow("⏸")} Evaluation paused. ID: ${chalk.cyan(evalRecord.id)}`);
|
|
11639
|
+
logger.info(`» Resume with: ${chalk.green.bold("promptfoo eval --resume " + evalRecord.id)}`);
|
|
11725
11640
|
printBorder();
|
|
11726
11641
|
return ret;
|
|
11727
11642
|
}
|
|
@@ -11734,8 +11649,8 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11734
11649
|
});
|
|
11735
11650
|
const hasExplicitDisable = cmdObj.share === false || cmdObj.noShare === true || getEnvBool("PROMPTFOO_DISABLE_SHARING");
|
|
11736
11651
|
const canShareEval = isSharingEnabled(evalRecord);
|
|
11737
|
-
|
|
11738
|
-
|
|
11652
|
+
logger.debug(`Wants to share: ${wantsToShare}`);
|
|
11653
|
+
logger.debug(`Can share eval: ${canShareEval}`);
|
|
11739
11654
|
const willShare = wantsToShare && canShareEval;
|
|
11740
11655
|
let sharePromise = null;
|
|
11741
11656
|
if (willShare) sharePromise = createShareableUrl(evalRecord, { silent: true });
|
|
@@ -11754,13 +11669,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11754
11669
|
if (cmdObj.table && getLogLevel() !== "debug" && totalTests < 500) {
|
|
11755
11670
|
const table = await evalRecord.getTable();
|
|
11756
11671
|
const outputTable = generateTable(table);
|
|
11757
|
-
|
|
11672
|
+
logger.info("\n" + outputTable.toString());
|
|
11758
11673
|
if (table.body.length > 25) {
|
|
11759
11674
|
const rowsLeft = table.body.length - 25;
|
|
11760
|
-
|
|
11675
|
+
logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? "" : "s"} not shown ...\n`);
|
|
11761
11676
|
}
|
|
11762
|
-
} else if (failures !== 0)
|
|
11763
|
-
if (totalTests >= 500)
|
|
11677
|
+
} else if (failures !== 0) logger.debug(`At least one evaluation failure occurred. This might be caused by the underlying call to the provider, or a test failure. Context: \n${JSON.stringify(evalRecord.prompts)}`);
|
|
11678
|
+
if (totalTests >= 500) logger.info("Skipping table output because there are more than 500 tests.");
|
|
11764
11679
|
const { outputPath } = config;
|
|
11765
11680
|
const paths = (Array.isArray(outputPath) ? outputPath : [outputPath]).filter((p) => typeof p === "string" && p.length > 0 && !p.endsWith(".jsonl"));
|
|
11766
11681
|
const isRedteam = Boolean(config.redteam);
|
|
@@ -11786,13 +11701,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11786
11701
|
targetErrorStatus
|
|
11787
11702
|
});
|
|
11788
11703
|
if (cmdObj.write && wantsToShare && !canShareEval) {
|
|
11789
|
-
|
|
11704
|
+
logger.info(summaryLines[0]);
|
|
11790
11705
|
notCloudEnabledShareInstructions();
|
|
11791
11706
|
for (let i = 1; i < summaryLines.length; i++) if (summaryLines[i].includes("View results:")) {
|
|
11792
11707
|
while (i < summaryLines.length && !summaryLines[i].includes("Total Tokens:")) i++;
|
|
11793
11708
|
i--;
|
|
11794
|
-
} else
|
|
11795
|
-
} else for (const line of summaryLines)
|
|
11709
|
+
} else logger.info(summaryLines[i]);
|
|
11710
|
+
} else for (const line of summaryLines) logger.info(line);
|
|
11796
11711
|
let shareableUrl = null;
|
|
11797
11712
|
if (sharePromise != null) {
|
|
11798
11713
|
const orgContext = await getOrgContext();
|
|
@@ -11811,24 +11726,24 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11811
11726
|
} else spinner.fail(chalk.red("Share failed"));
|
|
11812
11727
|
} catch (error) {
|
|
11813
11728
|
spinner.fail(chalk.red("Share failed"));
|
|
11814
|
-
|
|
11729
|
+
logger.debug(`Share error: ${error}`);
|
|
11815
11730
|
}
|
|
11816
11731
|
} else try {
|
|
11817
11732
|
shareableUrl = await sharePromise;
|
|
11818
11733
|
if (shareableUrl) {
|
|
11819
11734
|
evalRecord.shared = true;
|
|
11820
|
-
|
|
11735
|
+
logger.info(`${chalk.dim("»")} ${chalk.green("✓")} ${shareableUrl}`);
|
|
11821
11736
|
}
|
|
11822
11737
|
} catch (error) {
|
|
11823
|
-
|
|
11738
|
+
logger.debug(`Share error: ${error}`);
|
|
11824
11739
|
}
|
|
11825
11740
|
}
|
|
11826
|
-
|
|
11741
|
+
logger.debug(`Shareable URL: ${shareableUrl}`);
|
|
11827
11742
|
if (paths.length) {
|
|
11828
11743
|
await writeMultipleOutputs(paths, evalRecord, shareableUrl);
|
|
11829
|
-
|
|
11744
|
+
logger.info(chalk.yellow(`Writing output to ${paths.join(", ")}`));
|
|
11830
11745
|
}
|
|
11831
|
-
|
|
11746
|
+
telemetry.record("command_used", {
|
|
11832
11747
|
name: "eval",
|
|
11833
11748
|
watch: Boolean(cmdObj.watch),
|
|
11834
11749
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
@@ -11838,7 +11753,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11838
11753
|
if (initialization) {
|
|
11839
11754
|
const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean);
|
|
11840
11755
|
if (!configPaths.length) {
|
|
11841
|
-
|
|
11756
|
+
logger.error(`Could not locate config file(s) to watch. Pass --config path/to/promptfooconfig.yaml or run from a directory containing promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}.`);
|
|
11842
11757
|
process.exitCode = 1;
|
|
11843
11758
|
return ret;
|
|
11844
11759
|
}
|
|
@@ -11868,17 +11783,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11868
11783
|
persistent: true
|
|
11869
11784
|
}).on("change", async (path) => {
|
|
11870
11785
|
printBorder();
|
|
11871
|
-
|
|
11786
|
+
logger.info(`File change detected: ${path}`);
|
|
11872
11787
|
printBorder();
|
|
11873
11788
|
clearConfigCache();
|
|
11874
11789
|
await runEvaluation();
|
|
11875
|
-
}).on("error", (error) =>
|
|
11790
|
+
}).on("error", (error) => logger.error(`Watcher error: ${error}`)).on("ready", () => watchPaths.forEach((watchPath) => logger.info(`Watching for file changes on ${watchPath} ...`)));
|
|
11876
11791
|
}
|
|
11877
11792
|
} else {
|
|
11878
11793
|
const passRateThreshold = getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD", 100);
|
|
11879
11794
|
const failedTestExitCode = getEnvInt("PROMPTFOO_FAILED_TEST_EXIT_CODE", 100);
|
|
11880
11795
|
if (passRate < (Number.isFinite(passRateThreshold) ? passRateThreshold : 100)) {
|
|
11881
|
-
if (getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0)
|
|
11796
|
+
if (getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) logger.info(chalk.white(`Pass rate ${chalk.red.bold(passRate.toFixed(2))}${chalk.red("%")} is below the threshold of ${chalk.red.bold(passRateThreshold)}${chalk.red("%")}`));
|
|
11882
11797
|
process.exitCode = Number.isSafeInteger(failedTestExitCode) ? failedTestExitCode : 100;
|
|
11883
11798
|
return ret;
|
|
11884
11799
|
}
|
|
@@ -11894,7 +11809,6 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11894
11809
|
};
|
|
11895
11810
|
return await runEvaluation(true);
|
|
11896
11811
|
}
|
|
11897
|
-
|
|
11898
11812
|
//#endregion
|
|
11899
11813
|
//#region src/util/verboseToggle.ts
|
|
11900
11814
|
let isVerboseToggleEnabled = false;
|
|
@@ -11957,7 +11871,6 @@ function initVerboseToggle() {
|
|
|
11957
11871
|
function disableVerboseToggle() {
|
|
11958
11872
|
if (cleanupFn) cleanupFn();
|
|
11959
11873
|
}
|
|
11960
|
-
|
|
11961
11874
|
//#endregion
|
|
11962
11875
|
//#region src/redteam/shared.ts
|
|
11963
11876
|
async function doRedteamRun(options) {
|
|
@@ -11974,13 +11887,13 @@ async function doRedteamRun(options) {
|
|
|
11974
11887
|
try {
|
|
11975
11888
|
const healthUrl = getRemoteHealthUrl();
|
|
11976
11889
|
if (healthUrl) {
|
|
11977
|
-
|
|
11890
|
+
logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
11978
11891
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
11979
11892
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with redteam: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
11980
|
-
|
|
11893
|
+
logger.debug("API health check passed");
|
|
11981
11894
|
}
|
|
11982
11895
|
} catch (error) {
|
|
11983
|
-
|
|
11896
|
+
logger.warn(`API health check failed with error: ${error}.\nPlease check your API configuration or try again later.`);
|
|
11984
11897
|
}
|
|
11985
11898
|
if (options.liveRedteamConfig) {
|
|
11986
11899
|
const filename = `redteam-${Date.now()}.yaml`;
|
|
@@ -11990,10 +11903,10 @@ async function doRedteamRun(options) {
|
|
|
11990
11903
|
fs$1.writeFileSync(tmpFile, yaml.dump(options.liveRedteamConfig));
|
|
11991
11904
|
redteamPath = tmpFile;
|
|
11992
11905
|
configPath = tmpFile;
|
|
11993
|
-
|
|
11994
|
-
|
|
11906
|
+
logger.debug(`Using live config from ${tmpFile}`);
|
|
11907
|
+
logger.debug(`Live config: ${JSON.stringify(options.liveRedteamConfig, null, 2)}`);
|
|
11995
11908
|
}
|
|
11996
|
-
|
|
11909
|
+
logger.info("Generating test cases...");
|
|
11997
11910
|
const { maxConcurrency, ...passThroughOptions } = options;
|
|
11998
11911
|
let redteamConfig;
|
|
11999
11912
|
const generationStartTime = Date.now();
|
|
@@ -12013,7 +11926,7 @@ async function doRedteamRun(options) {
|
|
|
12013
11926
|
});
|
|
12014
11927
|
} catch (error) {
|
|
12015
11928
|
if (error instanceof PartialGenerationError) {
|
|
12016
|
-
|
|
11929
|
+
logger.error(chalk.red("\n" + error.message));
|
|
12017
11930
|
setLogCallback(null);
|
|
12018
11931
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12019
11932
|
throw error;
|
|
@@ -12022,11 +11935,11 @@ async function doRedteamRun(options) {
|
|
|
12022
11935
|
}
|
|
12023
11936
|
const generationDurationMs = Date.now() - generationStartTime;
|
|
12024
11937
|
if (!redteamConfig || !fs$1.existsSync(redteamPath)) {
|
|
12025
|
-
|
|
11938
|
+
logger.info("No test cases generated. Skipping scan.");
|
|
12026
11939
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12027
11940
|
return;
|
|
12028
11941
|
}
|
|
12029
|
-
|
|
11942
|
+
logger.info("Running scan...");
|
|
12030
11943
|
const { defaultConfig } = await loadDefaultConfig();
|
|
12031
11944
|
const { description: _description, ...evalOptions } = options;
|
|
12032
11945
|
const evalResult = await doEval({
|
|
@@ -12048,16 +11961,15 @@ async function doRedteamRun(options) {
|
|
|
12048
11961
|
if (evalResult.persisted) await evalResult.save();
|
|
12049
11962
|
const totalMs = evalResult.durationMs ?? 0;
|
|
12050
11963
|
const evalMs = evalResult.evaluationDurationMs ?? 0;
|
|
12051
|
-
|
|
11964
|
+
logger.info(chalk.gray(`Total scan time: ${formatDuration(totalMs / 1e3)} (generation: ${formatDuration(generationDurationMs / 1e3)}, evaluation: ${formatDuration(evalMs / 1e3)})`));
|
|
12052
11965
|
}
|
|
12053
|
-
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else
|
|
12054
|
-
if (!evalResult?.shared) if (options.liveRedteamConfig)
|
|
12055
|
-
else
|
|
11966
|
+
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else logger.info(chalk.green("\nRed team scan complete!"));
|
|
11967
|
+
if (!evalResult?.shared) if (options.liveRedteamConfig) logger.info(chalk.blue(`To view the results, click the ${chalk.bold("View Report")} button or run ${chalk.bold(promptfooCommand("redteam report"))} on the command line.`));
|
|
11968
|
+
else logger.info(chalk.blue(`To view the results, run ${chalk.bold(promptfooCommand("redteam report"))}`));
|
|
12056
11969
|
setLogCallback(null);
|
|
12057
11970
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12058
11971
|
return evalResult;
|
|
12059
11972
|
}
|
|
12060
|
-
|
|
12061
11973
|
//#endregion
|
|
12062
11974
|
//#region src/index.ts
|
|
12063
11975
|
async function evaluate(testSuite, options = {}) {
|
|
@@ -12082,23 +11994,23 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12082
11994
|
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
12083
11995
|
if (constructedTestSuite.defaultTest?.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
12084
11996
|
env: testSuite.env,
|
|
12085
|
-
basePath:
|
|
11997
|
+
basePath: state.basePath
|
|
12086
11998
|
});
|
|
12087
11999
|
if (constructedTestSuite.defaultTest?.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
12088
12000
|
env: testSuite.env,
|
|
12089
|
-
basePath:
|
|
12001
|
+
basePath: state.basePath
|
|
12090
12002
|
});
|
|
12091
12003
|
}
|
|
12092
12004
|
for (const test of constructedTestSuite.tests || []) {
|
|
12093
12005
|
if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
|
|
12094
12006
|
env: testSuite.env,
|
|
12095
|
-
basePath:
|
|
12007
|
+
basePath: state.basePath
|
|
12096
12008
|
});
|
|
12097
12009
|
if (test.assert) for (const assertion of test.assert) {
|
|
12098
12010
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
12099
12011
|
if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
|
|
12100
12012
|
env: testSuite.env,
|
|
12101
|
-
basePath:
|
|
12013
|
+
basePath: state.basePath
|
|
12102
12014
|
});
|
|
12103
12015
|
}
|
|
12104
12016
|
}
|
|
@@ -12122,12 +12034,12 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12122
12034
|
if (shareableUrl) {
|
|
12123
12035
|
ret.shareableUrl = shareableUrl;
|
|
12124
12036
|
ret.shared = true;
|
|
12125
|
-
|
|
12037
|
+
logger.debug(`Eval shared successfully: ${shareableUrl}`);
|
|
12126
12038
|
}
|
|
12127
12039
|
} catch (error) {
|
|
12128
|
-
|
|
12040
|
+
logger.warn(`Failed to create shareable URL: ${error}`);
|
|
12129
12041
|
}
|
|
12130
|
-
else
|
|
12042
|
+
else logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
12131
12043
|
if (testSuite.outputPath) {
|
|
12132
12044
|
if (typeof testSuite.outputPath === "string") await writeOutput(testSuite.outputPath, evalRecord, null);
|
|
12133
12045
|
else if (Array.isArray(testSuite.outputPath)) await writeMultipleOutputs(testSuite.outputPath, evalRecord, null);
|
|
@@ -12154,11 +12066,11 @@ var src_default = {
|
|
|
12154
12066
|
assertions: assertions_default,
|
|
12155
12067
|
cache: cache_exports,
|
|
12156
12068
|
evaluate,
|
|
12157
|
-
guardrails
|
|
12069
|
+
guardrails,
|
|
12158
12070
|
loadApiProvider,
|
|
12159
12071
|
redteam
|
|
12160
12072
|
};
|
|
12161
|
-
|
|
12162
12073
|
//#endregion
|
|
12163
|
-
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable,
|
|
12074
|
+
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, loadApiProvider, redteam };
|
|
12075
|
+
|
|
12164
12076
|
//# sourceMappingURL=index.js.map
|