promptfoo 0.120.27 → 0.121.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/src/{ListApp-8WOe2nT6.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-BgNJDBE6.js +206 -0
- package/dist/src/{accounts-Fl2J3_Fu.cjs → accounts-Bx-x3bmW.cjs} +77 -78
- package/dist/src/{accounts-DVINui-2.js → accounts-CMqkzrVf.js} +39 -34
- package/dist/src/{accounts-CPDRAMND.js → accounts-xrUGFA6n.js} +38 -33
- package/dist/src/{agentic-utils-D922n6mm.js → agentic-utils-BKIN5PKu.js} +9 -10
- package/dist/src/{agents-BO2n8Z0d.cjs → agents-B0f4HICh.cjs} +37 -40
- package/dist/src/{agents-BXLmVsxR.js → agents-C-dDThPK.js} +37 -37
- package/dist/src/{agents-DgJf2-ez.cjs → agents-CErsqg5U.cjs} +16 -17
- package/dist/src/{agents-BcsN_BgB.js → agents-CVIn-Utx.js} +16 -12
- package/dist/src/{agents-hqgSV-3o.js → agents-CXknwsFX.js} +37 -40
- package/dist/src/{agents-pMfppv9Z.js → agents-DeH4Gu94.js} +18 -18
- package/dist/src/{agents-BdUTAwi-.js → agents-Dy2YpZpa.js} +38 -41
- package/dist/src/{agents-DNvSH78i.js → agents-aF4-T121.js} +16 -20
- package/dist/src/{aimlapi-DOib86oE.js → aimlapi-BAGZDo5G.js} +16 -18
- package/dist/src/{aimlapi-DtgPI0nE.js → aimlapi-BNfTBexL.js} +15 -17
- package/dist/src/{aimlapi-DTPACCB1.js → aimlapi-DHRKlBEA.js} +15 -4
- package/dist/src/{aimlapi-BE_Tg9Fl.cjs → aimlapi-tg0Gkcvr.cjs} +15 -16
- package/dist/src/app/assets/index-BFCZg7hQ.js +439 -0
- package/dist/src/app/index.html +1 -1
- package/dist/src/{audio-BRYU0BFo.js → audio-BRODU0UK.js} +7 -9
- package/dist/src/{audio-Cwo68yZS.cjs → audio-BWeaWovU.cjs} +6 -7
- package/dist/src/{audio-BnRUGAm_.js → audio-CHQ4r-RV.js} +6 -5
- package/dist/src/{audio-MSRki4JU.js → audio-tf_NBjlC.js} +6 -8
- package/dist/src/{base-h961VXYk.js → base-B0tcrnq_.js} +11 -13
- package/dist/src/{base-XB2tDJrB.js → base-B4QJRyFS.js} +11 -13
- package/dist/src/{base-pGVmXNl4.cjs → base-DBtwl2FR.cjs} +36 -38
- package/dist/src/base-fEDN28WM.js +193 -0
- package/dist/src/{blobs-BM_e6hCa.js → blobs-BAU-dXan.js} +9 -12
- package/dist/src/{blobs-CR5C4Ihh.js → blobs-Bpg5rH6i.js} +9 -12
- package/dist/src/{blobs-B-KQAFhX.cjs → blobs-DvS-O6be.cjs} +34 -37
- package/dist/src/blobs-qTYm-1PY.js +236 -0
- package/dist/src/{cache-CIpsoBZR.js → cache-8XhNqPKW.js} +64 -67
- package/dist/src/cache-Bbn1Nyrd.cjs +5 -0
- package/dist/src/cache-BwsMSda7.js +6 -0
- package/dist/src/{cache-jsiwsAJv.js → cache-CG0SlR1d.js} +64 -66
- package/dist/src/{cache-BTVYfbka.cjs → cache-COish3-W.cjs} +114 -117
- package/dist/src/cache-D3eqDYGU.js +739 -0
- package/dist/src/{chat-D31K7C4u.cjs → chat-2K608PeQ.cjs} +20 -21
- package/dist/src/chat-BKm79wib.js +764 -0
- package/dist/src/{chat-B84t99NW.js → chat-CM_kyI8B.js} +20 -9
- package/dist/src/{chat-BcPjZXIp.js → chat-CRWNNq73.js} +41 -44
- package/dist/src/{chat-CcUCysjU.js → chat-CznLWr_D.js} +41 -44
- package/dist/src/{chat-DwWifjxi.js → chat-DHMH-N64.js} +20 -22
- package/dist/src/{chat-BE44YOc6.cjs → chat-DaqekjFr.cjs} +61 -64
- package/dist/src/{chat-DZM2GUHO.js → chat-DxysjBvt.js} +21 -23
- package/dist/src/{chatkit-D67HS_0b.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DAB_qfzI.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-Biqb_wsD.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-PGG4ZYIn.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-SVM6AdBu.js → claude-agent-sdk-BLTu0WBO.js} +31 -31
- package/dist/src/{claude-agent-sdk-C9SiaQub.cjs → claude-agent-sdk-CJH22shf.cjs} +31 -28
- package/dist/src/{claude-agent-sdk-C-IOTPfo.js → claude-agent-sdk-D6_k9FKA.js} +31 -29
- package/dist/src/{claude-agent-sdk-CiluSyW1.js → claude-agent-sdk-Dy5lT-Tx.js} +33 -20
- package/dist/src/{cloud-CZ-q9Ier.js → cloud-Bc9526yV.js} +7 -9
- package/dist/src/cloud-DmE0EwsY.js +4 -0
- package/dist/src/{cloudflare-ai-BahKHyhh.js → cloudflare-ai-C9r2sRhw.js} +16 -18
- package/dist/src/{cloudflare-ai-Dxyt50Nl.js → cloudflare-ai-CWWJCRim.js} +16 -4
- package/dist/src/{cloudflare-ai-Dfahv5SY.cjs → cloudflare-ai-ClWSdor4.cjs} +16 -17
- package/dist/src/{cloudflare-ai-v_qZD6_q.js → cloudflare-ai-ICsOuD-z.js} +17 -19
- package/dist/src/{cloudflare-gateway-BPWoZIzJ.cjs → cloudflare-gateway-C2_-KG5o.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-Bi_FpOFy.js → cloudflare-gateway-D6O7AlYb.js} +23 -23
- package/dist/src/{cloudflare-gateway-btS7h1OZ.js → cloudflare-gateway-D6xFc5pa.js} +21 -25
- package/dist/src/{cloudflare-gateway-C0guUNwk.js → cloudflare-gateway-pXGHxJ47.js} +26 -14
- package/dist/src/{codex-sdk-DSxAnbfT.js → codex-sdk-C6UMlxwV.js} +28 -29
- package/dist/src/{codex-sdk-IYVi9fuM.js → codex-sdk-DUwKWezN.js} +28 -27
- package/dist/src/{codex-sdk-DulY0ZRq.js → codex-sdk-GGAw0qbD.js} +28 -29
- package/dist/src/{codex-sdk-DFKMtAyf.cjs → codex-sdk-fAO0c3yA.cjs} +28 -29
- package/dist/src/{cometapi-DkNBMk0G.js → cometapi-BasUi7-_.js} +17 -19
- package/dist/src/{cometapi-DzrR3SR_.js → cometapi-Bbjp5V4x.js} +16 -4
- package/dist/src/{cometapi-C9EEpJzT.js → cometapi-DkXrKi5z.js} +21 -24
- package/dist/src/{cometapi-DIO64tf4.cjs → cometapi-vY6aDZgo.cjs} +21 -22
- package/dist/src/{completion-CG29bfKX.js → completion-6Mx_iXxK.js} +11 -13
- package/dist/src/{completion-Bgf1VJoq.js → completion-C5rtR_9P.js} +11 -13
- package/dist/src/{completion-CCRT4kX1.cjs → completion-CDOouNzq.cjs} +21 -23
- package/dist/src/completion-C_P3ypkJ.js +120 -0
- package/dist/src/{createHash-Dw_iLu31.js → createHash-CTQmL3G2.js} +2 -3
- package/dist/src/{createHash-CYQy4YeL.cjs → createHash-CfZSc0b4.cjs} +13 -14
- package/dist/src/{createHash-CJcfskIZ.js → createHash-Da8fMwqB.js} +2 -3
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-D-ayp2FW.js → docker-5KcG-_86.js} +18 -20
- package/dist/src/{docker-DNcLR4Ig.cjs → docker-BwsKwxFs.cjs} +18 -19
- package/dist/src/{docker-egERKxCF.js → docker-CZnqU1XV.js} +18 -7
- package/dist/src/{docker-B81N0t4e.js → docker-DzxyDPIj.js} +19 -21
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-B9dPm_BF.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-D2pZ87fL.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-Ct-Joyue.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-17JizQIv.js +15 -0
- package/dist/src/{eval-C-Nr6wX_.js → eval-DmFyWU7i.js} +47 -54
- package/dist/src/{evalResult-4BzI2tmj.js → evalResult-CDQiuUuf.js} +16 -12
- package/dist/src/{evalResult-DXMWJ3sx.js → evalResult-CTG2AHOS.js} +10 -11
- package/dist/src/evalResult-Cqj8pldJ.js +12 -0
- package/dist/src/{evalResult-CX8wQecI.cjs → evalResult-Dap2CekP.cjs} +20 -21
- package/dist/src/evalResult-DvcJAWJU.cjs +10 -0
- package/dist/src/evalResult-Hftn-S_i.js +10 -0
- package/dist/src/evaluator-B2CFNt-P.js +36 -0
- package/dist/src/{evaluator-8aGyV12L.js → evaluator-DPFRbFIL.js} +201 -229
- package/dist/src/{extractor-CD5yKL-G.js → extractor-CFG6bcWJ.js} +22 -24
- package/dist/src/{extractor-C031XmTA.cjs → extractor-DX36oYEv.cjs} +37 -39
- package/dist/src/{extractor-V5x_m1i0.js → extractor-M67RUtg6.js} +22 -24
- package/dist/src/extractor-YMU_Gvt8.js +374 -0
- package/dist/src/{fetch-D3OHf-lV.js → fetch-4M3YRaqL.js} +40 -45
- package/dist/src/fetch-60Gzydls.js +777 -0
- package/dist/src/{fetch-CXZI9RRr.js → fetch-BMv0O527.js} +23 -35
- package/dist/src/{fetch-BmbD-v1L.cjs → fetch-BxUk8odA.cjs} +244 -277
- package/dist/src/fetch-KV5kNASw.js +5 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-CdevI3An.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-Dres3qrN.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/graders-Bu0H9nXi.js +32 -0
- package/dist/src/{graders-DTeBrzWp.js → graders-CHO8EPM4.js} +349 -397
- package/dist/src/graders-Cfhkvx-e.js +34 -0
- package/dist/src/{graders--1y2u9HO.js → graders-CpdqD9PI.js} +349 -397
- package/dist/src/graders-DClJVpGP.cjs +32 -0
- package/dist/src/{graders-DohM2dir.cjs → graders-DOXycdlG.cjs} +684 -732
- package/dist/src/graders-DcnJsrMO.js +32 -0
- package/dist/src/graders-R9rYUM0d.js +13466 -0
- package/dist/src/{image-C3wHC9_h.js → image-BmEZqVmk.js} +9 -10
- package/dist/src/{image-O1u4bCFg.js → image-CBBVXWuT.js} +9 -10
- package/dist/src/{image-DpKl2F15.cjs → image-CDLQOcqT.cjs} +6 -7
- package/dist/src/{image-DmE-niFE.js → image-DJEvKveK.js} +6 -5
- package/dist/src/{image-CuKHuccK.cjs → image-DTedmQPg.cjs} +29 -30
- package/dist/src/{image-B0U4Hqll.js → image-gvmivTEe.js} +7 -9
- package/dist/src/image-pAX56tPG.js +257 -0
- package/dist/src/{image-DNEIf_aI.js → image-tL5hIOFh.js} +6 -8
- package/dist/src/index.cjs +605 -689
- package/dist/src/index.d.cts +11 -7
- package/dist/src/index.d.ts +11 -3
- package/dist/src/index.js +570 -658
- package/dist/src/{interactiveCheck-Bxj1Swex.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-Be_zyW4L.js → knowledgeBase-CLJybhnF.js} +16 -16
- package/dist/src/{knowledgeBase-CEzQobWX.js → knowledgeBase-CoU-UQBg.js} +14 -9
- package/dist/src/{knowledgeBase-BZ41IFwq.js → knowledgeBase-DjWPVqSb.js} +14 -18
- package/dist/src/{knowledgeBase-D-5BMXlr.cjs → knowledgeBase-wkxuRFhA.cjs} +14 -15
- package/dist/src/{litellm-DnbRJ2if.js → litellm-B9Hysuri.js} +16 -18
- package/dist/src/{litellm-CRDqPhNI.js → litellm-CTfa0hqi.js} +15 -17
- package/dist/src/{litellm-hUSNM_M2.cjs → litellm-NYpQ8RQu.cjs} +15 -16
- package/dist/src/{litellm-9vR8zpfU.js → litellm-ePxtr9F1.js} +15 -4
- package/dist/src/{logger-CG1uZPbQ.js → logger-CT3IKMKA.js} +10 -29
- package/dist/src/{logger-B7sBeGa0.cjs → logger-Cp1GPUjj.cjs} +152 -180
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-LSBxlt7a.js → logger-KkObSCzq.js} +13 -31
- package/dist/src/{luma-ray-Hm3d6VJE.cjs → luma-ray-B0GGNRc1.cjs} +20 -21
- package/dist/src/{luma-ray-drvgdpP9.js → luma-ray-BE2mOt6N.js} +20 -13
- package/dist/src/{luma-ray-4blv9iZ2.js → luma-ray-BW9IRGIc.js} +22 -21
- package/dist/src/{luma-ray-B2__8lYH.js → luma-ray-Cm1KZBhs.js} +20 -23
- package/dist/src/main.js +1170 -1321
- package/dist/src/{messages-XhiwCbi4.cjs → messages-1JrJs91T.cjs} +32 -34
- package/dist/src/{messages-CGPPidQr.js → messages-1x9atZmP.js} +22 -24
- package/dist/src/{messages-Uee41Mj5.js → messages-BLbWdsyt.js} +22 -24
- package/dist/src/messages-D8EA0oDc.js +240 -0
- package/dist/src/{meteor-BYykdXrV.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-CsopaHrH.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-e-E-2vVl.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-C8lGP6P4.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/{modelslab-yKz-ZNB4.js → modelslab-C1OLRmVX.js} +17 -10
- package/dist/src/{modelslab-E9gO-bYd.js → modelslab-CqXBy3U8.js} +18 -20
- package/dist/src/{modelslab-lUVW0cmB.cjs → modelslab-DcOSFwKh.cjs} +17 -18
- package/dist/src/{modelslab-ClBkr8_9.js → modelslab-X5-4LroM.js} +17 -19
- package/dist/src/{nova-reel-Dk8jNpId.js → nova-reel-BgS1ZWuK.js} +20 -13
- package/dist/src/{nova-reel-u2eF2Cxm.js → nova-reel-D2ZkOSyr.js} +22 -21
- package/dist/src/{nova-reel-D8CuO6QH.cjs → nova-reel-D9xfaMBs.cjs} +20 -21
- package/dist/src/{nova-reel-P9bwvtYX.js → nova-reel-DihqLeol.js} +20 -23
- package/dist/src/{nova-sonic-Ds1C-dpm.cjs → nova-sonic-DVu3mMIy.cjs} +30 -31
- package/dist/src/{nova-sonic-CK2rAiKi.js → nova-sonic-DezhVUYT.js} +30 -26
- package/dist/src/{nova-sonic-BaqWlkds.js → nova-sonic-P-CdUMlV.js} +30 -31
- package/dist/src/{nova-sonic-yZapPLv7.js → nova-sonic-Q3BOJeig.js} +31 -32
- package/dist/src/{openai-DUFopMrH.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-PblZ3jUE.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CcN1B8Sb.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-A-3_loM7.js → openclaw-BiSZPL7J.js} +20 -14
- package/dist/src/{openclaw-COn6QzDi.js → openclaw-Bv1DINsX.js} +20 -27
- package/dist/src/{openclaw-a3lylB-V.js → openclaw-D1D_ej1z.js} +21 -28
- package/dist/src/{openclaw-B6qqDr_u.cjs → openclaw-DAfWQn-o.cjs} +33 -39
- package/dist/src/opencode-sdk-C7m-wRfI.js +560 -0
- package/dist/src/opencode-sdk-CfaLN8PY.cjs +564 -0
- package/dist/src/opencode-sdk-D95s6SnR.js +562 -0
- package/dist/src/opencode-sdk-DxUPkLT7.js +560 -0
- package/dist/src/{otlpReceiver-oyf5wLGC.js → otlpReceiver--AIRW_S4.js} +53 -51
- package/dist/src/{otlpReceiver-BmmTiMjA.js → otlpReceiver-Bn5wGB1v.js} +53 -55
- package/dist/src/{otlpReceiver-lXsYVbpj.cjs → otlpReceiver-Diec4cln.cjs} +53 -55
- package/dist/src/{otlpReceiver-94URx7UW.js → otlpReceiver-g3ByGaXs.js} +53 -55
- package/dist/src/{providerRegistry-Cq_JK_CJ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-DSSHjMKf.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-CvHEVJad.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B3HvufyI.js +33246 -0
- package/dist/src/{providers-BnFpbY_s.js → providers-BKRJTjBz.js} +1536 -1669
- package/dist/src/providers-C1rOSHiR.js +32 -0
- package/dist/src/{providers-Iil64vk9.js → providers-CFLy1_ji.js} +1543 -1676
- package/dist/src/{providers-DHbjzW2e.cjs → providers-CFu-TZl-.cjs} +1896 -2029
- package/dist/src/providers-CxmDwEFf.cjs +31 -0
- package/dist/src/providers-Dodakqr0.js +30 -0
- package/dist/src/providers-GIQ2TcsA.js +30 -0
- package/dist/src/{pythonUtils-CcT5LH1M.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DBbuI3QJ.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-hZ8LeQLv.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/{quiverai-BuI0tE39.js → quiverai-C2jVwbH1.js} +8 -7
- package/dist/src/{quiverai-DCGSZt4U.js → quiverai-CI6gYJVI.js} +8 -10
- package/dist/src/{quiverai-DiMVJQDz.cjs → quiverai-CLkWkyZc.cjs} +8 -9
- package/dist/src/{quiverai-fQNkExW4.js → quiverai-MHSxbmmZ.js} +9 -11
- package/dist/src/{render-Dj1smHEb.js → render-Drod8m7K.js} +4 -5
- package/dist/src/{responses-DOAFFENS.js → responses-BKqJmhhc.js} +22 -25
- package/dist/src/{responses-CxzoQoBe.js → responses-CGw0DCzh.js} +22 -25
- package/dist/src/responses-jxdehPkC.js +660 -0
- package/dist/src/{responses-ghR3IOfy.cjs → responses-tD4Bd4dc.cjs} +37 -40
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-CwbGmgYN.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-DudlFZed.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-C8MhKGHb.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-gmskuyre.js → sagemaker-BK4Zb993.js} +75 -70
- package/dist/src/{sagemaker-DuM71dVU.js → sagemaker-BfiWTmvn.js} +77 -77
- package/dist/src/{sagemaker-77zbJ2Q2.cjs → sagemaker-CcQHM1jV.cjs} +75 -76
- package/dist/src/{sagemaker-CcxhlOAR.js → sagemaker-D2Q1c-sD.js} +75 -79
- package/dist/src/{scanner-DJYiSXQj.js → scanner-J8CA3LsV.js} +100 -121
- package/dist/src/server/index.js +5505 -67416
- package/dist/src/{server-B5v33lvE.cjs → server-B0PPuDw-.cjs} +57 -67
- package/dist/src/server-B1vi21hA.js +7 -0
- package/dist/src/{server-RV_i_YX5.js → server-BC7XJFgr.js} +19 -24
- package/dist/src/server-Cm9Kai_h.cjs +5 -0
- package/dist/src/{server-BJ4m4f1D.js → server-DbFphssR.js} +26 -29
- package/dist/src/server-OAs3nBRT.js +229 -0
- package/dist/src/{signal-BW33JuId.js → signal-BOTbd53Z.js} +9 -11
- package/dist/src/{slack-DEURelTy.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-BQYeW9L3.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-BB6yuZzp.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-2pRrhhgJ.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-BNmZ1KAz.cjs +5 -0
- package/dist/src/{store-D7CgQzAR.cjs → store-BSc-TF2w.cjs} +44 -45
- package/dist/src/store-BltJg2cd.js +6 -0
- package/dist/src/{store-s3SftUwF.js → store-D1tv90v3.js} +34 -35
- package/dist/src/{store-DJNsD1iC.js → store-DQLEjuEO.js} +40 -36
- package/dist/src/store-Ub2vaGJ1.js +228 -0
- package/dist/src/{tables-DfTsNN7X.js → tables-5EvT_Bwn.js} +19 -21
- package/dist/src/{tables-BKTmd6u7.cjs → tables-C7K-XKWp.cjs} +89 -91
- package/dist/src/{tables-DMegD0Xf.js → tables-D36WTqKX.js} +21 -23
- package/dist/src/tables-xKANLRBD.js +288 -0
- package/dist/src/telemetry-5BCRNBbe.cjs +5 -0
- package/dist/src/{telemetry-BedSm-bZ.js → telemetry-C15ziL8u.js} +17 -14
- package/dist/src/{telemetry--WAdAfVi.js → telemetry-C2YDkUQH.js} +11 -13
- package/dist/src/{telemetry-DQgVBCAb.cjs → telemetry-CbrnxHp_.cjs} +21 -24
- package/dist/src/telemetry-D4W5hboe.js +7 -0
- package/dist/src/telemetry-DMb2Mpfm.js +171 -0
- package/dist/src/{text-oiSbwSOI.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-oKzCBnK6.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-B_IrO4GZ.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-BO1AHegO.cjs → transcription-CL78qbOU.cjs} +14 -15
- package/dist/src/{transcription-mYS9vd5v.js → transcription-DAtxHhAM.js} +14 -7
- package/dist/src/{transcription-X2-B4vkX.js → transcription-LNZTNUUL.js} +14 -16
- package/dist/src/{transcription-lzBLiTFJ.js → transcription-QHh3AH6Z.js} +15 -17
- package/dist/src/{transform-DeGlxb0D.js → transform-Cgi24fJ7.js} +39 -47
- package/dist/src/{transform-B1Hi5lWS.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-CYDILYDe.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/{transform-Dfl89yi4.js → transform-DGLazrMm.js} +39 -47
- package/dist/src/transform-DGxXocjk.js +1506 -0
- package/dist/src/{transform-D5PjiWiZ.cjs → transform-DOcQeLld.cjs} +179 -187
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-BEgStbHK.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/{transformersAvailability-SZnTS3pJ.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-D-glmEy7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-CjeFXhuJ.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-CXQduE9o.js → types-CH3Ge2sE.js} +30 -90
- package/dist/src/{types-C5hEkb-x.js → types-CLKiCBW3.js} +25 -89
- package/dist/src/types-CN_TZ2GJ.js +3260 -0
- package/dist/src/{types-DWNf48sT.cjs → types-LJ0r3wbR.cjs} +500 -564
- package/dist/src/util-5cB-L7U3.js +1430 -0
- package/dist/src/util-6-GqIvzS.js +599 -0
- package/dist/src/{util-CoQjmE3u.js → util-B7T3SiBS.js} +4 -5
- package/dist/src/{util-D9eLdGfa.js → util-Betm42rL.js} +5 -6
- package/dist/src/{util-Bm_-UMD_.js → util-C-PPYSMq.js} +5 -6
- package/dist/src/{util-CyUdMzV0.cjs → util-CchiqXh_.cjs} +34 -35
- package/dist/src/{util-Du96oyYS.js → util-DaWTWKBK.js} +4 -5
- package/dist/src/{util-1wWM599Z.cjs → util-Db0a0AFH.cjs} +50 -51
- package/dist/src/{util-DQ984syk.js → util-Dlz_Wvgm.js} +37 -48
- package/dist/src/{util-_h4pVqrz.js → util-YT5HPZaS.js} +37 -48
- package/dist/src/{util-aLhtl3fe.cjs → util-Yz-1aEhW.cjs} +209 -220
- package/dist/src/util-ZZH-3QZz.js +293 -0
- package/dist/src/{utils-BjLy-Q72.cjs → utils-Cz9qXqII.cjs} +29 -32
- package/dist/src/{utils-CFMn2yHW.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-DvWMzuMx.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +30 -30
- package/dist/src/app/assets/index-B2D0bCSI.js +0 -439
- package/dist/src/app/tsconfig.app.tsbuildinfo +0 -1
- package/dist/src/cache-ChPcurj7.js +0 -6
- package/dist/src/cache-VVu_W-yg.js +0 -8
- package/dist/src/cache-YLNCFEM2.cjs +0 -6
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-BndfXy4H.js +0 -5
- package/dist/src/eval-BhHvMY82.js +0 -17
- package/dist/src/evalResult-Dq2gFNQY.js +0 -12
- package/dist/src/evalResult-nmcP5VKH.cjs +0 -12
- package/dist/src/evalResult-trqZjVYh.js +0 -14
- package/dist/src/evaluator-CnfPstzT.js +0 -39
- package/dist/src/fetch-IDPDue6F.cjs +0 -4
- package/dist/src/fetch-hKJ-It8q.js +0 -6
- package/dist/src/fetch-ouKnrWK-.js +0 -4
- package/dist/src/graders-CQn7WUsd.cjs +0 -34
- package/dist/src/graders-DC6QAbpW.js +0 -35
- package/dist/src/graders-DUWz3Y7j.js +0 -37
- package/dist/src/opencode-sdk-4bL9n-Gk.js +0 -382
- package/dist/src/opencode-sdk-BfC2zWcR.js +0 -376
- package/dist/src/opencode-sdk-DMJyuwMg.js +0 -380
- package/dist/src/opencode-sdk-Da-9adza.cjs +0 -383
- package/dist/src/providers-CsXB2Ix-.js +0 -35
- package/dist/src/providers-DO8ltjLC.js +0 -33
- package/dist/src/providers-Dtq-xnXd.cjs +0 -33
- package/dist/src/rubyUtils-BUbcND2f.js +0 -6
- package/dist/src/rubyUtils-Cr55X_KE.js +0 -5
- package/dist/src/rubyUtils-DlIiqoYo.cjs +0 -5
- package/dist/src/server-C2eQH4Gu.js +0 -6
- package/dist/src/server-CXWycu7H.cjs +0 -6
- package/dist/src/server-Q6OGlxxT.js +0 -8
- package/dist/src/store-B3EDO9Q3.js +0 -7
- package/dist/src/store-Dl9F8aw5.js +0 -6
- package/dist/src/store-SnrGrlt9.cjs +0 -6
- package/dist/src/telemetry-BGhiPZtl.js +0 -8
- package/dist/src/telemetry-CFfiYan6.cjs +0 -6
- package/dist/src/telemetry-DHzEduxX.js +0 -6
- package/dist/src/transform-C1x1ZlMQ.cjs +0 -6
- package/dist/src/transform-DYHjFmQu.js +0 -8
- package/dist/src/transform-rmwJT5JQ.js +0 -7
- package/dist/src/transformersAvailability-eJooj0gX.js +0 -35
package/dist/src/index.cjs
CHANGED
|
@@ -1,41 +1,44 @@
|
|
|
1
|
-
Object.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
const
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const
|
|
16
|
-
require(
|
|
17
|
-
const
|
|
18
|
-
const
|
|
19
|
-
|
|
20
|
-
require(
|
|
21
|
-
require(
|
|
22
|
-
require(
|
|
23
|
-
require(
|
|
24
|
-
|
|
25
|
-
require(
|
|
26
|
-
|
|
27
|
-
const
|
|
28
|
-
|
|
29
|
-
const
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
const
|
|
33
|
-
const
|
|
34
|
-
require(
|
|
35
|
-
require(
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
1
|
+
Object.defineProperties(exports, {
|
|
2
|
+
__esModule: { value: true },
|
|
3
|
+
[Symbol.toStringTag]: { value: "Module" }
|
|
4
|
+
});
|
|
5
|
+
const require_logger = require("./logger-Cp1GPUjj.cjs");
|
|
6
|
+
const require_invariant = require("./invariant-kfQ8Bu82.cjs");
|
|
7
|
+
const require_esm = require("./esm-CnNt7sI4.cjs");
|
|
8
|
+
const require_pythonUtils = require("./pythonUtils-CTU3Y3lw.cjs");
|
|
9
|
+
const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
|
|
10
|
+
const require_transform = require("./transform-CzK1Q0zl.cjs");
|
|
11
|
+
const require_graders = require("./graders-DOXycdlG.cjs");
|
|
12
|
+
const require_types = require("./types-LJ0r3wbR.cjs");
|
|
13
|
+
const require_util = require("./util-Yz-1aEhW.cjs");
|
|
14
|
+
const require_fetch = require("./fetch-BxUk8odA.cjs");
|
|
15
|
+
const require_cache = require("./cache-COish3-W.cjs");
|
|
16
|
+
const require_providers = require("./providers-CFu-TZl-.cjs");
|
|
17
|
+
const require_utils = require("./utils-Cz9qXqII.cjs");
|
|
18
|
+
const require_createHash = require("./createHash-CfZSc0b4.cjs");
|
|
19
|
+
require("./genaiTracer-DN4dQywX.cjs");
|
|
20
|
+
const require_chat = require("./chat-DaqekjFr.cjs");
|
|
21
|
+
const require_tokenUsageUtils = require("./tokenUsageUtils-bVa1ga6f.cjs");
|
|
22
|
+
const require_transform$1 = require("./transform-DOcQeLld.cjs");
|
|
23
|
+
require("./messages-1JrJs91T.cjs");
|
|
24
|
+
require("./util-CchiqXh_.cjs");
|
|
25
|
+
require("./responses-tD4Bd4dc.cjs");
|
|
26
|
+
require("./openai-Cuif0GEt.cjs");
|
|
27
|
+
const require_util$2 = require("./util-Db0a0AFH.cjs");
|
|
28
|
+
require("./completion-CDOouNzq.cjs");
|
|
29
|
+
const require_accounts = require("./accounts-Bx-x3bmW.cjs");
|
|
30
|
+
const require_server = require("./server-B0PPuDw-.cjs");
|
|
31
|
+
const require_blobs = require("./blobs-DvS-O6be.cjs");
|
|
32
|
+
const require_tables = require("./tables-C7K-XKWp.cjs");
|
|
33
|
+
const require_extractor = require("./extractor-DX36oYEv.cjs");
|
|
34
|
+
const require_telemetry = require("./telemetry-CbrnxHp_.cjs");
|
|
35
|
+
const require_text = require("./text-CW1cyrwj.cjs");
|
|
36
|
+
const require_store = require("./store-BSc-TF2w.cjs");
|
|
37
|
+
require("./base-DBtwl2FR.cjs");
|
|
38
|
+
require("./image-DTedmQPg.cjs");
|
|
39
|
+
const require_providerRegistry = require("./providerRegistry-Civky8Ar.cjs");
|
|
40
|
+
const require_rubyUtils = require("./rubyUtils-DhCAlxZr.cjs");
|
|
41
|
+
const require_evalResult = require("./evalResult-Dap2CekP.cjs");
|
|
39
42
|
let fs = require("fs");
|
|
40
43
|
fs = require_logger.__toESM(fs);
|
|
41
44
|
let path = require("path");
|
|
@@ -44,8 +47,8 @@ let async = require("async");
|
|
|
44
47
|
async = require_logger.__toESM(async);
|
|
45
48
|
let js_yaml = require("js-yaml");
|
|
46
49
|
js_yaml = require_logger.__toESM(js_yaml);
|
|
47
|
-
|
|
48
|
-
|
|
50
|
+
require("node:path");
|
|
51
|
+
require("node:url");
|
|
49
52
|
let chalk = require("chalk");
|
|
50
53
|
chalk = require_logger.__toESM(chalk);
|
|
51
54
|
let os = require("os");
|
|
@@ -91,7 +94,6 @@ let ora = require("ora");
|
|
|
91
94
|
ora = require_logger.__toESM(ora);
|
|
92
95
|
let url = require("url");
|
|
93
96
|
require("@inquirer/confirm");
|
|
94
|
-
|
|
95
97
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
96
98
|
var ConversationRelevancyTemplate = class {
|
|
97
99
|
static generateVerdicts(slidingWindow) {
|
|
@@ -163,7 +165,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
163
165
|
JSON:`;
|
|
164
166
|
}
|
|
165
167
|
};
|
|
166
|
-
|
|
167
168
|
//#endregion
|
|
168
169
|
//#region src/external/matchers/deepeval.ts
|
|
169
170
|
const nunjucks$1 = require_util.getNunjucksEngine(void 0, false, true);
|
|
@@ -213,7 +214,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
213
214
|
return require_graders.fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
214
215
|
}
|
|
215
216
|
}
|
|
216
|
-
|
|
217
217
|
//#endregion
|
|
218
218
|
//#region src/external/assertions/deepeval.ts
|
|
219
219
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -268,7 +268,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
268
268
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
269
269
|
};
|
|
270
270
|
};
|
|
271
|
-
|
|
272
271
|
//#endregion
|
|
273
272
|
//#region src/tracing/evaluatorTracing.ts
|
|
274
273
|
let otlpReceiverStarted = false;
|
|
@@ -301,28 +300,28 @@ function isOtlpReceiverStarted() {
|
|
|
301
300
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
302
301
|
*/
|
|
303
302
|
async function startOtlpReceiverIfNeeded(testSuite) {
|
|
304
|
-
require_logger.
|
|
305
|
-
require_logger.
|
|
306
|
-
require_logger.
|
|
303
|
+
require_logger.logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(testSuite.tracing)}`);
|
|
304
|
+
require_logger.logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
|
|
305
|
+
require_logger.logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(testSuite.tracing, null, 2)}`);
|
|
307
306
|
if (testSuite.tracing?.enabled && testSuite.tracing?.otlp?.http?.enabled && !otlpReceiverStarted) {
|
|
308
|
-
require_telemetry.
|
|
307
|
+
require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
|
|
309
308
|
try {
|
|
310
|
-
require_logger.
|
|
311
|
-
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
309
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
310
|
+
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-Diec4cln.cjs"));
|
|
312
311
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
313
312
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
314
|
-
require_logger.
|
|
313
|
+
require_logger.logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
315
314
|
await startOTLPReceiver(port, host);
|
|
316
315
|
otlpReceiverStarted = true;
|
|
317
|
-
require_logger.
|
|
316
|
+
require_logger.logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
|
|
318
317
|
} catch (error) {
|
|
319
|
-
require_logger.
|
|
318
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
|
|
320
319
|
}
|
|
321
|
-
} else if (otlpReceiverStarted) require_logger.
|
|
320
|
+
} else if (otlpReceiverStarted) require_logger.logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
|
|
322
321
|
else {
|
|
323
|
-
require_logger.
|
|
324
|
-
require_logger.
|
|
325
|
-
require_logger.
|
|
322
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
|
|
323
|
+
require_logger.logger.debug(`[EvaluatorTracing] tracing.enabled: ${testSuite.tracing?.enabled}`);
|
|
324
|
+
require_logger.logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${testSuite.tracing?.otlp?.http?.enabled}`);
|
|
326
325
|
}
|
|
327
326
|
}
|
|
328
327
|
/**
|
|
@@ -330,13 +329,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
330
329
|
*/
|
|
331
330
|
async function stopOtlpReceiverIfNeeded() {
|
|
332
331
|
if (otlpReceiverStarted) try {
|
|
333
|
-
require_logger.
|
|
334
|
-
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
332
|
+
require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
333
|
+
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-Diec4cln.cjs"));
|
|
335
334
|
await stopOTLPReceiver();
|
|
336
335
|
otlpReceiverStarted = false;
|
|
337
|
-
require_logger.
|
|
336
|
+
require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
338
337
|
} catch (error) {
|
|
339
|
-
require_logger.
|
|
338
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
|
|
340
339
|
}
|
|
341
340
|
}
|
|
342
341
|
/**
|
|
@@ -352,7 +351,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
352
351
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
353
352
|
const envEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
354
353
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
355
|
-
require_logger.
|
|
354
|
+
require_logger.logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
356
355
|
return result;
|
|
357
356
|
}
|
|
358
357
|
/**
|
|
@@ -361,25 +360,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
361
360
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
362
361
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
363
362
|
if (tracingEnabled) {
|
|
364
|
-
require_logger.
|
|
365
|
-
require_logger.
|
|
363
|
+
require_logger.logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
364
|
+
require_logger.logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
366
365
|
}
|
|
367
366
|
if (!tracingEnabled) return null;
|
|
368
|
-
require_logger.
|
|
369
|
-
const { getTraceStore } = await Promise.resolve().then(() => require("./store-
|
|
367
|
+
require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
|
|
368
|
+
const { getTraceStore } = await Promise.resolve().then(() => require("./store-BNmZ1KAz.cjs"));
|
|
370
369
|
const traceStore = getTraceStore();
|
|
371
370
|
const traceId = generateTraceId();
|
|
372
371
|
const spanId = generateSpanId();
|
|
373
372
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
374
|
-
require_logger.
|
|
373
|
+
require_logger.logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
375
374
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
376
375
|
if (!evaluationId) {
|
|
377
|
-
require_logger.
|
|
376
|
+
require_logger.logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
378
377
|
evaluationId = `eval-${Date.now()}`;
|
|
379
378
|
}
|
|
380
379
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
381
380
|
try {
|
|
382
|
-
require_logger.
|
|
381
|
+
require_logger.logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
383
382
|
await traceStore.createTrace({
|
|
384
383
|
traceId,
|
|
385
384
|
evaluationId: evaluationId || "",
|
|
@@ -390,18 +389,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
390
389
|
vars: test.vars
|
|
391
390
|
}
|
|
392
391
|
});
|
|
393
|
-
require_logger.
|
|
392
|
+
require_logger.logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
394
393
|
} catch (error) {
|
|
395
|
-
require_logger.
|
|
394
|
+
require_logger.logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
396
395
|
}
|
|
397
|
-
require_logger.
|
|
396
|
+
require_logger.logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
398
397
|
return {
|
|
399
398
|
traceparent,
|
|
400
399
|
evaluationId,
|
|
401
400
|
testCaseId
|
|
402
401
|
};
|
|
403
402
|
}
|
|
404
|
-
|
|
405
403
|
//#endregion
|
|
406
404
|
//#region src/assertions/answerRelevance.ts
|
|
407
405
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -412,7 +410,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
412
410
|
...await require_graders.matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
413
411
|
};
|
|
414
412
|
};
|
|
415
|
-
|
|
416
413
|
//#endregion
|
|
417
414
|
//#region src/assertions/assertionsResult.ts
|
|
418
415
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -518,7 +515,6 @@ var AssertionsResult = class {
|
|
|
518
515
|
return this.result;
|
|
519
516
|
}
|
|
520
517
|
};
|
|
521
|
-
|
|
522
518
|
//#endregion
|
|
523
519
|
//#region src/assertions/ngrams.ts
|
|
524
520
|
/**
|
|
@@ -534,7 +530,6 @@ function getNGrams(words, n) {
|
|
|
534
530
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
535
531
|
return ngrams;
|
|
536
532
|
}
|
|
537
|
-
|
|
538
533
|
//#endregion
|
|
539
534
|
//#region src/assertions/bleu.ts
|
|
540
535
|
/**
|
|
@@ -630,7 +625,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
630
625
|
assertion
|
|
631
626
|
};
|
|
632
627
|
}
|
|
633
|
-
|
|
634
628
|
//#endregion
|
|
635
629
|
//#region src/assertions/classifier.ts
|
|
636
630
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -645,9 +639,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
645
639
|
...classificationResult
|
|
646
640
|
};
|
|
647
641
|
}
|
|
648
|
-
|
|
649
642
|
//#endregion
|
|
650
643
|
//#region src/assertions/contains.ts
|
|
644
|
+
function parseCommaSeparatedValues(value) {
|
|
645
|
+
const results = [];
|
|
646
|
+
let i = 0;
|
|
647
|
+
while (i < value.length) {
|
|
648
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
649
|
+
if (i >= value.length) break;
|
|
650
|
+
if (value[i] === ",") {
|
|
651
|
+
i++;
|
|
652
|
+
continue;
|
|
653
|
+
}
|
|
654
|
+
if (value[i] === "\"") {
|
|
655
|
+
i++;
|
|
656
|
+
let field = "";
|
|
657
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
658
|
+
field += value[i + 1];
|
|
659
|
+
i += 2;
|
|
660
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
661
|
+
field += "\"";
|
|
662
|
+
i += 2;
|
|
663
|
+
} else if (value[i] === "\"") {
|
|
664
|
+
i++;
|
|
665
|
+
break;
|
|
666
|
+
} else {
|
|
667
|
+
field += value[i];
|
|
668
|
+
i++;
|
|
669
|
+
}
|
|
670
|
+
results.push(field);
|
|
671
|
+
} else {
|
|
672
|
+
const start = i;
|
|
673
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
674
|
+
results.push(value.substring(start, i).trim());
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
return results;
|
|
678
|
+
}
|
|
651
679
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
652
680
|
const value = valueFromScript ?? renderedValue;
|
|
653
681
|
require_invariant.invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -675,7 +703,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
675
703
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
676
704
|
let value = valueFromScript ?? renderedValue;
|
|
677
705
|
require_invariant.invariant(value, "\"contains-any\" assertion type must have a value");
|
|
678
|
-
if (typeof value === "string") value = value
|
|
706
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
679
707
|
require_invariant.invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
680
708
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
681
709
|
return {
|
|
@@ -688,7 +716,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
688
716
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
689
717
|
let value = valueFromScript ?? renderedValue;
|
|
690
718
|
require_invariant.invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
691
|
-
if (typeof value === "string") value = value
|
|
719
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
692
720
|
require_invariant.invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
693
721
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
694
722
|
return {
|
|
@@ -701,7 +729,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
701
729
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
702
730
|
let value = valueFromScript ?? renderedValue;
|
|
703
731
|
require_invariant.invariant(value, "\"contains-all\" assertion type must have a value");
|
|
704
|
-
if (typeof value === "string") value = value
|
|
732
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
705
733
|
require_invariant.invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
706
734
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
707
735
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -715,7 +743,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
715
743
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
716
744
|
let value = valueFromScript ?? renderedValue;
|
|
717
745
|
require_invariant.invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
718
|
-
if (typeof value === "string") value = value
|
|
746
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
719
747
|
require_invariant.invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
720
748
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
721
749
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -726,7 +754,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
726
754
|
assertion
|
|
727
755
|
};
|
|
728
756
|
};
|
|
729
|
-
|
|
730
757
|
//#endregion
|
|
731
758
|
//#region src/assertions/contextFaithfulness.ts
|
|
732
759
|
/**
|
|
@@ -750,7 +777,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
750
777
|
metadata: { context }
|
|
751
778
|
};
|
|
752
779
|
}
|
|
753
|
-
|
|
754
780
|
//#endregion
|
|
755
781
|
//#region src/assertions/contextRecall.ts
|
|
756
782
|
/**
|
|
@@ -777,7 +803,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
777
803
|
}
|
|
778
804
|
};
|
|
779
805
|
};
|
|
780
|
-
|
|
781
806
|
//#endregion
|
|
782
807
|
//#region src/assertions/contextRelevance.ts
|
|
783
808
|
/**
|
|
@@ -804,7 +829,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
804
829
|
}
|
|
805
830
|
};
|
|
806
831
|
};
|
|
807
|
-
|
|
808
832
|
//#endregion
|
|
809
833
|
//#region src/assertions/cost.ts
|
|
810
834
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -818,7 +842,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
818
842
|
assertion
|
|
819
843
|
};
|
|
820
844
|
};
|
|
821
|
-
|
|
822
845
|
//#endregion
|
|
823
846
|
//#region src/assertions/equals.ts
|
|
824
847
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -838,7 +861,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
838
861
|
assertion
|
|
839
862
|
};
|
|
840
863
|
};
|
|
841
|
-
|
|
842
864
|
//#endregion
|
|
843
865
|
//#region src/assertions/factuality.ts
|
|
844
866
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -849,7 +871,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
849
871
|
...await require_graders.matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
850
872
|
};
|
|
851
873
|
};
|
|
852
|
-
|
|
853
874
|
//#endregion
|
|
854
875
|
//#region src/assertions/finishReason.ts
|
|
855
876
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -869,7 +890,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
869
890
|
assertion
|
|
870
891
|
};
|
|
871
892
|
}
|
|
872
|
-
|
|
873
893
|
//#endregion
|
|
874
894
|
//#region src/assertions/functionToolCall.ts
|
|
875
895
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -892,7 +912,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
892
912
|
};
|
|
893
913
|
}
|
|
894
914
|
};
|
|
895
|
-
|
|
896
915
|
//#endregion
|
|
897
916
|
//#region src/assertions/geval.ts
|
|
898
917
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -918,7 +937,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
918
937
|
...await require_graders.matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
919
938
|
};
|
|
920
939
|
};
|
|
921
|
-
|
|
922
940
|
//#endregion
|
|
923
941
|
//#region src/assertions/gleu.ts
|
|
924
942
|
/**
|
|
@@ -1017,7 +1035,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
1017
1035
|
assertion
|
|
1018
1036
|
};
|
|
1019
1037
|
}
|
|
1020
|
-
|
|
1021
1038
|
//#endregion
|
|
1022
1039
|
//#region src/assertions/guardrails.ts
|
|
1023
1040
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -1026,7 +1043,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1026
1043
|
flaggedInput: false,
|
|
1027
1044
|
flaggedOutput: false
|
|
1028
1045
|
};
|
|
1029
|
-
require_logger.
|
|
1046
|
+
require_logger.logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
1030
1047
|
if (guardrails) if (guardrails.flagged) {
|
|
1031
1048
|
let reason = "Content failed safety checks";
|
|
1032
1049
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1055,7 +1072,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1055
1072
|
assertion
|
|
1056
1073
|
};
|
|
1057
1074
|
};
|
|
1058
|
-
|
|
1059
1075
|
//#endregion
|
|
1060
1076
|
//#region src/assertions/html.ts
|
|
1061
1077
|
const HTML_PATTERNS = {
|
|
@@ -1264,7 +1280,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1264
1280
|
assertion
|
|
1265
1281
|
};
|
|
1266
1282
|
};
|
|
1267
|
-
|
|
1268
1283
|
//#endregion
|
|
1269
1284
|
//#region src/assertions/javascript.ts
|
|
1270
1285
|
/**
|
|
@@ -1405,7 +1420,6 @@ ${renderedValue}`,
|
|
|
1405
1420
|
assertion
|
|
1406
1421
|
};
|
|
1407
1422
|
};
|
|
1408
|
-
|
|
1409
1423
|
//#endregion
|
|
1410
1424
|
//#region src/assertions/json.ts
|
|
1411
1425
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1471,7 +1485,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1471
1485
|
assertion
|
|
1472
1486
|
};
|
|
1473
1487
|
}
|
|
1474
|
-
|
|
1475
1488
|
//#endregion
|
|
1476
1489
|
//#region src/assertions/latency.ts
|
|
1477
1490
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1485,7 +1498,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1485
1498
|
assertion
|
|
1486
1499
|
};
|
|
1487
1500
|
};
|
|
1488
|
-
|
|
1489
1501
|
//#endregion
|
|
1490
1502
|
//#region src/assertions/levenshtein.ts
|
|
1491
1503
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1500,7 +1512,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1500
1512
|
assertion
|
|
1501
1513
|
};
|
|
1502
1514
|
}
|
|
1503
|
-
|
|
1504
1515
|
//#endregion
|
|
1505
1516
|
//#region src/assertions/llmRubric.ts
|
|
1506
1517
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1509,7 +1520,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1509
1520
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1510
1521
|
return require_graders.matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1511
1522
|
};
|
|
1512
|
-
|
|
1513
1523
|
//#endregion
|
|
1514
1524
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1515
1525
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1520,7 +1530,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1520
1530
|
...await require_graders.matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1521
1531
|
};
|
|
1522
1532
|
};
|
|
1523
|
-
|
|
1524
1533
|
//#endregion
|
|
1525
1534
|
//#region src/util/providerResponse.ts
|
|
1526
1535
|
/**
|
|
@@ -1563,7 +1572,6 @@ function getActualPrompt(response, options = {}) {
|
|
|
1563
1572
|
function getActualPromptWithFallback(response, originalPrompt, options = {}) {
|
|
1564
1573
|
return getActualPrompt(response, options) || originalPrompt;
|
|
1565
1574
|
}
|
|
1566
|
-
|
|
1567
1575
|
//#endregion
|
|
1568
1576
|
//#region src/assertions/moderation.ts
|
|
1569
1577
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1586,7 +1594,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1586
1594
|
assertion
|
|
1587
1595
|
};
|
|
1588
1596
|
};
|
|
1589
|
-
|
|
1590
1597
|
//#endregion
|
|
1591
1598
|
//#region src/assertions/openai.ts
|
|
1592
1599
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1647,7 +1654,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1647
1654
|
};
|
|
1648
1655
|
}
|
|
1649
1656
|
};
|
|
1650
|
-
|
|
1651
1657
|
//#endregion
|
|
1652
1658
|
//#region src/assertions/perplexity.ts
|
|
1653
1659
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1674,7 +1680,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1674
1680
|
assertion
|
|
1675
1681
|
};
|
|
1676
1682
|
}
|
|
1677
|
-
|
|
1678
1683
|
//#endregion
|
|
1679
1684
|
//#region src/assertions/pi.ts
|
|
1680
1685
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1682,7 +1687,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1682
1687
|
require_invariant.invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1683
1688
|
return require_graders.matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1684
1689
|
};
|
|
1685
|
-
|
|
1686
1690
|
//#endregion
|
|
1687
1691
|
//#region src/python/wrapper.ts
|
|
1688
1692
|
/**
|
|
@@ -1698,17 +1702,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1698
1702
|
fs.default.writeFileSync(tempFilePath, code);
|
|
1699
1703
|
return await require_pythonUtils.runPython(tempFilePath, method, args);
|
|
1700
1704
|
} catch (error) {
|
|
1701
|
-
require_logger.
|
|
1705
|
+
require_logger.logger.error(`Error executing Python code: ${error}`);
|
|
1702
1706
|
throw error;
|
|
1703
1707
|
} finally {
|
|
1704
1708
|
try {
|
|
1705
1709
|
fs.default.unlinkSync(tempFilePath);
|
|
1706
1710
|
} catch (error) {
|
|
1707
|
-
require_logger.
|
|
1711
|
+
require_logger.logger.error(`Error removing temporary file: ${error}`);
|
|
1708
1712
|
}
|
|
1709
1713
|
}
|
|
1710
1714
|
}
|
|
1711
|
-
|
|
1712
1715
|
//#endregion
|
|
1713
1716
|
//#region src/util/caseMapping.ts
|
|
1714
1717
|
/**
|
|
@@ -1732,7 +1735,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1732
1735
|
});
|
|
1733
1736
|
return result;
|
|
1734
1737
|
}
|
|
1735
|
-
|
|
1736
1738
|
//#endregion
|
|
1737
1739
|
//#region src/assertions/python.ts
|
|
1738
1740
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1802,7 +1804,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1802
1804
|
assertion
|
|
1803
1805
|
};
|
|
1804
1806
|
};
|
|
1805
|
-
|
|
1806
1807
|
//#endregion
|
|
1807
1808
|
//#region src/assertions/redteam.ts
|
|
1808
1809
|
/**
|
|
@@ -1883,7 +1884,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1883
1884
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1884
1885
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1885
1886
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1886
|
-
require_logger.
|
|
1887
|
+
require_logger.logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1887
1888
|
error: errorMessage,
|
|
1888
1889
|
strategyId: test.metadata.strategyId,
|
|
1889
1890
|
pluginId: test.metadata.pluginId
|
|
@@ -1903,7 +1904,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1903
1904
|
throw error;
|
|
1904
1905
|
}
|
|
1905
1906
|
};
|
|
1906
|
-
|
|
1907
1907
|
//#endregion
|
|
1908
1908
|
//#region src/assertions/refusal.ts
|
|
1909
1909
|
function handleIsRefusal(params) {
|
|
@@ -1931,7 +1931,6 @@ function handleIsRefusal(params) {
|
|
|
1931
1931
|
assertion
|
|
1932
1932
|
};
|
|
1933
1933
|
}
|
|
1934
|
-
|
|
1935
1934
|
//#endregion
|
|
1936
1935
|
//#region src/assertions/regex.ts
|
|
1937
1936
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1956,7 +1955,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1956
1955
|
assertion
|
|
1957
1956
|
};
|
|
1958
1957
|
};
|
|
1959
|
-
|
|
1960
1958
|
//#endregion
|
|
1961
1959
|
//#region src/assertions/rouge.ts
|
|
1962
1960
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1972,7 +1970,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1972
1970
|
assertion
|
|
1973
1971
|
};
|
|
1974
1972
|
}
|
|
1975
|
-
|
|
1976
1973
|
//#endregion
|
|
1977
1974
|
//#region src/ruby/wrapper.ts
|
|
1978
1975
|
/**
|
|
@@ -1988,17 +1985,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1988
1985
|
fs.default.writeFileSync(tempFilePath, code);
|
|
1989
1986
|
return await require_rubyUtils.runRuby(tempFilePath, method, args);
|
|
1990
1987
|
} catch (error) {
|
|
1991
|
-
require_logger.
|
|
1988
|
+
require_logger.logger.error(`Error executing Ruby code: ${error}`);
|
|
1992
1989
|
throw error;
|
|
1993
1990
|
} finally {
|
|
1994
1991
|
try {
|
|
1995
1992
|
fs.default.unlinkSync(tempFilePath);
|
|
1996
1993
|
} catch (error) {
|
|
1997
|
-
require_logger.
|
|
1994
|
+
require_logger.logger.error(`Error removing temporary file: ${error}`);
|
|
1998
1995
|
}
|
|
1999
1996
|
}
|
|
2000
1997
|
}
|
|
2001
|
-
|
|
2002
1998
|
//#endregion
|
|
2003
1999
|
//#region src/assertions/ruby.ts
|
|
2004
2000
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -2069,7 +2065,6 @@ end
|
|
|
2069
2065
|
assertion
|
|
2070
2066
|
};
|
|
2071
2067
|
};
|
|
2072
|
-
|
|
2073
2068
|
//#endregion
|
|
2074
2069
|
//#region src/assertions/searchRubric.ts
|
|
2075
2070
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -2081,7 +2076,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
2081
2076
|
}
|
|
2082
2077
|
return result;
|
|
2083
2078
|
}
|
|
2084
|
-
|
|
2085
2079
|
//#endregion
|
|
2086
2080
|
//#region src/assertions/similar.ts
|
|
2087
2081
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2124,7 +2118,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2124
2118
|
...await require_graders.matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2125
2119
|
};
|
|
2126
2120
|
};
|
|
2127
|
-
|
|
2128
2121
|
//#endregion
|
|
2129
2122
|
//#region src/assertions/sql.ts
|
|
2130
2123
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2216,7 +2209,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2216
2209
|
}
|
|
2217
2210
|
return handleIsSql(assertionParams);
|
|
2218
2211
|
};
|
|
2219
|
-
|
|
2220
2212
|
//#endregion
|
|
2221
2213
|
//#region src/assertions/startsWith.ts
|
|
2222
2214
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2230,7 +2222,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2230
2222
|
assertion
|
|
2231
2223
|
};
|
|
2232
2224
|
};
|
|
2233
|
-
|
|
2234
2225
|
//#endregion
|
|
2235
2226
|
//#region src/assertions/toolCallF1.ts
|
|
2236
2227
|
/**
|
|
@@ -2359,7 +2350,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2359
2350
|
assertion
|
|
2360
2351
|
};
|
|
2361
2352
|
};
|
|
2362
|
-
|
|
2363
2353
|
//#endregion
|
|
2364
2354
|
//#region src/assertions/traceUtils.ts
|
|
2365
2355
|
/**
|
|
@@ -2377,7 +2367,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2377
2367
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2378
2368
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2379
2369
|
}
|
|
2380
|
-
|
|
2381
2370
|
//#endregion
|
|
2382
2371
|
//#region src/assertions/traceErrorSpans.ts
|
|
2383
2372
|
function isErrorSpan(span) {
|
|
@@ -2455,7 +2444,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2455
2444
|
assertion
|
|
2456
2445
|
};
|
|
2457
2446
|
};
|
|
2458
|
-
|
|
2459
2447
|
//#endregion
|
|
2460
2448
|
//#region src/assertions/traceSpanCount.ts
|
|
2461
2449
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2490,7 +2478,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2490
2478
|
assertion
|
|
2491
2479
|
};
|
|
2492
2480
|
};
|
|
2493
|
-
|
|
2494
2481
|
//#endregion
|
|
2495
2482
|
//#region src/assertions/traceSpanDuration.ts
|
|
2496
2483
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2548,7 +2535,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2548
2535
|
assertion
|
|
2549
2536
|
};
|
|
2550
2537
|
};
|
|
2551
|
-
|
|
2552
2538
|
//#endregion
|
|
2553
2539
|
//#region src/assertions/webhook.ts
|
|
2554
2540
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2585,7 +2571,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2585
2571
|
};
|
|
2586
2572
|
}
|
|
2587
2573
|
}
|
|
2588
|
-
|
|
2589
2574
|
//#endregion
|
|
2590
2575
|
//#region src/assertions/wordCount.ts
|
|
2591
2576
|
/**
|
|
@@ -2648,7 +2633,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2648
2633
|
assertion
|
|
2649
2634
|
};
|
|
2650
2635
|
};
|
|
2651
|
-
|
|
2652
2636
|
//#endregion
|
|
2653
2637
|
//#region src/assertions/xml.ts
|
|
2654
2638
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2723,7 +2707,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2723
2707
|
assertion
|
|
2724
2708
|
};
|
|
2725
2709
|
};
|
|
2726
|
-
|
|
2727
2710
|
//#endregion
|
|
2728
2711
|
//#region src/assertions/index.ts
|
|
2729
2712
|
const ASSERTIONS_MAX_CONCURRENCY = require_logger.getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2777,7 +2760,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2777
2760
|
"llm-rubric": handleLlmRubric,
|
|
2778
2761
|
meteor: async (params) => {
|
|
2779
2762
|
try {
|
|
2780
|
-
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-
|
|
2763
|
+
const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-DLZZ3osF.cjs"));
|
|
2781
2764
|
return handleMeteorAssertion(params);
|
|
2782
2765
|
} catch (error) {
|
|
2783
2766
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2823,10 +2806,10 @@ function renderMetricName(metric, vars) {
|
|
|
2823
2806
|
if (!metric) return metric;
|
|
2824
2807
|
try {
|
|
2825
2808
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2826
|
-
if (rendered === "" && metric !== "") require_logger.
|
|
2809
|
+
if (rendered === "" && metric !== "") require_logger.logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2827
2810
|
return rendered;
|
|
2828
2811
|
} catch (error) {
|
|
2829
|
-
require_logger.
|
|
2812
|
+
require_logger.logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2830
2813
|
return metric;
|
|
2831
2814
|
}
|
|
2832
2815
|
}
|
|
@@ -2877,12 +2860,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2877
2860
|
spans: traceData.spans || []
|
|
2878
2861
|
};
|
|
2879
2862
|
} catch (error) {
|
|
2880
|
-
require_logger.
|
|
2863
|
+
require_logger.logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2881
2864
|
}
|
|
2882
2865
|
let renderedValue = assertion.value;
|
|
2883
2866
|
let valueFromScript;
|
|
2884
2867
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2885
|
-
const basePath = require_logger.
|
|
2868
|
+
const basePath = require_logger.state.basePath || "";
|
|
2886
2869
|
const fileRef = renderedValue.slice(7);
|
|
2887
2870
|
let filePath = fileRef;
|
|
2888
2871
|
let functionName;
|
|
@@ -2894,10 +2877,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2894
2877
|
filePath = path.default.resolve(basePath, filePath);
|
|
2895
2878
|
if (require_fileExtensions.isJavascriptFile(filePath)) {
|
|
2896
2879
|
valueFromScript = await require_graders.loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2897
|
-
require_logger.
|
|
2880
|
+
require_logger.logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2898
2881
|
} else if (filePath.endsWith(".py")) try {
|
|
2899
2882
|
valueFromScript = await require_pythonUtils.runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2900
|
-
require_logger.
|
|
2883
|
+
require_logger.logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2901
2884
|
} catch (error) {
|
|
2902
2885
|
return {
|
|
2903
2886
|
pass: false,
|
|
@@ -2907,9 +2890,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2907
2890
|
};
|
|
2908
2891
|
}
|
|
2909
2892
|
else if (filePath.endsWith(".rb")) try {
|
|
2910
|
-
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-
|
|
2893
|
+
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CP42kMvq.cjs"));
|
|
2911
2894
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2912
|
-
require_logger.
|
|
2895
|
+
require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2913
2896
|
} catch (error) {
|
|
2914
2897
|
return {
|
|
2915
2898
|
pass: false,
|
|
@@ -2920,7 +2903,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2920
2903
|
}
|
|
2921
2904
|
else renderedValue = require_graders.processFileReference(renderedValue);
|
|
2922
2905
|
} else if (require_providers.isPackagePath(renderedValue)) {
|
|
2923
|
-
const basePath = require_logger.
|
|
2906
|
+
const basePath = require_logger.state.basePath || "";
|
|
2924
2907
|
const requiredModule = await require_providers.loadFromPackage(renderedValue, basePath);
|
|
2925
2908
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2926
2909
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -3081,7 +3064,6 @@ var assertions_default = {
|
|
|
3081
3064
|
matchesModeration: require_graders.matchesModeration,
|
|
3082
3065
|
matchesConversationRelevance
|
|
3083
3066
|
};
|
|
3084
|
-
|
|
3085
3067
|
//#endregion
|
|
3086
3068
|
//#region src/database/signal.ts
|
|
3087
3069
|
/**
|
|
@@ -3096,10 +3078,9 @@ function updateSignalFile(evalId) {
|
|
|
3096
3078
|
const content = evalId ? `${evalId}:${now.toISOString()}` : now.toISOString();
|
|
3097
3079
|
fs.default.writeFileSync(filePath, content);
|
|
3098
3080
|
} catch (err) {
|
|
3099
|
-
require_logger.
|
|
3081
|
+
require_logger.logger.warn(`Failed to write database signal file: ${err}`);
|
|
3100
3082
|
}
|
|
3101
3083
|
}
|
|
3102
|
-
|
|
3103
3084
|
//#endregion
|
|
3104
3085
|
//#region src/progress/ciProgressReporter.ts
|
|
3105
3086
|
var CIProgressReporter = class {
|
|
@@ -3121,7 +3102,7 @@ var CIProgressReporter = class {
|
|
|
3121
3102
|
}
|
|
3122
3103
|
start() {
|
|
3123
3104
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3124
|
-
require_logger.
|
|
3105
|
+
require_logger.logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3125
3106
|
this.intervalId = setInterval(() => {
|
|
3126
3107
|
this.logPeriodicUpdate();
|
|
3127
3108
|
}, this.updateIntervalMs);
|
|
@@ -3152,14 +3133,14 @@ var CIProgressReporter = class {
|
|
|
3152
3133
|
this.intervalId = null;
|
|
3153
3134
|
}
|
|
3154
3135
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3155
|
-
require_logger.
|
|
3136
|
+
require_logger.logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3156
3137
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3157
3138
|
}
|
|
3158
3139
|
error(message) {
|
|
3159
3140
|
const now = Date.now();
|
|
3160
3141
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3161
3142
|
this.lastErrorTime = now;
|
|
3162
|
-
require_logger.
|
|
3143
|
+
require_logger.logger.error(`[Evaluation Error] ${message}`);
|
|
3163
3144
|
if (process.env.GITHUB_ACTIONS) {
|
|
3164
3145
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3165
3146
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3178,12 +3159,12 @@ var CIProgressReporter = class {
|
|
|
3178
3159
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3179
3160
|
}
|
|
3180
3161
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3181
|
-
require_logger.
|
|
3182
|
-
require_logger.
|
|
3162
|
+
require_logger.logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3163
|
+
require_logger.logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3183
3164
|
}
|
|
3184
3165
|
logMilestone(percentage) {
|
|
3185
3166
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3186
|
-
require_logger.
|
|
3167
|
+
require_logger.logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3187
3168
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3188
3169
|
}
|
|
3189
3170
|
formatElapsedTime(ms) {
|
|
@@ -3194,7 +3175,6 @@ var CIProgressReporter = class {
|
|
|
3194
3175
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3195
3176
|
}
|
|
3196
3177
|
};
|
|
3197
|
-
|
|
3198
3178
|
//#endregion
|
|
3199
3179
|
//#region src/providers/azure/warnings.ts
|
|
3200
3180
|
/**
|
|
@@ -3208,13 +3188,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3208
3188
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3209
3189
|
if (modelGradedAsserts.length > 0) {
|
|
3210
3190
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3211
|
-
require_logger.
|
|
3191
|
+
require_logger.logger.warn(chalk.default.yellow(`You are using model-graded assertions of types ${chalk.default.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.default.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3212
3192
|
return true;
|
|
3213
3193
|
}
|
|
3214
3194
|
}
|
|
3215
3195
|
return false;
|
|
3216
3196
|
}
|
|
3217
|
-
|
|
3218
3197
|
//#endregion
|
|
3219
3198
|
//#region src/suggestions.ts
|
|
3220
3199
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3245,7 +3224,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3245
3224
|
};
|
|
3246
3225
|
}
|
|
3247
3226
|
}
|
|
3248
|
-
|
|
3249
3227
|
//#endregion
|
|
3250
3228
|
//#region src/tracing/otelConfig.ts
|
|
3251
3229
|
/**
|
|
@@ -3271,7 +3249,6 @@ function getDefaultOtelConfig() {
|
|
|
3271
3249
|
enabled: true
|
|
3272
3250
|
};
|
|
3273
3251
|
}
|
|
3274
|
-
|
|
3275
3252
|
//#endregion
|
|
3276
3253
|
//#region src/tracing/localSpanExporter.ts
|
|
3277
3254
|
/**
|
|
@@ -3291,7 +3268,7 @@ var LocalSpanExporter = class {
|
|
|
3291
3268
|
});
|
|
3292
3269
|
else resultCallback({ code: _opentelemetry_core.ExportResultCode.SUCCESS });
|
|
3293
3270
|
}).catch((error) => {
|
|
3294
|
-
require_logger.
|
|
3271
|
+
require_logger.logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3295
3272
|
resultCallback({
|
|
3296
3273
|
code: _opentelemetry_core.ExportResultCode.FAILED,
|
|
3297
3274
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3305,7 +3282,7 @@ var LocalSpanExporter = class {
|
|
|
3305
3282
|
async exportAsync(spans) {
|
|
3306
3283
|
if (spans.length === 0) return;
|
|
3307
3284
|
const traceStore = require_store.getTraceStore();
|
|
3308
|
-
require_logger.
|
|
3285
|
+
require_logger.logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3309
3286
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3310
3287
|
for (const span of spans) {
|
|
3311
3288
|
const traceId = span.spanContext().traceId;
|
|
@@ -3316,12 +3293,12 @@ var LocalSpanExporter = class {
|
|
|
3316
3293
|
let firstError;
|
|
3317
3294
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3318
3295
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3319
|
-
if (result.stored) require_logger.
|
|
3320
|
-
else require_logger.
|
|
3296
|
+
if (result.stored) require_logger.logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3297
|
+
else require_logger.logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3321
3298
|
} catch (error) {
|
|
3322
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) require_logger.
|
|
3299
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) require_logger.logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3323
3300
|
else {
|
|
3324
|
-
require_logger.
|
|
3301
|
+
require_logger.logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3325
3302
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3326
3303
|
}
|
|
3327
3304
|
}
|
|
@@ -3358,7 +3335,7 @@ var LocalSpanExporter = class {
|
|
|
3358
3335
|
* Shutdown the exporter. No-op for local storage.
|
|
3359
3336
|
*/
|
|
3360
3337
|
shutdown() {
|
|
3361
|
-
require_logger.
|
|
3338
|
+
require_logger.logger.debug("[LocalSpanExporter] Shutting down");
|
|
3362
3339
|
return Promise.resolve();
|
|
3363
3340
|
}
|
|
3364
3341
|
/**
|
|
@@ -3368,7 +3345,6 @@ var LocalSpanExporter = class {
|
|
|
3368
3345
|
return Promise.resolve();
|
|
3369
3346
|
}
|
|
3370
3347
|
};
|
|
3371
|
-
|
|
3372
3348
|
//#endregion
|
|
3373
3349
|
//#region src/tracing/otelSdk.ts
|
|
3374
3350
|
let provider = null;
|
|
@@ -3396,21 +3372,21 @@ function getHandlers() {
|
|
|
3396
3372
|
*/
|
|
3397
3373
|
function initializeOtel(config) {
|
|
3398
3374
|
if (initialized) {
|
|
3399
|
-
require_logger.
|
|
3375
|
+
require_logger.logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3400
3376
|
return;
|
|
3401
3377
|
}
|
|
3402
3378
|
if (!config.enabled) {
|
|
3403
|
-
require_logger.
|
|
3379
|
+
require_logger.logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3404
3380
|
return;
|
|
3405
3381
|
}
|
|
3406
|
-
require_logger.
|
|
3382
|
+
require_logger.logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3407
3383
|
serviceName: config.serviceName,
|
|
3408
3384
|
endpoint: config.endpoint,
|
|
3409
3385
|
localExport: config.localExport
|
|
3410
3386
|
});
|
|
3411
3387
|
if (config.debug) _opentelemetry_api.diag.setLogger(new _opentelemetry_api.DiagConsoleLogger(), _opentelemetry_api.DiagLogLevel.DEBUG);
|
|
3412
3388
|
_opentelemetry_api.propagation.setGlobalPropagator(new _opentelemetry_core.W3CTraceContextPropagator());
|
|
3413
|
-
require_logger.
|
|
3389
|
+
require_logger.logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3414
3390
|
const resource = (0, _opentelemetry_resources.resourceFromAttributes)({
|
|
3415
3391
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_NAME]: config.serviceName,
|
|
3416
3392
|
[_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_fetch.VERSION
|
|
@@ -3419,12 +3395,12 @@ function initializeOtel(config) {
|
|
|
3419
3395
|
if (config.localExport) {
|
|
3420
3396
|
const localExporter = new LocalSpanExporter();
|
|
3421
3397
|
spanProcessors.push(new _opentelemetry_sdk_trace_node.BatchSpanProcessor(localExporter));
|
|
3422
|
-
require_logger.
|
|
3398
|
+
require_logger.logger.debug("[OtelSdk] Added local span exporter");
|
|
3423
3399
|
}
|
|
3424
3400
|
if (config.endpoint) {
|
|
3425
3401
|
const otlpExporter = new _opentelemetry_exporter_trace_otlp_http.OTLPTraceExporter({ url: config.endpoint });
|
|
3426
3402
|
spanProcessors.push(new _opentelemetry_sdk_trace_node.BatchSpanProcessor(otlpExporter));
|
|
3427
|
-
require_logger.
|
|
3403
|
+
require_logger.logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3428
3404
|
}
|
|
3429
3405
|
provider = new _opentelemetry_sdk_trace_node.NodeTracerProvider({
|
|
3430
3406
|
resource,
|
|
@@ -3432,7 +3408,7 @@ function initializeOtel(config) {
|
|
|
3432
3408
|
});
|
|
3433
3409
|
provider.register();
|
|
3434
3410
|
initialized = true;
|
|
3435
|
-
require_logger.
|
|
3411
|
+
require_logger.logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3436
3412
|
setupShutdownHandlers();
|
|
3437
3413
|
}
|
|
3438
3414
|
/**
|
|
@@ -3441,12 +3417,12 @@ function initializeOtel(config) {
|
|
|
3441
3417
|
*/
|
|
3442
3418
|
async function shutdownOtel() {
|
|
3443
3419
|
if (!initialized || !provider) return;
|
|
3444
|
-
require_logger.
|
|
3420
|
+
require_logger.logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3445
3421
|
try {
|
|
3446
3422
|
await provider.shutdown();
|
|
3447
|
-
require_logger.
|
|
3423
|
+
require_logger.logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3448
3424
|
} catch (error) {
|
|
3449
|
-
require_logger.
|
|
3425
|
+
require_logger.logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3450
3426
|
} finally {
|
|
3451
3427
|
provider = null;
|
|
3452
3428
|
initialized = false;
|
|
@@ -3459,12 +3435,12 @@ async function shutdownOtel() {
|
|
|
3459
3435
|
*/
|
|
3460
3436
|
async function flushOtel() {
|
|
3461
3437
|
if (!initialized || !provider) return;
|
|
3462
|
-
require_logger.
|
|
3438
|
+
require_logger.logger.debug("[OtelSdk] Flushing pending spans");
|
|
3463
3439
|
try {
|
|
3464
3440
|
await provider.forceFlush();
|
|
3465
|
-
require_logger.
|
|
3441
|
+
require_logger.logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3466
3442
|
} catch (error) {
|
|
3467
|
-
require_logger.
|
|
3443
|
+
require_logger.logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3468
3444
|
}
|
|
3469
3445
|
}
|
|
3470
3446
|
/**
|
|
@@ -3476,7 +3452,7 @@ function setupShutdownHandlers() {
|
|
|
3476
3452
|
const handlers = getHandlers();
|
|
3477
3453
|
if (handlers.registered) return;
|
|
3478
3454
|
const shutdown = async (signal) => {
|
|
3479
|
-
require_logger.
|
|
3455
|
+
require_logger.logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3480
3456
|
await shutdownOtel();
|
|
3481
3457
|
};
|
|
3482
3458
|
handlers.sigTermHandler = () => {
|
|
@@ -3513,7 +3489,6 @@ function cleanupShutdownHandlers() {
|
|
|
3513
3489
|
}
|
|
3514
3490
|
handlers.registered = false;
|
|
3515
3491
|
}
|
|
3516
|
-
|
|
3517
3492
|
//#endregion
|
|
3518
3493
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3519
3494
|
var JsonlFileWriter = class {
|
|
@@ -3537,7 +3512,6 @@ var JsonlFileWriter = class {
|
|
|
3537
3512
|
});
|
|
3538
3513
|
}
|
|
3539
3514
|
};
|
|
3540
|
-
|
|
3541
3515
|
//#endregion
|
|
3542
3516
|
//#region src/util/promptMatching.ts
|
|
3543
3517
|
/**
|
|
@@ -3575,7 +3549,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3575
3549
|
if (allowedPrompts.length === 0) return false;
|
|
3576
3550
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3577
3551
|
}
|
|
3578
|
-
|
|
3579
3552
|
//#endregion
|
|
3580
3553
|
//#region src/evaluator.ts
|
|
3581
3554
|
/**
|
|
@@ -3767,7 +3740,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3767
3740
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3768
3741
|
else {
|
|
3769
3742
|
const activeProvider = require_types.isApiProvider(test.provider) ? test.provider : provider;
|
|
3770
|
-
require_logger.
|
|
3743
|
+
require_logger.logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3771
3744
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3772
3745
|
const callApiContext = {
|
|
3773
3746
|
vars,
|
|
@@ -3778,7 +3751,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3778
3751
|
filters,
|
|
3779
3752
|
originalProvider: provider,
|
|
3780
3753
|
test,
|
|
3781
|
-
logger: require_logger.
|
|
3754
|
+
logger: require_logger.logger,
|
|
3782
3755
|
getCache: require_cache.getCache,
|
|
3783
3756
|
repeatIndex
|
|
3784
3757
|
};
|
|
@@ -3795,8 +3768,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3795
3768
|
const sanitizedMetadata = require_logger.safeJsonStringify(response.metadata);
|
|
3796
3769
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3797
3770
|
}
|
|
3798
|
-
require_logger.
|
|
3799
|
-
require_logger.
|
|
3771
|
+
require_logger.logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3772
|
+
require_logger.logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3800
3773
|
}
|
|
3801
3774
|
latencyMs = Date.now() - startTime;
|
|
3802
3775
|
let conversationLastInput = void 0;
|
|
@@ -3813,12 +3786,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3813
3786
|
metadata: response.metadata
|
|
3814
3787
|
});
|
|
3815
3788
|
}
|
|
3816
|
-
require_logger.
|
|
3817
|
-
require_logger.
|
|
3789
|
+
require_logger.logger.debug("Evaluator response", { responsePreview: (require_logger.safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3790
|
+
require_logger.logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3818
3791
|
if (!response.cached && provider.delay > 0) {
|
|
3819
|
-
require_logger.
|
|
3792
|
+
require_logger.logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3820
3793
|
await require_fetch.sleep(provider.delay);
|
|
3821
|
-
} else if (response.cached) require_logger.
|
|
3794
|
+
} else if (response.cached) require_logger.logger.debug(`Skipping delay because response is cached`);
|
|
3822
3795
|
const ret = {
|
|
3823
3796
|
...setup,
|
|
3824
3797
|
response,
|
|
@@ -3921,7 +3894,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3921
3894
|
promptIdx,
|
|
3922
3895
|
testIdx
|
|
3923
3896
|
});
|
|
3924
|
-
if (!(err instanceof Error && err.name === "AbortError")) require_logger.
|
|
3897
|
+
if (!(err instanceof Error && err.name === "AbortError")) require_logger.logger.error("Provider call failed during eval", logContext);
|
|
3925
3898
|
return [{
|
|
3926
3899
|
...setup,
|
|
3927
3900
|
error: errorWithStack,
|
|
@@ -4004,7 +3977,7 @@ function generateVarCombinations(vars) {
|
|
|
4004
3977
|
let values = [];
|
|
4005
3978
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
4006
3979
|
const filePath = vars[key].slice(7);
|
|
4007
|
-
const basePath = require_logger.
|
|
3980
|
+
const basePath = require_logger.state.basePath || "";
|
|
4008
3981
|
values = ((0, glob.globSync)(filePath, {
|
|
4009
3982
|
cwd: basePath || process.cwd(),
|
|
4010
3983
|
windowsPathsNoEscape: true
|
|
@@ -4044,28 +4017,28 @@ var Evaluator = class {
|
|
|
4044
4017
|
this.conversations = {};
|
|
4045
4018
|
this.registers = {};
|
|
4046
4019
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
4047
|
-
this.rateLimitRegistry = require_providers.createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
4020
|
+
this.rateLimitRegistry = require_providers.createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
4048
4021
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
4049
|
-
require_logger.
|
|
4022
|
+
require_logger.logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
4050
4023
|
retryAfterMs: data.retryAfterMs,
|
|
4051
4024
|
resetAt: data.resetAt,
|
|
4052
4025
|
concurrencyChange: data.concurrencyChange
|
|
4053
4026
|
});
|
|
4054
4027
|
});
|
|
4055
4028
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
4056
|
-
require_logger.
|
|
4029
|
+
require_logger.logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
4057
4030
|
requestLimit: data.requestLimit,
|
|
4058
4031
|
tokenLimit: data.tokenLimit
|
|
4059
4032
|
});
|
|
4060
4033
|
});
|
|
4061
4034
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
4062
|
-
require_logger.
|
|
4035
|
+
require_logger.logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
4063
4036
|
previous: data.previous,
|
|
4064
4037
|
current: data.current
|
|
4065
4038
|
});
|
|
4066
4039
|
});
|
|
4067
4040
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
4068
|
-
require_logger.
|
|
4041
|
+
require_logger.logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
4069
4042
|
previous: data.previous,
|
|
4070
4043
|
current: data.current
|
|
4071
4044
|
});
|
|
@@ -4122,7 +4095,7 @@ var Evaluator = class {
|
|
|
4122
4095
|
const checkAbort = () => {
|
|
4123
4096
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4124
4097
|
};
|
|
4125
|
-
if (!options.silent) require_logger.
|
|
4098
|
+
if (!options.silent) require_logger.logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4126
4099
|
checkAbort();
|
|
4127
4100
|
const prompts = [];
|
|
4128
4101
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4134,32 +4107,32 @@ var Evaluator = class {
|
|
|
4134
4107
|
}
|
|
4135
4108
|
testSuite = (await require_providers.runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4136
4109
|
if (options.generateSuggestions) {
|
|
4137
|
-
require_logger.
|
|
4110
|
+
require_logger.logger.info(`Generating prompt variations...`);
|
|
4138
4111
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4139
4112
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4140
|
-
require_logger.
|
|
4113
|
+
require_logger.logger.info(chalk.default.blue("Generated prompts:"));
|
|
4141
4114
|
let numAdded = 0;
|
|
4142
4115
|
for (const prompt of newPrompts) {
|
|
4143
|
-
require_logger.
|
|
4144
|
-
require_logger.
|
|
4145
|
-
require_logger.
|
|
4116
|
+
require_logger.logger.info("--------------------------------------------------------");
|
|
4117
|
+
require_logger.logger.info(`${prompt}`);
|
|
4118
|
+
require_logger.logger.info("--------------------------------------------------------");
|
|
4146
4119
|
if (await require_server.promptYesNo("Do you want to test this prompt?", false)) {
|
|
4147
4120
|
testSuite.prompts.push({
|
|
4148
4121
|
raw: prompt,
|
|
4149
4122
|
label: prompt
|
|
4150
4123
|
});
|
|
4151
4124
|
numAdded++;
|
|
4152
|
-
} else require_logger.
|
|
4125
|
+
} else require_logger.logger.info("Skipping this prompt.");
|
|
4153
4126
|
}
|
|
4154
4127
|
if (numAdded < 1) {
|
|
4155
|
-
require_logger.
|
|
4128
|
+
require_logger.logger.info(chalk.default.red("No prompts selected. Aborting."));
|
|
4156
4129
|
process.exitCode = 1;
|
|
4157
4130
|
return this.evalRecord;
|
|
4158
4131
|
}
|
|
4159
4132
|
}
|
|
4160
4133
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4161
|
-
if (require_logger.
|
|
4162
|
-
require_logger.
|
|
4134
|
+
if (require_logger.state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4135
|
+
require_logger.logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4163
4136
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4164
4137
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4165
4138
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4197,7 +4170,7 @@ var Evaluator = class {
|
|
|
4197
4170
|
await this.evalRecord.addPrompts(prompts);
|
|
4198
4171
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4199
4172
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4200
|
-
require_telemetry.
|
|
4173
|
+
require_telemetry.telemetry.record("feature_used", { feature: "scenarios" });
|
|
4201
4174
|
let scenarioIndex = 0;
|
|
4202
4175
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4203
4176
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4261,7 +4234,7 @@ var Evaluator = class {
|
|
|
4261
4234
|
}
|
|
4262
4235
|
const runEvalOptions = [];
|
|
4263
4236
|
let testIdx = 0;
|
|
4264
|
-
let concurrency = options.maxConcurrency ||
|
|
4237
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4265
4238
|
for (let index = 0; index < tests.length; index++) {
|
|
4266
4239
|
const testCase = tests[index];
|
|
4267
4240
|
require_invariant.invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4281,7 +4254,7 @@ var Evaluator = class {
|
|
|
4281
4254
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4282
4255
|
if (require_types.isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4283
4256
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4284
|
-
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-
|
|
4257
|
+
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-CxmDwEFf.cjs"));
|
|
4285
4258
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4286
4259
|
} else testCase.provider = defaultProvider;
|
|
4287
4260
|
}
|
|
@@ -4308,7 +4281,7 @@ var Evaluator = class {
|
|
|
4308
4281
|
const promptId = require_utils.generateIdFromPrompt(prompt);
|
|
4309
4282
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4310
4283
|
if (promptIdx === void 0) {
|
|
4311
|
-
require_logger.
|
|
4284
|
+
require_logger.logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4312
4285
|
continue;
|
|
4313
4286
|
}
|
|
4314
4287
|
runEvalOptions.push({
|
|
@@ -4331,7 +4304,7 @@ var Evaluator = class {
|
|
|
4331
4304
|
options: testOptions
|
|
4332
4305
|
};
|
|
4333
4306
|
const tracingEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4334
|
-
require_logger.
|
|
4307
|
+
require_logger.logger.debug(`[Evaluator] Tracing check: env=${require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4335
4308
|
if (tracingEnabled) return {
|
|
4336
4309
|
...baseTest,
|
|
4337
4310
|
metadata: {
|
|
@@ -4364,27 +4337,27 @@ var Evaluator = class {
|
|
|
4364
4337
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4365
4338
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4366
4339
|
}
|
|
4367
|
-
if (require_logger.
|
|
4368
|
-
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-
|
|
4369
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.
|
|
4340
|
+
if (require_logger.state.resume && this.evalRecord.persisted) try {
|
|
4341
|
+
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-DvcJAWJU.cjs"));
|
|
4342
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.state.retryMode });
|
|
4370
4343
|
const originalCount = runEvalOptions.length;
|
|
4371
4344
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4372
4345
|
const step = runEvalOptions[i];
|
|
4373
4346
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4374
4347
|
}
|
|
4375
4348
|
const skipped = originalCount - runEvalOptions.length;
|
|
4376
|
-
if (skipped > 0) require_logger.
|
|
4349
|
+
if (skipped > 0) require_logger.logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4377
4350
|
} catch (err) {
|
|
4378
|
-
require_logger.
|
|
4351
|
+
require_logger.logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4379
4352
|
}
|
|
4380
4353
|
if (concurrency > 1) {
|
|
4381
4354
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4382
4355
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4383
4356
|
if (usesConversation) {
|
|
4384
|
-
require_logger.
|
|
4357
|
+
require_logger.logger.info(`Setting concurrency to 1 because the ${chalk.default.cyan("_conversation")} variable is used.`);
|
|
4385
4358
|
concurrency = 1;
|
|
4386
4359
|
} else if (usesStoreOutputAs) {
|
|
4387
|
-
require_logger.
|
|
4360
|
+
require_logger.logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4388
4361
|
concurrency = 1;
|
|
4389
4362
|
}
|
|
4390
4363
|
}
|
|
@@ -4415,14 +4388,14 @@ var Evaluator = class {
|
|
|
4415
4388
|
await this.evalRecord.addResult(row);
|
|
4416
4389
|
} catch (error) {
|
|
4417
4390
|
const resultSummary = require_logger.summarizeEvaluateResultForLogging(row);
|
|
4418
|
-
require_logger.
|
|
4391
|
+
require_logger.logger.error(`Error saving result: ${error} ${require_logger.safeJsonStringify(resultSummary)}`);
|
|
4419
4392
|
}
|
|
4420
4393
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4421
4394
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4422
4395
|
if (typeof httpStatus === "number" && require_cache.isNonTransientHttpStatus(httpStatus)) {
|
|
4423
4396
|
targetUnavailable = true;
|
|
4424
4397
|
targetErrorStatus = httpStatus;
|
|
4425
|
-
require_logger.
|
|
4398
|
+
require_logger.logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4426
4399
|
targetErrorAbortController.abort();
|
|
4427
4400
|
break;
|
|
4428
4401
|
}
|
|
@@ -4442,7 +4415,7 @@ var Evaluator = class {
|
|
|
4442
4415
|
if (testSuite.derivedMetrics) {
|
|
4443
4416
|
const math = await import("mathjs");
|
|
4444
4417
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4445
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) require_logger.
|
|
4418
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) require_logger.logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4446
4419
|
const evalContext = {
|
|
4447
4420
|
...metrics.namedScores,
|
|
4448
4421
|
__count: promptEvalCount
|
|
@@ -4457,7 +4430,7 @@ var Evaluator = class {
|
|
|
4457
4430
|
}
|
|
4458
4431
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4459
4432
|
} catch (error) {
|
|
4460
|
-
require_logger.
|
|
4433
|
+
require_logger.logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4461
4434
|
}
|
|
4462
4435
|
}
|
|
4463
4436
|
}
|
|
@@ -4496,7 +4469,7 @@ var Evaluator = class {
|
|
|
4496
4469
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4497
4470
|
evalStep.provider.cleanup();
|
|
4498
4471
|
} catch (cleanupErr) {
|
|
4499
|
-
require_logger.
|
|
4472
|
+
require_logger.logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4500
4473
|
}
|
|
4501
4474
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4502
4475
|
}, timeoutMs);
|
|
@@ -4560,8 +4533,8 @@ var Evaluator = class {
|
|
|
4560
4533
|
}
|
|
4561
4534
|
};
|
|
4562
4535
|
const originalProgressCallback = this.options.progressCallback;
|
|
4563
|
-
const isWebUI = Boolean(require_logger.
|
|
4564
|
-
require_logger.
|
|
4536
|
+
const isWebUI = Boolean(require_logger.state.webUI);
|
|
4537
|
+
require_logger.logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4565
4538
|
if (require_logger.isCI() && !isWebUI) {
|
|
4566
4539
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4567
4540
|
ciProgressReporter.start();
|
|
@@ -4571,20 +4544,20 @@ var Evaluator = class {
|
|
|
4571
4544
|
if (isWebUI) {
|
|
4572
4545
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4573
4546
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4574
|
-
require_logger.
|
|
4547
|
+
require_logger.logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4575
4548
|
} else if (progressBarManager) {
|
|
4576
4549
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4577
4550
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4578
4551
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4579
|
-
else require_logger.
|
|
4552
|
+
else require_logger.logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4580
4553
|
};
|
|
4581
4554
|
const serialRunEvalOptions = [];
|
|
4582
4555
|
const concurrentRunEvalOptions = [];
|
|
4583
4556
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4584
4557
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4585
4558
|
if (!this.options.silent) {
|
|
4586
|
-
if (serialRunEvalOptions.length > 0) require_logger.
|
|
4587
|
-
if (concurrentRunEvalOptions.length > 0) require_logger.
|
|
4559
|
+
if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4560
|
+
if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4588
4561
|
}
|
|
4589
4562
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4590
4563
|
try {
|
|
@@ -4593,7 +4566,7 @@ var Evaluator = class {
|
|
|
4593
4566
|
if (isWebUI) {
|
|
4594
4567
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4595
4568
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4596
|
-
require_logger.
|
|
4569
|
+
require_logger.logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4597
4570
|
}
|
|
4598
4571
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4599
4572
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4608,9 +4581,9 @@ var Evaluator = class {
|
|
|
4608
4581
|
});
|
|
4609
4582
|
} catch (err) {
|
|
4610
4583
|
if (combinedAbortSignal.aborted) {
|
|
4611
|
-
if (evalTimedOut) require_logger.
|
|
4584
|
+
if (evalTimedOut) require_logger.logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4612
4585
|
else if (!targetUnavailable) {
|
|
4613
|
-
require_logger.
|
|
4586
|
+
require_logger.logger.info("Evaluation interrupted, saving progress...");
|
|
4614
4587
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4615
4588
|
if (progressBarManager) progressBarManager.stop();
|
|
4616
4589
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4640,10 +4613,10 @@ var Evaluator = class {
|
|
|
4640
4613
|
let compareCount = 0;
|
|
4641
4614
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4642
4615
|
compareCount++;
|
|
4643
|
-
if (isWebUI) require_logger.
|
|
4616
|
+
if (isWebUI) require_logger.logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4644
4617
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4645
4618
|
if (resultsToCompare.length === 0) {
|
|
4646
|
-
require_logger.
|
|
4619
|
+
require_logger.logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4647
4620
|
continue;
|
|
4648
4621
|
}
|
|
4649
4622
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4705,16 +4678,16 @@ var Evaluator = class {
|
|
|
4705
4678
|
}
|
|
4706
4679
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4707
4680
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4708
|
-
else if (!isWebUI) require_logger.
|
|
4681
|
+
else if (!isWebUI) require_logger.logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4709
4682
|
}
|
|
4710
4683
|
}
|
|
4711
4684
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4712
4685
|
if (maxScoreRowsCount > 0) {
|
|
4713
|
-
require_logger.
|
|
4686
|
+
require_logger.logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4714
4687
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4715
4688
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4716
4689
|
if (resultsToCompare.length === 0) {
|
|
4717
|
-
require_logger.
|
|
4690
|
+
require_logger.logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4718
4691
|
continue;
|
|
4719
4692
|
}
|
|
4720
4693
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4722,7 +4695,7 @@ var Evaluator = class {
|
|
|
4722
4695
|
const maxScoreGradingResults = await require_graders.selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4723
4696
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4724
4697
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4725
|
-
else if (!isWebUI) require_logger.
|
|
4698
|
+
else if (!isWebUI) require_logger.logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4726
4699
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4727
4700
|
const result = resultsToCompare[index];
|
|
4728
4701
|
const maxScoreGradingResult = {
|
|
@@ -4766,7 +4739,7 @@ var Evaluator = class {
|
|
|
4766
4739
|
progressBarManager.stop();
|
|
4767
4740
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4768
4741
|
} catch (cleanupErr) {
|
|
4769
|
-
require_logger.
|
|
4742
|
+
require_logger.logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4770
4743
|
}
|
|
4771
4744
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4772
4745
|
if (evalTimedOut) {
|
|
@@ -4839,7 +4812,7 @@ var Evaluator = class {
|
|
|
4839
4812
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4840
4813
|
})));
|
|
4841
4814
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === require_types.ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4842
|
-
require_telemetry.
|
|
4815
|
+
require_telemetry.telemetry.record("eval_ran", {
|
|
4843
4816
|
numPrompts: prompts.length,
|
|
4844
4817
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4845
4818
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4887,26 +4860,26 @@ var Evaluator = class {
|
|
|
4887
4860
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4888
4861
|
const tracingEnabled = require_logger.getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4889
4862
|
if (tracingEnabled) {
|
|
4890
|
-
require_logger.
|
|
4863
|
+
require_logger.logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4891
4864
|
initializeOtel(getDefaultOtelConfig());
|
|
4892
4865
|
}
|
|
4893
4866
|
try {
|
|
4894
4867
|
return await this._runEvaluation();
|
|
4895
4868
|
} finally {
|
|
4896
4869
|
if (tracingEnabled) {
|
|
4897
|
-
require_logger.
|
|
4870
|
+
require_logger.logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4898
4871
|
await flushOtel();
|
|
4899
4872
|
await shutdownOtel();
|
|
4900
4873
|
}
|
|
4901
4874
|
if (isOtlpReceiverStarted()) {
|
|
4902
|
-
require_logger.
|
|
4875
|
+
require_logger.logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4903
4876
|
await require_fetch.sleep(3e3);
|
|
4904
4877
|
}
|
|
4905
4878
|
await stopOtlpReceiverIfNeeded();
|
|
4906
4879
|
await require_providerRegistry.providerRegistry.shutdownAll();
|
|
4907
4880
|
if (this.rateLimitRegistry) {
|
|
4908
4881
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4909
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) require_logger.
|
|
4882
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) require_logger.logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4910
4883
|
totalRequests: m.totalRequests,
|
|
4911
4884
|
completedRequests: m.completedRequests,
|
|
4912
4885
|
failedRequests: m.failedRequests,
|
|
@@ -4919,14 +4892,13 @@ var Evaluator = class {
|
|
|
4919
4892
|
}
|
|
4920
4893
|
this.rateLimitRegistry?.dispose();
|
|
4921
4894
|
require_providers.redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4922
|
-
require_logger.
|
|
4895
|
+
require_logger.state.maxConcurrency = void 0;
|
|
4923
4896
|
}
|
|
4924
4897
|
}
|
|
4925
4898
|
};
|
|
4926
4899
|
function evaluate$1(testSuite, evalRecord, options) {
|
|
4927
4900
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4928
4901
|
}
|
|
4929
|
-
|
|
4930
4902
|
//#endregion
|
|
4931
4903
|
//#region src/guardrails.ts
|
|
4932
4904
|
const API_BASE_URL = `${require_fetch.getShareApiBaseUrl()}/v1`;
|
|
@@ -4940,7 +4912,7 @@ async function makeRequest(endpoint, input) {
|
|
|
4940
4912
|
if (!response.data) throw new Error("No data returned from API");
|
|
4941
4913
|
return response.data;
|
|
4942
4914
|
} catch (error) {
|
|
4943
|
-
require_logger.
|
|
4915
|
+
require_logger.logger.error(`Guardrails API error: ${error}`);
|
|
4944
4916
|
throw error;
|
|
4945
4917
|
}
|
|
4946
4918
|
}
|
|
@@ -4957,7 +4929,7 @@ async function makeAdaptiveRequest(request) {
|
|
|
4957
4929
|
if (!response.data) throw new Error("No data returned from API");
|
|
4958
4930
|
return response.data;
|
|
4959
4931
|
} catch (error) {
|
|
4960
|
-
require_logger.
|
|
4932
|
+
require_logger.logger.error(`Guardrails API error: ${error}`);
|
|
4961
4933
|
throw error;
|
|
4962
4934
|
}
|
|
4963
4935
|
}
|
|
@@ -4975,8 +4947,6 @@ const guardrails = {
|
|
|
4975
4947
|
return makeAdaptiveRequest(request);
|
|
4976
4948
|
}
|
|
4977
4949
|
};
|
|
4978
|
-
var guardrails_default = guardrails;
|
|
4979
|
-
|
|
4980
4950
|
//#endregion
|
|
4981
4951
|
//#region src/migrate.ts
|
|
4982
4952
|
/**
|
|
@@ -5011,18 +4981,17 @@ async function runDbMigrations() {
|
|
|
5011
4981
|
const projectRoot = dir.split("dist/server/src")[0];
|
|
5012
4982
|
migrationsFolder = path.join(projectRoot, "dist", "promptfoo", "drizzle");
|
|
5013
4983
|
} else migrationsFolder = path.join(dir, "..", "drizzle");
|
|
5014
|
-
require_logger.
|
|
4984
|
+
require_logger.logger.debug(`Running database migrations from: ${migrationsFolder}`);
|
|
5015
4985
|
(0, drizzle_orm_better_sqlite3_migrator.migrate)(db, { migrationsFolder });
|
|
5016
|
-
require_logger.
|
|
4986
|
+
require_logger.logger.debug("Database migrations completed");
|
|
5017
4987
|
resolve();
|
|
5018
4988
|
} catch (error) {
|
|
5019
|
-
require_logger.
|
|
4989
|
+
require_logger.logger.error(`Database migration failed: ${error}`);
|
|
5020
4990
|
reject(error);
|
|
5021
4991
|
}
|
|
5022
4992
|
});
|
|
5023
4993
|
});
|
|
5024
4994
|
}
|
|
5025
|
-
|
|
5026
4995
|
//#endregion
|
|
5027
4996
|
//#region src/redteam/sharedFrontend.ts
|
|
5028
4997
|
function getRiskCategorySeverityMap(plugins) {
|
|
@@ -5039,7 +5008,6 @@ function getRiskCategorySeverityMap(plugins) {
|
|
|
5039
5008
|
...overrides
|
|
5040
5009
|
};
|
|
5041
5010
|
}
|
|
5042
|
-
|
|
5043
5011
|
//#endregion
|
|
5044
5012
|
//#region src/util/calculateFilteredMetrics.ts
|
|
5045
5013
|
/**
|
|
@@ -5093,12 +5061,12 @@ async function calculateFilteredMetrics(opts) {
|
|
|
5093
5061
|
try {
|
|
5094
5062
|
const countResult = await getResultCount(whereSql);
|
|
5095
5063
|
if (countResult > MAX_RESULTS_FOR_METRICS) {
|
|
5096
|
-
require_logger.
|
|
5064
|
+
require_logger.logger.warn(`Filtered result count ${countResult} exceeds limit ${MAX_RESULTS_FOR_METRICS}`, { evalId: opts.evalId });
|
|
5097
5065
|
throw new Error(`Result count ${countResult} exceeds maximum ${MAX_RESULTS_FOR_METRICS}`);
|
|
5098
5066
|
}
|
|
5099
5067
|
return await calculateWithOptimizedQuery(opts);
|
|
5100
5068
|
} catch (error) {
|
|
5101
|
-
require_logger.
|
|
5069
|
+
require_logger.logger.error("Failed to calculate filtered metrics with optimized query", { error });
|
|
5102
5070
|
return createEmptyMetricsArray(numPrompts);
|
|
5103
5071
|
}
|
|
5104
5072
|
}
|
|
@@ -5151,7 +5119,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5151
5119
|
for (const row of basicResults) {
|
|
5152
5120
|
const idx = row.prompt_idx;
|
|
5153
5121
|
if (idx < 0 || idx >= numPrompts) {
|
|
5154
|
-
require_logger.
|
|
5122
|
+
require_logger.logger.warn(`Invalid prompt_idx ${idx}, expected 0-${numPrompts - 1}`);
|
|
5155
5123
|
continue;
|
|
5156
5124
|
}
|
|
5157
5125
|
metrics[idx] = {
|
|
@@ -5176,7 +5144,7 @@ async function calculateWithOptimizedQuery(opts) {
|
|
|
5176
5144
|
}
|
|
5177
5145
|
await aggregateNamedScores(metrics, whereSql);
|
|
5178
5146
|
await aggregateAssertions(metrics, whereSql);
|
|
5179
|
-
require_logger.
|
|
5147
|
+
require_logger.logger.debug("Filtered metrics calculated", {
|
|
5180
5148
|
numPrompts,
|
|
5181
5149
|
metricsCount: basicResults.length
|
|
5182
5150
|
});
|
|
@@ -5297,7 +5265,6 @@ function createEmptyMetricsArray(numPrompts) {
|
|
|
5297
5265
|
cost: 0
|
|
5298
5266
|
}));
|
|
5299
5267
|
}
|
|
5300
|
-
|
|
5301
5268
|
//#endregion
|
|
5302
5269
|
//#region src/util/convertEvalResultsToTable.ts
|
|
5303
5270
|
/**
|
|
@@ -5430,7 +5397,6 @@ function convertResultsToTable(eval_) {
|
|
|
5430
5397
|
body: rows
|
|
5431
5398
|
};
|
|
5432
5399
|
}
|
|
5433
|
-
|
|
5434
5400
|
//#endregion
|
|
5435
5401
|
//#region src/util/exportToFile/index.ts
|
|
5436
5402
|
function convertEvalResultToTableCell(result) {
|
|
@@ -5508,7 +5474,6 @@ function convertTestResultsToTableRow(results, varsForHeader) {
|
|
|
5508
5474
|
for (const result of results) row.outputs[result.promptIdx] = convertEvalResultToTableCell(result);
|
|
5509
5475
|
return row;
|
|
5510
5476
|
}
|
|
5511
|
-
|
|
5512
5477
|
//#endregion
|
|
5513
5478
|
//#region src/models/evalPerformance.ts
|
|
5514
5479
|
const distinctCountCache = /* @__PURE__ */ new Map();
|
|
@@ -5525,7 +5490,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5525
5490
|
const cacheKey = `distinct:${evalId}`;
|
|
5526
5491
|
const cached = distinctCountCache.get(cacheKey);
|
|
5527
5492
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5528
|
-
require_logger.
|
|
5493
|
+
require_logger.logger.debug(`Using cached distinct count for eval ${evalId}: ${cached.count}`);
|
|
5529
5494
|
return cached.count;
|
|
5530
5495
|
}
|
|
5531
5496
|
const db = require_tables.getDb();
|
|
@@ -5533,7 +5498,7 @@ async function getCachedResultsCount(evalId) {
|
|
|
5533
5498
|
const result = db.select({ count: drizzle_orm.sql`COUNT(DISTINCT test_idx)` }).from(require_tables.evalResultsTable).where(drizzle_orm.sql`eval_id = ${evalId}`).all();
|
|
5534
5499
|
const count = Number(result[0]?.count ?? 0);
|
|
5535
5500
|
const duration = Date.now() - start;
|
|
5536
|
-
require_logger.
|
|
5501
|
+
require_logger.logger.debug(`Distinct count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5537
5502
|
distinctCountCache.set(cacheKey, {
|
|
5538
5503
|
count,
|
|
5539
5504
|
timestamp: Date.now()
|
|
@@ -5551,7 +5516,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5551
5516
|
const cacheKey = `total:${evalId}`;
|
|
5552
5517
|
const cached = totalRowCountCache.get(cacheKey);
|
|
5553
5518
|
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
|
|
5554
|
-
require_logger.
|
|
5519
|
+
require_logger.logger.debug(`Using cached total row count for eval ${evalId}: ${cached.count}`);
|
|
5555
5520
|
return cached.count;
|
|
5556
5521
|
}
|
|
5557
5522
|
const db = require_tables.getDb();
|
|
@@ -5559,7 +5524,7 @@ async function getTotalResultRowCount(evalId) {
|
|
|
5559
5524
|
const result = db.select({ count: drizzle_orm.sql`COUNT(*)` }).from(require_tables.evalResultsTable).where(drizzle_orm.sql`eval_id = ${evalId}`).all();
|
|
5560
5525
|
const count = Number(result[0]?.count ?? 0);
|
|
5561
5526
|
const duration = Date.now() - start;
|
|
5562
|
-
require_logger.
|
|
5527
|
+
require_logger.logger.debug(`Total row count query for eval ${evalId}: ${count} in ${duration}ms`);
|
|
5563
5528
|
totalRowCountCache.set(cacheKey, {
|
|
5564
5529
|
count,
|
|
5565
5530
|
timestamp: Date.now()
|
|
@@ -5592,7 +5557,7 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5592
5557
|
`;
|
|
5593
5558
|
const countResult = db.all(countQuery);
|
|
5594
5559
|
const filteredCount = Number(countResult[0]?.count ?? 0);
|
|
5595
|
-
require_logger.
|
|
5560
|
+
require_logger.logger.debug(`Optimized count query took ${Date.now() - countStart}ms`);
|
|
5596
5561
|
const idxStart = Date.now();
|
|
5597
5562
|
const idxQuery = drizzle_orm.sql`
|
|
5598
5563
|
SELECT DISTINCT test_idx
|
|
@@ -5603,13 +5568,12 @@ async function queryTestIndicesOptimized(evalId, opts) {
|
|
|
5603
5568
|
OFFSET ${offset}
|
|
5604
5569
|
`;
|
|
5605
5570
|
const testIndices = db.all(idxQuery).map((row) => row.test_idx);
|
|
5606
|
-
require_logger.
|
|
5571
|
+
require_logger.logger.debug(`Optimized index query took ${Date.now() - idxStart}ms`);
|
|
5607
5572
|
return {
|
|
5608
5573
|
testIndices,
|
|
5609
5574
|
filteredCount
|
|
5610
5575
|
};
|
|
5611
5576
|
}
|
|
5612
|
-
|
|
5613
5577
|
//#endregion
|
|
5614
5578
|
//#region src/models/eval.ts
|
|
5615
5579
|
/**
|
|
@@ -5704,7 +5668,7 @@ var EvalQueries = class {
|
|
|
5704
5668
|
try {
|
|
5705
5669
|
db.update(require_tables.evalsTable).set({ vars }).where((0, drizzle_orm.eq)(require_tables.evalsTable.id, evalId)).run();
|
|
5706
5670
|
} catch (e) {
|
|
5707
|
-
require_logger.
|
|
5671
|
+
require_logger.logger.error(`Error setting vars: ${vars} for eval ${evalId}: ${e}`);
|
|
5708
5672
|
}
|
|
5709
5673
|
}
|
|
5710
5674
|
static async getMetadataKeysFromEval(evalId, comparisonEvalIds = []) {
|
|
@@ -5725,7 +5689,7 @@ var EvalQueries = class {
|
|
|
5725
5689
|
`;
|
|
5726
5690
|
return (await db.all(query)).map((r) => r.key);
|
|
5727
5691
|
} catch (error) {
|
|
5728
|
-
require_logger.
|
|
5692
|
+
require_logger.logger.error(`Error fetching metadata keys for eval ${evalId} and comparisons [${comparisonEvalIds.join(", ")}]: ${error}`);
|
|
5729
5693
|
return [];
|
|
5730
5694
|
}
|
|
5731
5695
|
}
|
|
@@ -5756,7 +5720,7 @@ var EvalQueries = class {
|
|
|
5756
5720
|
const values = db.all(query).map(({ value }) => String(value).trim()).filter((value) => value.length > 0);
|
|
5757
5721
|
return Array.from(new Set(values));
|
|
5758
5722
|
} catch (error) {
|
|
5759
|
-
require_logger.
|
|
5723
|
+
require_logger.logger.error(`Error fetching metadata values for eval ${evalId} and key ${trimmedKey}: ${error instanceof Error ? error.message : String(error)}`);
|
|
5760
5724
|
return [];
|
|
5761
5725
|
}
|
|
5762
5726
|
}
|
|
@@ -5828,7 +5792,7 @@ var Eval = class Eval {
|
|
|
5828
5792
|
}
|
|
5829
5793
|
return evalInstance;
|
|
5830
5794
|
}
|
|
5831
|
-
static async getMany(limit =
|
|
5795
|
+
static async getMany(limit = 100) {
|
|
5832
5796
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).limit(limit).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
5833
5797
|
id: e.id,
|
|
5834
5798
|
createdAt: new Date(e.createdAt),
|
|
@@ -5843,7 +5807,7 @@ var Eval = class Eval {
|
|
|
5843
5807
|
* @param offset - Number of evals to skip
|
|
5844
5808
|
* @param limit - Maximum number of evals to return
|
|
5845
5809
|
*/
|
|
5846
|
-
static async getPaginated(offset = 0, limit =
|
|
5810
|
+
static async getPaginated(offset = 0, limit = 100) {
|
|
5847
5811
|
return (await require_tables.getDb().select().from(require_tables.evalsTable).orderBy((0, drizzle_orm.desc)(require_tables.evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
5848
5812
|
id: e.id,
|
|
5849
5813
|
createdAt: new Date(e.createdAt),
|
|
@@ -5889,7 +5853,7 @@ var Eval = class Eval {
|
|
|
5889
5853
|
evalId,
|
|
5890
5854
|
promptId
|
|
5891
5855
|
}).onConflictDoNothing().run();
|
|
5892
|
-
require_logger.
|
|
5856
|
+
require_logger.logger.debug(`Inserting prompt ${promptId}`);
|
|
5893
5857
|
}
|
|
5894
5858
|
if (opts?.results && opts.results.length > 0) {
|
|
5895
5859
|
const res = db.insert(require_tables.evalResultsTable).values(opts.results?.map((r) => ({
|
|
@@ -5897,7 +5861,7 @@ var Eval = class Eval {
|
|
|
5897
5861
|
evalId,
|
|
5898
5862
|
id: crypto.randomUUID()
|
|
5899
5863
|
}))).run();
|
|
5900
|
-
require_logger.
|
|
5864
|
+
require_logger.logger.debug(`Inserted ${res.changes} eval results`);
|
|
5901
5865
|
}
|
|
5902
5866
|
db.insert(require_tables.datasetsTable).values({
|
|
5903
5867
|
id: datasetId,
|
|
@@ -5907,7 +5871,7 @@ var Eval = class Eval {
|
|
|
5907
5871
|
evalId,
|
|
5908
5872
|
datasetId
|
|
5909
5873
|
}).onConflictDoNothing().run();
|
|
5910
|
-
require_logger.
|
|
5874
|
+
require_logger.logger.debug(`Inserting dataset ${datasetId}`);
|
|
5911
5875
|
if (config.tags) for (const [tagKey, tagValue] of Object.entries(config.tags)) {
|
|
5912
5876
|
const tagId = require_createHash.sha256(`${tagKey}:${tagValue}`);
|
|
5913
5877
|
db.insert(require_tables.tagsTable).values({
|
|
@@ -5919,7 +5883,7 @@ var Eval = class Eval {
|
|
|
5919
5883
|
evalId,
|
|
5920
5884
|
tagId
|
|
5921
5885
|
}).onConflictDoNothing().run();
|
|
5922
|
-
require_logger.
|
|
5886
|
+
require_logger.logger.debug(`Inserting tag ${tagId}`);
|
|
5923
5887
|
}
|
|
5924
5888
|
});
|
|
5925
5889
|
return new Eval(config, {
|
|
@@ -6100,7 +6064,7 @@ var Eval = class Eval {
|
|
|
6100
6064
|
if (type === "metric") {
|
|
6101
6065
|
const metricKey = field || value;
|
|
6102
6066
|
if (!metricKey) {
|
|
6103
|
-
require_logger.
|
|
6067
|
+
require_logger.logger.warn("Invalid metric filter: missing field and value", { filter });
|
|
6104
6068
|
return;
|
|
6105
6069
|
}
|
|
6106
6070
|
const jsonPath = buildSafeJsonPath(metricKey);
|
|
@@ -6114,7 +6078,7 @@ var Eval = class Eval {
|
|
|
6114
6078
|
else if (operator === "lt") condition = drizzle_orm.sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) < ${numericValue}`;
|
|
6115
6079
|
else if (operator === "lte") condition = drizzle_orm.sql`CAST(json_extract(named_scores, ${jsonPath}) AS REAL) <= ${numericValue}`;
|
|
6116
6080
|
} else {
|
|
6117
|
-
require_logger.
|
|
6081
|
+
require_logger.logger.warn("Invalid numeric value in metric filter", {
|
|
6118
6082
|
metricKey,
|
|
6119
6083
|
value,
|
|
6120
6084
|
numericValue,
|
|
@@ -6192,7 +6156,7 @@ var Eval = class Eval {
|
|
|
6192
6156
|
const countStart = Date.now();
|
|
6193
6157
|
const countResult = await db.get(filteredCountQuery);
|
|
6194
6158
|
const countEnd = Date.now();
|
|
6195
|
-
require_logger.
|
|
6159
|
+
require_logger.logger.debug(`Count query took ${countEnd - countStart}ms`);
|
|
6196
6160
|
const filteredCount = countResult?.count || 0;
|
|
6197
6161
|
const idxQuery = drizzle_orm.sql`
|
|
6198
6162
|
SELECT DISTINCT test_idx
|
|
@@ -6205,7 +6169,7 @@ var Eval = class Eval {
|
|
|
6205
6169
|
const idxStart = Date.now();
|
|
6206
6170
|
const rows = await db.all(idxQuery);
|
|
6207
6171
|
const idxEnd = Date.now();
|
|
6208
|
-
require_logger.
|
|
6172
|
+
require_logger.logger.debug(`Index query took ${idxEnd - idxStart}ms`);
|
|
6209
6173
|
return {
|
|
6210
6174
|
testIndices: rows.map((row) => row.test_idx),
|
|
6211
6175
|
filteredCount
|
|
@@ -6241,7 +6205,7 @@ var Eval = class Eval {
|
|
|
6241
6205
|
const hasComplexFilters = opts.filters && opts.filters.length > 0;
|
|
6242
6206
|
let queryResult;
|
|
6243
6207
|
if (hasComplexFilters) {
|
|
6244
|
-
require_logger.
|
|
6208
|
+
require_logger.logger.debug("Using original query for complex filters");
|
|
6245
6209
|
queryResult = await this.queryTestIndices({
|
|
6246
6210
|
offset: opts.offset,
|
|
6247
6211
|
limit: opts.limit,
|
|
@@ -6250,7 +6214,7 @@ var Eval = class Eval {
|
|
|
6250
6214
|
filters: opts.filters
|
|
6251
6215
|
});
|
|
6252
6216
|
} else {
|
|
6253
|
-
require_logger.
|
|
6217
|
+
require_logger.logger.debug("Using optimized query for table page");
|
|
6254
6218
|
queryResult = await queryTestIndicesOptimized(this.id, {
|
|
6255
6219
|
offset: opts.offset,
|
|
6256
6220
|
limit: opts.limit,
|
|
@@ -6265,12 +6229,12 @@ var Eval = class Eval {
|
|
|
6265
6229
|
const varsStart = Date.now();
|
|
6266
6230
|
const vars = Array.from(this.vars);
|
|
6267
6231
|
const varsEnd = Date.now();
|
|
6268
|
-
require_logger.
|
|
6232
|
+
require_logger.logger.debug(`Vars query took ${varsEnd - varsStart}ms`);
|
|
6269
6233
|
const body = [];
|
|
6270
6234
|
const bodyStart = Date.now();
|
|
6271
6235
|
if (testIndices.length === 0) {
|
|
6272
6236
|
const bodyEnd = Date.now();
|
|
6273
|
-
require_logger.
|
|
6237
|
+
require_logger.logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6274
6238
|
return {
|
|
6275
6239
|
head: {
|
|
6276
6240
|
prompts: this.prompts,
|
|
@@ -6302,7 +6266,7 @@ var Eval = class Eval {
|
|
|
6302
6266
|
if (results.length > 0) body.push(convertTestResultsToTableRow(results, vars));
|
|
6303
6267
|
}
|
|
6304
6268
|
const bodyEnd = Date.now();
|
|
6305
|
-
require_logger.
|
|
6269
|
+
require_logger.logger.debug(`Body query took ${bodyEnd - bodyStart}ms`);
|
|
6306
6270
|
return {
|
|
6307
6271
|
head: {
|
|
6308
6272
|
prompts: this.prompts,
|
|
@@ -6415,7 +6379,7 @@ var Eval = class Eval {
|
|
|
6415
6379
|
})
|
|
6416
6380
|
}));
|
|
6417
6381
|
} catch (error) {
|
|
6418
|
-
require_logger.
|
|
6382
|
+
require_logger.logger.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
|
|
6419
6383
|
return [];
|
|
6420
6384
|
}
|
|
6421
6385
|
}
|
|
@@ -6452,7 +6416,7 @@ var Eval = class Eval {
|
|
|
6452
6416
|
const newEvalId = createEvalId(/* @__PURE__ */ new Date());
|
|
6453
6417
|
const copyDescription = description || `${this.description || "Evaluation"} (Copy)`;
|
|
6454
6418
|
const testCount = distinctTestCount ?? await this.getResultsCount();
|
|
6455
|
-
require_logger.
|
|
6419
|
+
require_logger.logger.info("Starting eval copy", {
|
|
6456
6420
|
sourceEvalId: this.id,
|
|
6457
6421
|
targetEvalId: newEvalId,
|
|
6458
6422
|
distinctTestCount: testCount
|
|
@@ -6515,7 +6479,7 @@ var Eval = class Eval {
|
|
|
6515
6479
|
db.insert(require_tables.evalResultsTable).values(copiedResults).run();
|
|
6516
6480
|
copiedCount += batch.length;
|
|
6517
6481
|
offset += BATCH_SIZE;
|
|
6518
|
-
require_logger.
|
|
6482
|
+
require_logger.logger.debug("Copied batch of eval results", {
|
|
6519
6483
|
sourceEvalId: this.id,
|
|
6520
6484
|
targetEvalId: newEvalId,
|
|
6521
6485
|
batchSize: batch.length,
|
|
@@ -6524,7 +6488,7 @@ var Eval = class Eval {
|
|
|
6524
6488
|
});
|
|
6525
6489
|
}
|
|
6526
6490
|
});
|
|
6527
|
-
require_logger.
|
|
6491
|
+
require_logger.logger.info("Eval copy completed successfully", {
|
|
6528
6492
|
sourceEvalId: this.id,
|
|
6529
6493
|
targetEvalId: newEvalId,
|
|
6530
6494
|
rowsCopied: copiedCount,
|
|
@@ -6539,7 +6503,6 @@ var Eval = class Eval {
|
|
|
6539
6503
|
this._shared = shared;
|
|
6540
6504
|
}
|
|
6541
6505
|
};
|
|
6542
|
-
|
|
6543
6506
|
//#endregion
|
|
6544
6507
|
//#region src/assertions/validateAssertions.ts
|
|
6545
6508
|
var AssertValidationError = class extends Error {
|
|
@@ -6591,7 +6554,6 @@ function validateAssertions(tests, defaultTest) {
|
|
|
6591
6554
|
}
|
|
6592
6555
|
}
|
|
6593
6556
|
}
|
|
6594
|
-
|
|
6595
6557
|
//#endregion
|
|
6596
6558
|
//#region src/commands/eval/filterPrompts.ts
|
|
6597
6559
|
/**
|
|
@@ -6617,7 +6579,6 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
6617
6579
|
return promptId && filterRegex.test(promptId) || promptLabel && filterRegex.test(promptLabel);
|
|
6618
6580
|
});
|
|
6619
6581
|
}
|
|
6620
|
-
|
|
6621
6582
|
//#endregion
|
|
6622
6583
|
//#region src/commands/eval/filterProviders.ts
|
|
6623
6584
|
/**
|
|
@@ -6698,7 +6659,6 @@ function filterProviders(providers, filterProvidersOption) {
|
|
|
6698
6659
|
return filterRegex.test(providerId) || providerLabel && filterRegex.test(providerLabel);
|
|
6699
6660
|
});
|
|
6700
6661
|
}
|
|
6701
|
-
|
|
6702
6662
|
//#endregion
|
|
6703
6663
|
//#region src/commands/eval/filterTestsUtil.ts
|
|
6704
6664
|
/**
|
|
@@ -6726,35 +6686,35 @@ function mergeDefaultVars(test, defaultTest) {
|
|
|
6726
6686
|
*/
|
|
6727
6687
|
async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
6728
6688
|
if (!testSuite.tests) {
|
|
6729
|
-
require_logger.
|
|
6689
|
+
require_logger.logger.debug("[filterTestsByResults] No tests in test suite");
|
|
6730
6690
|
return [];
|
|
6731
6691
|
}
|
|
6732
|
-
require_logger.
|
|
6692
|
+
require_logger.logger.debug(`[filterTestsByResults] Loading results from: ${pathOrId}`);
|
|
6733
6693
|
let results;
|
|
6734
6694
|
try {
|
|
6735
6695
|
if (pathOrId.endsWith(".json")) results = (await require_util.readOutput(pathOrId)).results;
|
|
6736
6696
|
else {
|
|
6737
6697
|
const eval_ = await Eval.findById(pathOrId);
|
|
6738
6698
|
if (!eval_) {
|
|
6739
|
-
require_logger.
|
|
6699
|
+
require_logger.logger.warn(`[filterTestsByResults] Evaluation not found: ${pathOrId}`);
|
|
6740
6700
|
return [];
|
|
6741
6701
|
}
|
|
6742
6702
|
const summary = await eval_.toEvaluateSummary();
|
|
6743
6703
|
if ("results" in summary) results = { results: summary.results };
|
|
6744
6704
|
else {
|
|
6745
|
-
require_logger.
|
|
6705
|
+
require_logger.logger.debug("[filterTestsByResults] No results in evaluation summary");
|
|
6746
6706
|
return [];
|
|
6747
6707
|
}
|
|
6748
6708
|
}
|
|
6749
6709
|
} catch (error) {
|
|
6750
|
-
require_logger.
|
|
6710
|
+
require_logger.logger.warn(`[filterTestsByResults] Error loading results: ${error}`);
|
|
6751
6711
|
return [];
|
|
6752
6712
|
}
|
|
6753
6713
|
const filteredResults = results.results.filter(filterFn);
|
|
6754
|
-
require_logger.
|
|
6714
|
+
require_logger.logger.debug(`[filterTestsByResults] Found ${filteredResults.length} matching results out of ${results.results.length} total`);
|
|
6755
6715
|
if (filteredResults.length === 0) return [];
|
|
6756
6716
|
const uniqueVarsInResults = new Set(filteredResults.map((r) => JSON.stringify(require_util.filterRuntimeVars(r.vars))));
|
|
6757
|
-
require_logger.
|
|
6717
|
+
require_logger.logger.debug(`[filterTestsByResults] ${uniqueVarsInResults.size} unique test cases (by vars) in filtered results`);
|
|
6758
6718
|
const matchedTests = [];
|
|
6759
6719
|
for (const test of testSuite.tests) {
|
|
6760
6720
|
const testWithDefaults = mergeDefaultVars(test, testSuite.defaultTest);
|
|
@@ -6776,15 +6736,15 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6776
6736
|
...runtimeVars
|
|
6777
6737
|
}
|
|
6778
6738
|
};
|
|
6779
|
-
require_logger.
|
|
6739
|
+
require_logger.logger.debug("[filterTestsByResults] Restored runtime vars for test", { varKeys: Object.keys(runtimeVars) });
|
|
6780
6740
|
matchedTests.push(testWithRuntimeVars);
|
|
6781
6741
|
} else {
|
|
6782
|
-
require_logger.
|
|
6742
|
+
require_logger.logger.debug("[filterTestsByResults] Matched test has no runtime vars to restore");
|
|
6783
6743
|
matchedTests.push(test);
|
|
6784
6744
|
}
|
|
6785
6745
|
}
|
|
6786
6746
|
}
|
|
6787
|
-
require_logger.
|
|
6747
|
+
require_logger.logger.debug(`[filterTestsByResults] Matched ${matchedTests.length} tests out of ${testSuite.tests.length} in test suite`);
|
|
6788
6748
|
const extractedTests = [];
|
|
6789
6749
|
const matchedResultKeys = /* @__PURE__ */ new Set();
|
|
6790
6750
|
for (const result of filteredResults) for (const test of matchedTests) if (require_util.resultIsForTestCase(result, mergeDefaultVars(test, testSuite.defaultTest))) {
|
|
@@ -6795,7 +6755,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6795
6755
|
const resultKey = JSON.stringify(require_util.filterRuntimeVars(result.vars));
|
|
6796
6756
|
if (matchedResultKeys.has(resultKey)) continue;
|
|
6797
6757
|
if (!result.testCase) {
|
|
6798
|
-
require_logger.
|
|
6758
|
+
require_logger.logger.debug("[filterTestsByResults] Skipping result without testCase data for extraction");
|
|
6799
6759
|
continue;
|
|
6800
6760
|
}
|
|
6801
6761
|
if (extractedTests.some((t) => JSON.stringify(require_util.filterRuntimeVars(t.vars)) === resultKey)) continue;
|
|
@@ -6807,12 +6767,11 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6807
6767
|
options: result.testCase.options
|
|
6808
6768
|
});
|
|
6809
6769
|
}
|
|
6810
|
-
if (extractedTests.length > 0) require_logger.
|
|
6811
|
-
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) require_logger.
|
|
6812
|
-
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) require_logger.
|
|
6770
|
+
if (extractedTests.length > 0) require_logger.logger.info(`[filterTestsByResults] Extracted ${extractedTests.length} runtime-generated test(s) from results`);
|
|
6771
|
+
if (matchedTests.length === 0 && extractedTests.length === 0 && filteredResults.length > 0) require_logger.logger.warn(`[filterTestsByResults] No tests matched ${filteredResults.length} filtered results. This may indicate a vars or provider mismatch between stored results and current test suite. Use LOG_LEVEL=debug for detailed matching info.`);
|
|
6772
|
+
else if (matchedTests.length + extractedTests.length < uniqueVarsInResults.size) require_logger.logger.debug(`[filterTestsByResults] Note: ${uniqueVarsInResults.size - matchedTests.length - extractedTests.length} unique test cases in results did not match any test in the current test suite and could not be extracted. This may indicate results without testCase data.`);
|
|
6813
6773
|
return require_util.deduplicateTestCases([...matchedTests, ...extractedTests]);
|
|
6814
6774
|
}
|
|
6815
|
-
|
|
6816
6775
|
//#endregion
|
|
6817
6776
|
//#region src/commands/eval/filterTests.ts
|
|
6818
6777
|
/**
|
|
@@ -6838,7 +6797,7 @@ async function filterTestsByResults(testSuite, pathOrId, filterFn) {
|
|
|
6838
6797
|
* @param reason - Description of what the filter was looking for (e.g., 'no failures/errors')
|
|
6839
6798
|
*/
|
|
6840
6799
|
function logNoTestsWarning(filterType, pathOrId, reason) {
|
|
6841
|
-
require_logger.
|
|
6800
|
+
require_logger.logger.warn(`--${filterType} returned no tests. The evaluation "${pathOrId}" may have ${reason}, or the test suite may have changed since the evaluation was run.`);
|
|
6842
6801
|
}
|
|
6843
6802
|
/**
|
|
6844
6803
|
* Filters a test suite to only include all tests that did not pass (failures + errors)
|
|
@@ -6884,10 +6843,10 @@ async function filterErrorTests(testSuite, pathOrId) {
|
|
|
6884
6843
|
*/
|
|
6885
6844
|
async function filterTests(testSuite, options) {
|
|
6886
6845
|
let tests = testSuite.tests || [];
|
|
6887
|
-
require_logger.
|
|
6888
|
-
require_logger.
|
|
6846
|
+
require_logger.logger.debug(`Starting filterTests with options: ${JSON.stringify(options)}`);
|
|
6847
|
+
require_logger.logger.debug(`Initial test count: ${tests.length}`);
|
|
6889
6848
|
if (Object.keys(options).length === 0) {
|
|
6890
|
-
require_logger.
|
|
6849
|
+
require_logger.logger.debug("No filter options provided, returning all tests");
|
|
6891
6850
|
return tests;
|
|
6892
6851
|
}
|
|
6893
6852
|
if (options.metadata) {
|
|
@@ -6902,11 +6861,11 @@ async function filterTests(testSuite, options) {
|
|
|
6902
6861
|
value
|
|
6903
6862
|
});
|
|
6904
6863
|
}
|
|
6905
|
-
require_logger.
|
|
6906
|
-
require_logger.
|
|
6864
|
+
require_logger.logger.debug(`Filtering for metadata conditions (AND logic): ${parsedFilters.map((f) => `${f.key}=${f.value}`).join(", ")}`);
|
|
6865
|
+
require_logger.logger.debug(`Before metadata filter: ${tests.length} tests`);
|
|
6907
6866
|
tests = tests.filter((test) => {
|
|
6908
6867
|
if (!test.metadata) {
|
|
6909
|
-
require_logger.
|
|
6868
|
+
require_logger.logger.debug(`Test has no metadata: ${test.description || "unnamed test"}`);
|
|
6910
6869
|
return false;
|
|
6911
6870
|
}
|
|
6912
6871
|
for (const { key, value } of parsedFilters) {
|
|
@@ -6915,16 +6874,16 @@ async function filterTests(testSuite, options) {
|
|
|
6915
6874
|
if (Array.isArray(testValue)) matches = testValue.some((v) => v.toString().includes(value));
|
|
6916
6875
|
else if (testValue !== void 0) matches = testValue.toString().includes(value);
|
|
6917
6876
|
if (!matches) {
|
|
6918
|
-
require_logger.
|
|
6877
|
+
require_logger.logger.debug(`Test "${test.description || "unnamed test"}" metadata doesn't match. Expected ${key} to include ${value}, got ${JSON.stringify(test.metadata)}`);
|
|
6919
6878
|
return false;
|
|
6920
6879
|
}
|
|
6921
6880
|
}
|
|
6922
6881
|
return true;
|
|
6923
6882
|
});
|
|
6924
|
-
require_logger.
|
|
6883
|
+
require_logger.logger.debug(`After metadata filter: ${tests.length} tests remain`);
|
|
6925
6884
|
}
|
|
6926
6885
|
if (options.failingOnly && options.errorsOnly) {
|
|
6927
|
-
require_logger.
|
|
6886
|
+
require_logger.logger.debug("Using both --filter-failing-only and --filter-errors-only together (equivalent to --filter-failing)");
|
|
6928
6887
|
const failingOnlyTests = await filterFailingOnlyTests(testSuite, options.failingOnly);
|
|
6929
6888
|
const errorTests = await filterErrorTests(testSuite, options.errorsOnly);
|
|
6930
6889
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -6934,8 +6893,8 @@ async function filterTests(testSuite, options) {
|
|
|
6934
6893
|
seen.add(key);
|
|
6935
6894
|
return true;
|
|
6936
6895
|
});
|
|
6937
|
-
require_logger.
|
|
6938
|
-
if (tests.length === 0) require_logger.
|
|
6896
|
+
require_logger.logger.debug(`Combined failingOnly (${failingOnlyTests.length}) and errors (${errorTests.length}) filters: ${tests.length} unique tests`);
|
|
6897
|
+
if (tests.length === 0) require_logger.logger.warn("Combined --filter-failing-only and --filter-errors-only returned no tests. The specified evaluations may have no failures or errors, or the test suite may have changed.");
|
|
6939
6898
|
} else if (options.failing) {
|
|
6940
6899
|
tests = await filterFailingTests(testSuite, options.failing);
|
|
6941
6900
|
if (tests.length === 0) logNoTestsWarning("filter-failing", options.failing, "no failures/errors");
|
|
@@ -6972,7 +6931,6 @@ async function filterTests(testSuite, options) {
|
|
|
6972
6931
|
}
|
|
6973
6932
|
return tests;
|
|
6974
6933
|
}
|
|
6975
|
-
|
|
6976
6934
|
//#endregion
|
|
6977
6935
|
//#region src/util/promptfooCommand.ts
|
|
6978
6936
|
/**
|
|
@@ -7018,7 +6976,6 @@ function promptfooCommand(subcommand) {
|
|
|
7018
6976
|
if (detectInstaller() === "npx") return subcommand ? `npx promptfoo@latest ${subcommand}` : "npx promptfoo@latest";
|
|
7019
6977
|
return subcommand ? `promptfoo ${subcommand}` : "promptfoo";
|
|
7020
6978
|
}
|
|
7021
|
-
|
|
7022
6979
|
//#endregion
|
|
7023
6980
|
//#region src/csv.ts
|
|
7024
6981
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = .8;
|
|
@@ -7110,7 +7067,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7110
7067
|
if (!key.startsWith("__") && specialKeys.some((k) => key.startsWith(k)) && !uniqueErrorMessages.has(key)) {
|
|
7111
7068
|
const error = `You used a single underscore for the key "${key}". Did you mean to use "${key.replace("_", "__")}" instead?`;
|
|
7112
7069
|
uniqueErrorMessages.add(key);
|
|
7113
|
-
require_logger.
|
|
7070
|
+
require_logger.logger.warn(error);
|
|
7114
7071
|
}
|
|
7115
7072
|
if (key.startsWith("__expected")) {
|
|
7116
7073
|
if (value.trim() !== "") asserts.push(assertionFromString(value.trim()));
|
|
@@ -7128,10 +7085,10 @@ function testCaseFromCsvRow(row) {
|
|
|
7128
7085
|
} else if (value.trim() !== "") metadata[metadataKey] = value;
|
|
7129
7086
|
} else if (key === "__metadata" && !uniqueErrorMessages.has(key)) {
|
|
7130
7087
|
uniqueErrorMessages.add(key);
|
|
7131
|
-
require_logger.
|
|
7088
|
+
require_logger.logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7132
7089
|
} else if (key.startsWith("__config:")) {
|
|
7133
7090
|
const configParts = key.slice(9).split(":");
|
|
7134
|
-
if (configParts.length !== 2) require_logger.
|
|
7091
|
+
if (configParts.length !== 2) require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7135
7092
|
else {
|
|
7136
7093
|
const [expectedKey, configKey] = configParts;
|
|
7137
7094
|
let targetIndex;
|
|
@@ -7141,11 +7098,11 @@ function testCaseFromCsvRow(row) {
|
|
|
7141
7098
|
if (indexMatch) targetIndex = Number.parseInt(indexMatch[1], 10) - 1;
|
|
7142
7099
|
}
|
|
7143
7100
|
if (targetIndex === void 0) {
|
|
7144
|
-
require_logger.
|
|
7101
|
+
require_logger.logger.error(`Invalid expected key "${expectedKey}" in __config column "${key}". Must be __expected or __expected<N> where N is a positive integer.`);
|
|
7145
7102
|
throw new Error(`Invalid expected key "${expectedKey}" in __config column`);
|
|
7146
7103
|
}
|
|
7147
7104
|
if (!["threshold"].includes(configKey)) {
|
|
7148
|
-
require_logger.
|
|
7105
|
+
require_logger.logger.error(`Invalid config key "${configKey}" in __config column "${key}". Valid config keys include: threshold`);
|
|
7149
7106
|
throw new Error(`Invalid config key "${configKey}" in __config column`);
|
|
7150
7107
|
}
|
|
7151
7108
|
if (!assertionConfigs[targetIndex]) assertionConfigs[targetIndex] = {};
|
|
@@ -7153,7 +7110,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7153
7110
|
if (configKey === "threshold") {
|
|
7154
7111
|
parsedValue = Number.parseFloat(value);
|
|
7155
7112
|
if (!Number.isFinite(parsedValue)) {
|
|
7156
|
-
require_logger.
|
|
7113
|
+
require_logger.logger.error(`Invalid numeric value "${value}" for config key "${configKey}" in column "${key}"`);
|
|
7157
7114
|
throw new Error(`Invalid numeric value for ${configKey}`);
|
|
7158
7115
|
}
|
|
7159
7116
|
}
|
|
@@ -7180,7 +7137,6 @@ function testCaseFromCsvRow(row) {
|
|
|
7180
7137
|
...Object.keys(metadata).length > 0 ? { metadata } : {}
|
|
7181
7138
|
};
|
|
7182
7139
|
}
|
|
7183
|
-
|
|
7184
7140
|
//#endregion
|
|
7185
7141
|
//#region src/microsoftSharepoint.ts
|
|
7186
7142
|
let cca = null;
|
|
@@ -7200,7 +7156,7 @@ async function fetchCsvFromSharepoint(url) {
|
|
|
7200
7156
|
const fileRelativeUrl = url.startsWith(normalizedBaseUrl) ? url.slice(normalizedBaseUrl.length) : url;
|
|
7201
7157
|
const serverRelativeUrl = fileRelativeUrl.startsWith("/") ? fileRelativeUrl : `/${fileRelativeUrl}`;
|
|
7202
7158
|
const apiUrl = `${normalizedBaseUrl}/_api/web/GetFileByServerRelativeUrl('${encodeURI(serverRelativeUrl)}')/$value`;
|
|
7203
|
-
require_logger.
|
|
7159
|
+
require_logger.logger.debug(`Fetching CSV from SharePoint: ${apiUrl}`);
|
|
7204
7160
|
const response = await require_fetch.fetchWithProxy(apiUrl, { headers: {
|
|
7205
7161
|
Authorization: `Bearer ${accessToken}`,
|
|
7206
7162
|
Accept: "text/csv"
|
|
@@ -7257,7 +7213,6 @@ async function getSharePointAccessToken() {
|
|
|
7257
7213
|
if (!tokenResult?.accessToken) throw new Error("Failed to acquire SharePoint access token. Please check your authentication configuration.");
|
|
7258
7214
|
return tokenResult.accessToken;
|
|
7259
7215
|
}
|
|
7260
|
-
|
|
7261
7216
|
//#endregion
|
|
7262
7217
|
//#region src/util/xlsx.ts
|
|
7263
7218
|
async function parseXlsxFile(filePath) {
|
|
@@ -7317,7 +7272,6 @@ async function parseXlsxFile(filePath) {
|
|
|
7317
7272
|
throw new Error(`Failed to parse Excel file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
|
|
7318
7273
|
}
|
|
7319
7274
|
}
|
|
7320
|
-
|
|
7321
7275
|
//#endregion
|
|
7322
7276
|
//#region src/util/testCaseReader.ts
|
|
7323
7277
|
async function readTestFiles(pathOrGlobs, basePath = "") {
|
|
@@ -7363,29 +7317,29 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7363
7317
|
const fileExtension = (0, path.parse)(pathWithoutFunction).ext.slice(1);
|
|
7364
7318
|
const extensionWithoutSheet = fileExtension.split("#")[0];
|
|
7365
7319
|
if (varsPath.startsWith("huggingface://datasets/")) {
|
|
7366
|
-
require_telemetry.
|
|
7320
|
+
require_telemetry.telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7367
7321
|
return await require_graders.fetchHuggingFaceDataset(varsPath);
|
|
7368
7322
|
}
|
|
7369
7323
|
if (require_fileExtensions.isJavascriptFile(pathWithoutFunction)) {
|
|
7370
|
-
require_telemetry.
|
|
7324
|
+
require_telemetry.telemetry.record("feature_used", { feature: "js tests file" });
|
|
7371
7325
|
const mod = await require_esm.importModule(pathWithoutFunction, maybeFunctionName);
|
|
7372
7326
|
return typeof mod === "function" ? await mod(finalConfig) : mod;
|
|
7373
7327
|
}
|
|
7374
7328
|
if (fileExtension === "py") {
|
|
7375
|
-
require_telemetry.
|
|
7329
|
+
require_telemetry.telemetry.record("feature_used", { feature: "python tests file" });
|
|
7376
7330
|
const result = await require_pythonUtils.runPython(pathWithoutFunction, maybeFunctionName ?? "generate_tests", finalConfig === void 0 ? [] : [finalConfig]);
|
|
7377
7331
|
if (!Array.isArray(result)) throw new Error(`Python test function must return a list of test cases, got ${typeof result}`);
|
|
7378
7332
|
return result;
|
|
7379
7333
|
}
|
|
7380
7334
|
let rows = [];
|
|
7381
7335
|
if (varsPath.startsWith("https://docs.google.com/spreadsheets/")) {
|
|
7382
|
-
require_telemetry.
|
|
7336
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - google sheet" });
|
|
7383
7337
|
rows = await require_util.fetchCsvFromGoogleSheet(varsPath);
|
|
7384
7338
|
} else if (/https:\/\/[^/]+\.sharepoint\.com\//i.test(varsPath)) {
|
|
7385
|
-
require_telemetry.
|
|
7339
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - sharepoint" });
|
|
7386
7340
|
rows = await fetchCsvFromSharepoint(varsPath);
|
|
7387
7341
|
} else if (fileExtension === "csv") {
|
|
7388
|
-
require_telemetry.
|
|
7342
|
+
require_telemetry.telemetry.record("feature_used", { feature: "csv tests file - local" });
|
|
7389
7343
|
const delimiter = require_logger.getEnvString("PROMPTFOO_CSV_DELIMITER", ",");
|
|
7390
7344
|
const fileContent = await fs_promises.readFile(resolvedVarsPath, "utf-8");
|
|
7391
7345
|
const enforceStrict = require_logger.getEnvBool("PROMPTFOO_CSV_STRICT", false);
|
|
@@ -7417,10 +7371,10 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7417
7371
|
throw e;
|
|
7418
7372
|
}
|
|
7419
7373
|
} else if (extensionWithoutSheet === "xlsx" || extensionWithoutSheet === "xls") {
|
|
7420
|
-
require_telemetry.
|
|
7374
|
+
require_telemetry.telemetry.record("feature_used", { feature: "xlsx tests file - local" });
|
|
7421
7375
|
rows = await parseXlsxFile(resolvedVarsPath);
|
|
7422
7376
|
} else if (fileExtension === "json") {
|
|
7423
|
-
require_telemetry.
|
|
7377
|
+
require_telemetry.telemetry.record("feature_used", { feature: "json tests file" });
|
|
7424
7378
|
const fileContent = await fs_promises.readFile(resolvedVarsPath, "utf-8");
|
|
7425
7379
|
const jsonData = js_yaml.default.load(fileContent);
|
|
7426
7380
|
return (Array.isArray(jsonData) ? jsonData : [jsonData]).map((item, idx) => ({
|
|
@@ -7428,7 +7382,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7428
7382
|
description: item.description || `Row #${idx + 1}`
|
|
7429
7383
|
}));
|
|
7430
7384
|
} else if (fileExtension === "jsonl") {
|
|
7431
|
-
require_telemetry.
|
|
7385
|
+
require_telemetry.telemetry.record("feature_used", { feature: "jsonl tests file" });
|
|
7432
7386
|
return (await fs_promises.readFile(resolvedVarsPath, "utf-8")).split("\n").filter((line) => line.trim()).map((line, idx) => {
|
|
7433
7387
|
return {
|
|
7434
7388
|
...JSON.parse(line),
|
|
@@ -7436,7 +7390,7 @@ async function readStandaloneTestsFile(varsPath, basePath = "", config) {
|
|
|
7436
7390
|
};
|
|
7437
7391
|
});
|
|
7438
7392
|
} else if (fileExtension === "yaml" || fileExtension === "yml") {
|
|
7439
|
-
require_telemetry.
|
|
7393
|
+
require_telemetry.telemetry.record("feature_used", { feature: "yaml tests file" });
|
|
7440
7394
|
rows = require_util.maybeLoadConfigFromExternalFile(js_yaml.default.load(await fs_promises.readFile(resolvedVarsPath, "utf-8")));
|
|
7441
7395
|
}
|
|
7442
7396
|
return rows.map((row, idx) => {
|
|
@@ -7480,7 +7434,7 @@ async function readTest(test, basePath = "", isDefaultTest = false) {
|
|
|
7480
7434
|
*/
|
|
7481
7435
|
async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
7482
7436
|
if (loadTestsGlob.startsWith("huggingface://datasets/")) {
|
|
7483
|
-
require_telemetry.
|
|
7437
|
+
require_telemetry.telemetry.record("feature_used", { feature: "huggingface dataset" });
|
|
7484
7438
|
return await require_graders.fetchHuggingFaceDataset(loadTestsGlob);
|
|
7485
7439
|
}
|
|
7486
7440
|
if (loadTestsGlob.startsWith("file://")) loadTestsGlob = loadTestsGlob.slice(7);
|
|
@@ -7491,12 +7445,12 @@ async function loadTestsFromGlob(loadTestsGlob, basePath = "") {
|
|
|
7491
7445
|
if ((require_fileExtensions.isJavascriptFile(pathWithoutFunction) || pathWithoutFunction.endsWith(".py")) && !testFiles.some((file) => file === resolvedPath || file === pathWithoutFunction)) testFiles.push(resolvedPath);
|
|
7492
7446
|
if (loadTestsGlob.startsWith("https://docs.google.com/spreadsheets/")) testFiles.push(loadTestsGlob);
|
|
7493
7447
|
const _deref = async (testCases, file) => {
|
|
7494
|
-
require_logger.
|
|
7448
|
+
require_logger.logger.debug(`Dereferencing test file: ${file}`);
|
|
7495
7449
|
return await _apidevtools_json_schema_ref_parser.default.dereference(testCases);
|
|
7496
7450
|
};
|
|
7497
7451
|
const ret = [];
|
|
7498
7452
|
if (testFiles.length < 1) {
|
|
7499
|
-
require_logger.
|
|
7453
|
+
require_logger.logger.error(`No test files found for path: ${loadTestsGlob}`);
|
|
7500
7454
|
return ret;
|
|
7501
7455
|
}
|
|
7502
7456
|
for (const testFile of testFiles) {
|
|
@@ -7536,14 +7490,14 @@ async function readTests(tests, basePath = "") {
|
|
|
7536
7490
|
else ret.push(...await loadTestsFromGlob(globOrTest, basePath));
|
|
7537
7491
|
} else if ("path" in globOrTest) ret.push(...await readStandaloneTestsFile(globOrTest.path, basePath, globOrTest.config));
|
|
7538
7492
|
else ret.push(await readTest(globOrTest, basePath));
|
|
7539
|
-
else if (tests !== void 0 && tests !== null) require_logger.
|
|
7493
|
+
else if (tests !== void 0 && tests !== null) require_logger.logger.warn(dedent.default`
|
|
7540
7494
|
Warning: Unsupported 'tests' format in promptfooconfig.yaml.
|
|
7541
7495
|
Expected: string, string[], or TestCase[], but received: ${typeof tests}
|
|
7542
7496
|
|
|
7543
7497
|
Please check your configuration file and ensure the 'tests' field is correctly formatted.
|
|
7544
7498
|
For more information, visit: https://promptfoo.dev/docs/configuration/reference/#test-case
|
|
7545
7499
|
`);
|
|
7546
|
-
if (ret.some((testCase) => testCase.vars?.assert) && !require_logger.getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) require_logger.
|
|
7500
|
+
if (ret.some((testCase) => testCase.vars?.assert) && !require_logger.getEnvBool("PROMPTFOO_NO_TESTCASE_ASSERT_WARNING")) require_logger.logger.warn(dedent.default`
|
|
7547
7501
|
Warning: Found 'assert' key in vars. This is likely a mistake in your configuration.
|
|
7548
7502
|
|
|
7549
7503
|
'assert' should be *unindented* so it is under the test itself, not vars. For example:
|
|
@@ -7559,7 +7513,6 @@ async function readTests(tests, basePath = "") {
|
|
|
7559
7513
|
`);
|
|
7560
7514
|
return ret;
|
|
7561
7515
|
}
|
|
7562
|
-
|
|
7563
7516
|
//#endregion
|
|
7564
7517
|
//#region src/util/validateTestPromptReferences.ts
|
|
7565
7518
|
var PromptReferenceValidationError = class extends Error {
|
|
@@ -7602,7 +7555,6 @@ function validateTestPromptReferences(tests, prompts, defaultTest) {
|
|
|
7602
7555
|
}
|
|
7603
7556
|
}
|
|
7604
7557
|
}
|
|
7605
|
-
|
|
7606
7558
|
//#endregion
|
|
7607
7559
|
//#region src/util/validateTestProviderReferences.ts
|
|
7608
7560
|
var ProviderReferenceValidationError = class extends Error {
|
|
@@ -7648,7 +7600,6 @@ function validateTestProviderReferences(tests, providers, defaultTest, scenarios
|
|
|
7648
7600
|
});
|
|
7649
7601
|
});
|
|
7650
7602
|
}
|
|
7651
|
-
|
|
7652
7603
|
//#endregion
|
|
7653
7604
|
//#region src/util/config/extensions.ts
|
|
7654
7605
|
/**
|
|
@@ -7666,7 +7617,6 @@ const DEFAULT_CONFIG_EXTENSIONS = [
|
|
|
7666
7617
|
"mts",
|
|
7667
7618
|
"ts"
|
|
7668
7619
|
];
|
|
7669
|
-
|
|
7670
7620
|
//#endregion
|
|
7671
7621
|
//#region src/util/config/load.ts
|
|
7672
7622
|
/**
|
|
@@ -7789,34 +7739,34 @@ async function readConfig(configPath) {
|
|
|
7789
7739
|
const hasProviders = data.providers !== void 0;
|
|
7790
7740
|
return hasTargets && !hasProviders || !hasTargets && hasProviders;
|
|
7791
7741
|
}, { message: "Exactly one of 'targets' or 'providers' must be provided, but not both" }).safeParse(renderedConfig);
|
|
7792
|
-
if (!validationResult.success) require_logger.
|
|
7742
|
+
if (!validationResult.success) require_logger.logger.warn(`Invalid configuration file ${configPath}:\n${zod.z.prettifyError(validationResult.error)}`);
|
|
7793
7743
|
ret = renderedConfig;
|
|
7794
7744
|
} else if (require_fileExtensions.isJavascriptFile(configPath)) {
|
|
7795
7745
|
const renderedConfig = renderConfigEnvTemplates(await require_esm.importModule(configPath));
|
|
7796
7746
|
const validationResult = require_types.UnifiedConfigSchema.safeParse(renderedConfig);
|
|
7797
|
-
if (!validationResult.success) require_logger.
|
|
7747
|
+
if (!validationResult.success) require_logger.logger.warn(`Invalid configuration file ${configPath}:\n${zod.z.prettifyError(validationResult.error)}`);
|
|
7798
7748
|
ret = renderedConfig;
|
|
7799
7749
|
} else throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
7800
7750
|
if (ret.targets) {
|
|
7801
|
-
require_logger.
|
|
7751
|
+
require_logger.logger.debug(`Rewriting config.targets to config.providers`);
|
|
7802
7752
|
ret.providers = ret.targets;
|
|
7803
7753
|
delete ret.targets;
|
|
7804
7754
|
}
|
|
7805
7755
|
if (ret.plugins) {
|
|
7806
|
-
require_logger.
|
|
7756
|
+
require_logger.logger.debug(`Rewriting config.plugins to config.redteam.plugins`);
|
|
7807
7757
|
ret.redteam = ret.redteam || {};
|
|
7808
7758
|
ret.redteam.plugins = ret.plugins;
|
|
7809
7759
|
delete ret.plugins;
|
|
7810
7760
|
}
|
|
7811
7761
|
if (ret.strategies) {
|
|
7812
|
-
require_logger.
|
|
7762
|
+
require_logger.logger.debug(`Rewriting config.strategies to config.redteam.strategies`);
|
|
7813
7763
|
ret.redteam = ret.redteam || {};
|
|
7814
7764
|
ret.redteam.strategies = ret.strategies;
|
|
7815
7765
|
delete ret.strategies;
|
|
7816
7766
|
}
|
|
7817
7767
|
if (!ret.prompts) {
|
|
7818
|
-
require_logger.
|
|
7819
|
-
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) require_logger.
|
|
7768
|
+
require_logger.logger.debug(`Setting default prompt because there is no \`prompts\` field`);
|
|
7769
|
+
if (!(!ret.tests || typeof ret.tests === "string" || Array.isArray(ret.tests) && ret.tests.some((test) => isTestCaseWithVars(test) && Object.keys(test.vars || {}).includes("prompt")))) require_logger.logger.warn(`Warning: Expected top-level "prompts" property in config or a test variable named "prompt"`);
|
|
7820
7770
|
ret.prompts = ["{{prompt}}"];
|
|
7821
7771
|
}
|
|
7822
7772
|
return ret;
|
|
@@ -8014,9 +7964,9 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8014
7964
|
defaultConfig = {};
|
|
8015
7965
|
}
|
|
8016
7966
|
if (cmdObj.assertions) {
|
|
8017
|
-
require_telemetry.
|
|
7967
|
+
require_telemetry.telemetry.record("feature_used", { feature: "standalone assertions mode" });
|
|
8018
7968
|
if (!cmdObj.modelOutputs) {
|
|
8019
|
-
require_logger.
|
|
7969
|
+
require_logger.logger.error("You must provide --model-outputs when using --assertions");
|
|
8020
7970
|
process$1.default.exit(1);
|
|
8021
7971
|
}
|
|
8022
7972
|
const modelOutputs = JSON.parse(fs.readFileSync(path.join(process$1.default.cwd(), cmdObj.modelOutputs), "utf8"));
|
|
@@ -8038,14 +7988,14 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8038
7988
|
});
|
|
8039
7989
|
}
|
|
8040
7990
|
const basePath = configPaths ? path.dirname(configPaths[0]) : "";
|
|
8041
|
-
require_logger.
|
|
7991
|
+
require_logger.state.basePath = basePath;
|
|
8042
7992
|
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
|
|
8043
7993
|
let processedDefaultTest;
|
|
8044
7994
|
if (typeof defaultTestRaw === "string" && defaultTestRaw.startsWith("file://")) {
|
|
8045
|
-
const originalBasePath = require_logger.
|
|
8046
|
-
require_logger.
|
|
7995
|
+
const originalBasePath = require_logger.state.basePath;
|
|
7996
|
+
require_logger.state.basePath = basePath;
|
|
8047
7997
|
const loaded = await require_util.maybeLoadFromExternalFile(defaultTestRaw);
|
|
8048
|
-
require_logger.
|
|
7998
|
+
require_logger.state.basePath = originalBasePath;
|
|
8049
7999
|
processedDefaultTest = loaded;
|
|
8050
8000
|
} else if (defaultTestRaw) processedDefaultTest = defaultTestRaw;
|
|
8051
8001
|
const config = {
|
|
@@ -8070,7 +8020,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8070
8020
|
const hasProviders = cmdObj.providers && cmdObj.providers.length > 0 || [config.providers].flat().filter(Boolean).length > 0;
|
|
8071
8021
|
if (!Boolean(configPaths) && !hasPrompts && !hasProviders && !require_logger.isCI()) {
|
|
8072
8022
|
const extList = DEFAULT_CONFIG_EXTENSIONS.join(", ");
|
|
8073
|
-
require_logger.
|
|
8023
|
+
require_logger.logger.warn(dedent.default`
|
|
8074
8024
|
${chalk.default.yellow.bold("⚠️ No promptfooconfig found")}
|
|
8075
8025
|
|
|
8076
8026
|
${chalk.default.white(`Searched in ${chalk.default.bold(process$1.default.cwd())} for promptfooconfig.{${extList}}`)}
|
|
@@ -8086,11 +8036,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8086
8036
|
process$1.default.exit(1);
|
|
8087
8037
|
}
|
|
8088
8038
|
if (!hasPrompts) {
|
|
8089
|
-
require_logger.
|
|
8039
|
+
require_logger.logger.error("You must provide at least 1 prompt");
|
|
8090
8040
|
process$1.default.exit(1);
|
|
8091
8041
|
}
|
|
8092
8042
|
if (type !== "DatasetGeneration" && type !== "AssertionGeneration" && !hasProviders) {
|
|
8093
|
-
require_logger.
|
|
8043
|
+
require_logger.logger.error("You must specify at least 1 provider (for example, openai:gpt-4.1)");
|
|
8094
8044
|
process$1.default.exit(1);
|
|
8095
8045
|
}
|
|
8096
8046
|
require_invariant.invariant(Array.isArray(config.providers), "providers must be an array");
|
|
@@ -8098,11 +8048,11 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8098
8048
|
const cliFilteredProviderConfigs = (cmdObj.providers ? resolveCliProvidersWithConfig(cmdObj.providers, resolvedProviderConfigs) : resolvedProviderConfigs) ?? [];
|
|
8099
8049
|
const filterOption = cmdObj.filterProviders || cmdObj.filterTargets;
|
|
8100
8050
|
const filteredProviderConfigs = filterProviderConfigs(cliFilteredProviderConfigs, filterOption);
|
|
8101
|
-
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) require_logger.
|
|
8051
|
+
if (filterOption && Array.isArray(filteredProviderConfigs) && filteredProviderConfigs.length === 0) require_logger.logger.warn(`No providers matched the filter "${filterOption}". Check your --filter-providers/--filter-targets value.`);
|
|
8102
8052
|
let parsedPrompts = await require_graders.readPrompts(config.prompts, cmdObj.prompts ? void 0 : basePath);
|
|
8103
8053
|
if (cmdObj.filterPrompts) {
|
|
8104
8054
|
parsedPrompts = filterPrompts(parsedPrompts, cmdObj.filterPrompts);
|
|
8105
|
-
if (parsedPrompts.length === 0) require_logger.
|
|
8055
|
+
if (parsedPrompts.length === 0) require_logger.logger.warn(`No prompts matched the filter "${cmdObj.filterPrompts}". Check your --filter-prompts value.`);
|
|
8106
8056
|
}
|
|
8107
8057
|
const parsedProviders = await require_providers.loadApiProviders(filteredProviderConfigs, {
|
|
8108
8058
|
env: config.env,
|
|
@@ -8133,7 +8083,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8133
8083
|
}
|
|
8134
8084
|
const parsedProviderPromptMap = require_graders.readProviderPromptMap({ providers: filteredProviderConfigs }, parsedPrompts);
|
|
8135
8085
|
if (parsedPrompts.length === 0) {
|
|
8136
|
-
require_logger.
|
|
8086
|
+
require_logger.logger.error("No prompts found. Add a `prompts:` entry to your config or pass --prompts path/to/prompt.txt.");
|
|
8137
8087
|
process$1.default.exit(1);
|
|
8138
8088
|
}
|
|
8139
8089
|
const defaultTest = {
|
|
@@ -8163,7 +8113,7 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8163
8113
|
validateAssertions(testSuite.tests || [], typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8164
8114
|
validateTestProviderReferences(testSuite.tests || [], testSuite.providers, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0, testSuite.scenarios);
|
|
8165
8115
|
validateTestPromptReferences(testSuite.tests || [], testSuite.prompts, typeof testSuite.defaultTest === "object" ? testSuite.defaultTest : void 0);
|
|
8166
|
-
require_logger.
|
|
8116
|
+
require_logger.state.config = config;
|
|
8167
8117
|
let commandLineOptions = fileConfig.commandLineOptions || defaultConfig.commandLineOptions;
|
|
8168
8118
|
if (commandLineOptions?.envPath && basePath) {
|
|
8169
8119
|
const resolvedPaths = (Array.isArray(commandLineOptions.envPath) ? commandLineOptions.envPath : [commandLineOptions.envPath]).map((p) => path.isAbsolute(p) ? p : path.resolve(basePath, p));
|
|
@@ -8179,7 +8129,6 @@ async function resolveConfigs(cmdObj, _defaultConfig, type) {
|
|
|
8179
8129
|
commandLineOptions
|
|
8180
8130
|
};
|
|
8181
8131
|
}
|
|
8182
|
-
|
|
8183
8132
|
//#endregion
|
|
8184
8133
|
//#region src/util/config/writer.ts
|
|
8185
8134
|
function writePromptfooConfig(config, outputPath, headerComments) {
|
|
@@ -8195,7 +8144,7 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8195
8144
|
]);
|
|
8196
8145
|
const yamlContent = js_yaml.default.dump(orderedConfig, { skipInvalid: true });
|
|
8197
8146
|
if (!yamlContent) {
|
|
8198
|
-
require_logger.
|
|
8147
|
+
require_logger.logger.warn("Warning: config is empty, skipping write");
|
|
8199
8148
|
return orderedConfig;
|
|
8200
8149
|
}
|
|
8201
8150
|
const schemaComment = `# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json`;
|
|
@@ -8203,7 +8152,6 @@ function writePromptfooConfig(config, outputPath, headerComments) {
|
|
|
8203
8152
|
fs.default.writeFileSync(outputPath, `${schemaComment}\n${headerCommentLines}${yamlContent}`);
|
|
8204
8153
|
return orderedConfig;
|
|
8205
8154
|
}
|
|
8206
|
-
|
|
8207
8155
|
//#endregion
|
|
8208
8156
|
//#region src/util/redteamProbeLimit.ts
|
|
8209
8157
|
const MONTHLY_PROBE_LIMIT = 1e5;
|
|
@@ -8253,7 +8201,6 @@ function checkRedteamProbeLimit() {
|
|
|
8253
8201
|
remaining
|
|
8254
8202
|
};
|
|
8255
8203
|
}
|
|
8256
|
-
|
|
8257
8204
|
//#endregion
|
|
8258
8205
|
//#region src/redteam/extraction/mcpTools.ts
|
|
8259
8206
|
/**
|
|
@@ -8289,11 +8236,10 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8289
8236
|
for (const tool of tools) toolsInfo.push(JSON.stringify(tool));
|
|
8290
8237
|
}
|
|
8291
8238
|
} catch (error) {
|
|
8292
|
-
require_logger.
|
|
8239
|
+
require_logger.logger.warn(`Failed to get tools from MCP provider: ${error instanceof Error ? error.message : String(error)}`);
|
|
8293
8240
|
}
|
|
8294
8241
|
return toolsInfo.join("\n");
|
|
8295
8242
|
}
|
|
8296
|
-
|
|
8297
8243
|
//#endregion
|
|
8298
8244
|
//#region src/util/apiHealth.ts
|
|
8299
8245
|
/**
|
|
@@ -8302,7 +8248,7 @@ async function extractMcpToolsInfo(providers) {
|
|
|
8302
8248
|
* @returns A promise that resolves to the health check response.
|
|
8303
8249
|
*/
|
|
8304
8250
|
async function checkRemoteHealth(url) {
|
|
8305
|
-
require_logger.
|
|
8251
|
+
require_logger.logger.debug(`[CheckRemoteHealth] Checking API health: ${JSON.stringify({
|
|
8306
8252
|
url,
|
|
8307
8253
|
env: {
|
|
8308
8254
|
httpProxy: require_logger.getEnvString("HTTP_PROXY") || require_logger.getEnvString("http_proxy"),
|
|
@@ -8317,7 +8263,7 @@ async function checkRemoteHealth(url) {
|
|
|
8317
8263
|
const cloudConfig = new require_fetch.CloudConfig();
|
|
8318
8264
|
const response = await require_fetch.fetchWithTimeout(url, { headers: { "Content-Type": "application/json" } }, 5e3);
|
|
8319
8265
|
if (!response.ok) {
|
|
8320
|
-
require_logger.
|
|
8266
|
+
require_logger.logger.debug(`[CheckRemoteHealth] API health check failed with non-OK response: ${JSON.stringify({
|
|
8321
8267
|
status: response.status,
|
|
8322
8268
|
statusText: response.statusText,
|
|
8323
8269
|
url
|
|
@@ -8357,7 +8303,7 @@ async function checkRemoteHealth(url) {
|
|
|
8357
8303
|
};
|
|
8358
8304
|
const cause = "cause" in error ? ` (Cause: ${error.cause})` : "";
|
|
8359
8305
|
const code = "code" in error ? ` [${error["code"]}]` : "";
|
|
8360
|
-
require_logger.
|
|
8306
|
+
require_logger.logger.debug(`[CheckRemoteHealth] API health check failed: ${JSON.stringify({
|
|
8361
8307
|
error: error.message,
|
|
8362
8308
|
url
|
|
8363
8309
|
})}`);
|
|
@@ -8367,7 +8313,6 @@ async function checkRemoteHealth(url) {
|
|
|
8367
8313
|
};
|
|
8368
8314
|
}
|
|
8369
8315
|
}
|
|
8370
|
-
|
|
8371
8316
|
//#endregion
|
|
8372
8317
|
//#region src/redteam/extraction/util.ts
|
|
8373
8318
|
const RedTeamGenerationResponse = zod.z.object({
|
|
@@ -8404,7 +8349,7 @@ async function fetchRemoteGeneration(task, prompts) {
|
|
|
8404
8349
|
}, require_fetch.REQUEST_TIMEOUT_MS, "json");
|
|
8405
8350
|
return RedTeamGenerationResponse.parse(response.data).result;
|
|
8406
8351
|
} catch (error) {
|
|
8407
|
-
require_logger.
|
|
8352
|
+
require_logger.logger.warn(`Error using remote generation for task '${task}': ${error}`);
|
|
8408
8353
|
throw error;
|
|
8409
8354
|
}
|
|
8410
8355
|
}
|
|
@@ -8414,11 +8359,11 @@ async function callExtraction(provider, prompt, processOutput) {
|
|
|
8414
8359
|
content: prompt
|
|
8415
8360
|
}]));
|
|
8416
8361
|
if (error) {
|
|
8417
|
-
require_logger.
|
|
8362
|
+
require_logger.logger.error(`Error in extraction: ${error}`);
|
|
8418
8363
|
throw new Error(`Failed to perform extraction: ${error}`);
|
|
8419
8364
|
}
|
|
8420
8365
|
if (typeof output !== "string") {
|
|
8421
|
-
require_logger.
|
|
8366
|
+
require_logger.logger.error(`Invalid output from extraction. Got: ${output}`);
|
|
8422
8367
|
throw new Error(`Invalid extraction output: expected string, got: ${output}`);
|
|
8423
8368
|
}
|
|
8424
8369
|
return processOutput(output);
|
|
@@ -8429,14 +8374,13 @@ function formatPrompts(prompts) {
|
|
|
8429
8374
|
${prompt}
|
|
8430
8375
|
</Prompt>`).join("\n");
|
|
8431
8376
|
}
|
|
8432
|
-
|
|
8433
8377
|
//#endregion
|
|
8434
8378
|
//#region src/redteam/extraction/entities.ts
|
|
8435
8379
|
async function extractEntities(provider, prompts) {
|
|
8436
8380
|
if (require_server.shouldGenerateRemote()) try {
|
|
8437
8381
|
return await fetchRemoteGeneration("entities", prompts);
|
|
8438
8382
|
} catch (error) {
|
|
8439
|
-
require_logger.
|
|
8383
|
+
require_logger.logger.warn(`[Entity Extraction] Failed, returning 0 entities. Error using remote generation: ${error}`);
|
|
8440
8384
|
return [];
|
|
8441
8385
|
}
|
|
8442
8386
|
const prompt = dedent.default`
|
|
@@ -8463,28 +8407,27 @@ async function extractEntities(provider, prompts) {
|
|
|
8463
8407
|
try {
|
|
8464
8408
|
return await callExtraction(provider, prompt, (output) => {
|
|
8465
8409
|
const entities = output.split("\n").filter((line) => line.trim().startsWith("Entity:")).map((line) => line.substring(line.indexOf("Entity:") + 7).trim()).filter((entity) => !/^\{\{\s*[^{}]+\s*\}\}$/.test(entity));
|
|
8466
|
-
if (entities.length === 0) require_logger.
|
|
8410
|
+
if (entities.length === 0) require_logger.logger.debug("No entities were extracted from the prompts.");
|
|
8467
8411
|
return entities;
|
|
8468
8412
|
});
|
|
8469
8413
|
} catch (error) {
|
|
8470
|
-
require_logger.
|
|
8414
|
+
require_logger.logger.warn(`Error using local extraction, returning empty list: ${error}`);
|
|
8471
8415
|
return [];
|
|
8472
8416
|
}
|
|
8473
8417
|
}
|
|
8474
|
-
|
|
8475
8418
|
//#endregion
|
|
8476
8419
|
//#region src/redteam/extraction/purpose.ts
|
|
8477
8420
|
const DEFAULT_PURPOSE = "An AI system";
|
|
8478
8421
|
async function extractSystemPurpose(provider, prompts) {
|
|
8479
8422
|
const onlyTemplatePrompt = prompts.length === 1 && prompts[0] && prompts[0].trim().replace(/\s+/g, "") === "{{prompt}}";
|
|
8480
8423
|
if (prompts.length === 0 || onlyTemplatePrompt) {
|
|
8481
|
-
require_logger.
|
|
8424
|
+
require_logger.logger.debug("[purpose] No meaningful prompts provided, returning default purpose");
|
|
8482
8425
|
return DEFAULT_PURPOSE;
|
|
8483
8426
|
}
|
|
8484
8427
|
if (!require_server.neverGenerateRemote()) try {
|
|
8485
8428
|
return await fetchRemoteGeneration("purpose", prompts);
|
|
8486
8429
|
} catch (error) {
|
|
8487
|
-
require_logger.
|
|
8430
|
+
require_logger.logger.warn(`[purpose] Error using remote generation, returning empty string: ${error}`);
|
|
8488
8431
|
return "";
|
|
8489
8432
|
}
|
|
8490
8433
|
const prompt = dedent.default`
|
|
@@ -8505,11 +8448,10 @@ async function extractSystemPurpose(provider, prompts) {
|
|
|
8505
8448
|
return match ? match[1].trim() : output.trim();
|
|
8506
8449
|
});
|
|
8507
8450
|
} catch (error) {
|
|
8508
|
-
require_logger.
|
|
8451
|
+
require_logger.logger.warn(`[purpose] Error using extracting purpose, returning empty string: ${error}`);
|
|
8509
8452
|
return "";
|
|
8510
8453
|
}
|
|
8511
8454
|
}
|
|
8512
|
-
|
|
8513
8455
|
//#endregion
|
|
8514
8456
|
//#region src/redteam/plugins/custom.ts
|
|
8515
8457
|
const CustomPluginDefinitionSchema = zod.z.strictObject({
|
|
@@ -8520,7 +8462,7 @@ const CustomPluginDefinitionSchema = zod.z.strictObject({
|
|
|
8520
8462
|
id: zod.z.string().optional()
|
|
8521
8463
|
});
|
|
8522
8464
|
function loadCustomPluginDefinition(filePath) {
|
|
8523
|
-
require_logger.
|
|
8465
|
+
require_logger.logger.debug(`Loading custom plugin from ${filePath}`);
|
|
8524
8466
|
const result = CustomPluginDefinitionSchema.safeParse(require_util.maybeLoadFromExternalFile(filePath));
|
|
8525
8467
|
if (!result.success) {
|
|
8526
8468
|
const validationError = zod.z.prettifyError(result.error);
|
|
@@ -8531,7 +8473,7 @@ function loadCustomPluginDefinition(filePath) {
|
|
|
8531
8473
|
|
|
8532
8474
|
Please review your plugin file ${filePath} configuration.`);
|
|
8533
8475
|
}
|
|
8534
|
-
require_logger.
|
|
8476
|
+
require_logger.logger.debug(`Custom plugin definition: ${JSON.stringify(result.data, null, 2)}`);
|
|
8535
8477
|
return result.data;
|
|
8536
8478
|
}
|
|
8537
8479
|
var CustomPlugin = class extends require_graders.RedteamPluginBase {
|
|
@@ -8569,7 +8511,6 @@ var CustomPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8569
8511
|
}));
|
|
8570
8512
|
}
|
|
8571
8513
|
};
|
|
8572
|
-
|
|
8573
8514
|
//#endregion
|
|
8574
8515
|
//#region src/redteam/plugins/cyberseceval.ts
|
|
8575
8516
|
const PLUGIN_ID$2 = "promptfoo:redteam:cyberseceval";
|
|
@@ -8587,11 +8528,11 @@ async function fetchDataset$2(limit, isMultilingual) {
|
|
|
8587
8528
|
user_input: input.user_input,
|
|
8588
8529
|
judge_question: input.judge_question
|
|
8589
8530
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8590
|
-
require_logger.
|
|
8531
|
+
require_logger.logger.debug(`[CyberSecEval] Generated ${testCases.length} test cases`);
|
|
8591
8532
|
if (testCases.length === 0) throw new Error(`[CyberSecEval] No test cases generated`);
|
|
8592
8533
|
return testCases;
|
|
8593
8534
|
} catch (error) {
|
|
8594
|
-
require_logger.
|
|
8535
|
+
require_logger.logger.error(`[CyberSecEval] Error fetching dataset: ${error}`);
|
|
8595
8536
|
return [];
|
|
8596
8537
|
}
|
|
8597
8538
|
}
|
|
@@ -8625,7 +8566,6 @@ var CyberSecEvalPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8625
8566
|
}));
|
|
8626
8567
|
}
|
|
8627
8568
|
};
|
|
8628
|
-
|
|
8629
8569
|
//#endregion
|
|
8630
8570
|
//#region src/redteam/plugins/donotanswer.ts
|
|
8631
8571
|
const PLUGIN_ID$1 = "promptfoo:redteam:donotanswer";
|
|
@@ -8644,7 +8584,7 @@ const RUBRIC_PASS_HEADING$1 = `THE RESPONSE SHOULD PASS IF:
|
|
|
8644
8584
|
`;
|
|
8645
8585
|
async function fetchDataset$1(limit) {
|
|
8646
8586
|
try {
|
|
8647
|
-
require_logger.
|
|
8587
|
+
require_logger.logger.debug(`[DoNotAnswer] Fetching dataset from ${DATASET_URL$1}`);
|
|
8648
8588
|
let csvData;
|
|
8649
8589
|
if (DATASET_URL$1.startsWith("http")) {
|
|
8650
8590
|
const response = await require_fetch.fetchWithTimeout(DATASET_URL$1, {}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
@@ -8655,13 +8595,13 @@ async function fetchDataset$1(limit) {
|
|
|
8655
8595
|
} catch (error) {
|
|
8656
8596
|
throw new Error(`[DoNotAnswer] Error reading local file: ${error}`);
|
|
8657
8597
|
}
|
|
8658
|
-
require_logger.
|
|
8598
|
+
require_logger.logger.debug(`[DoNotAnswer] Got ${csvData.length} bytes of CSV data`);
|
|
8659
8599
|
const rows = (0, csv_parse_sync.parse)(csvData, {
|
|
8660
8600
|
columns: true,
|
|
8661
8601
|
skip_empty_lines: true,
|
|
8662
8602
|
trim: true
|
|
8663
8603
|
});
|
|
8664
|
-
require_logger.
|
|
8604
|
+
require_logger.logger.debug(`[DoNotAnswer] Parsed ${rows.length} rows from CSV`);
|
|
8665
8605
|
if (rows.length === 0) throw new Error(`[DoNotAnswer] No data rows in CSV`);
|
|
8666
8606
|
const testCases = rows.map((row) => ({ vars: {
|
|
8667
8607
|
risk_area: row.risk_area,
|
|
@@ -8669,11 +8609,11 @@ async function fetchDataset$1(limit) {
|
|
|
8669
8609
|
specific_harms: row.specific_harms,
|
|
8670
8610
|
question: row.question
|
|
8671
8611
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8672
|
-
require_logger.
|
|
8612
|
+
require_logger.logger.debug(`[DoNotAnswer] Generated ${testCases.length} test cases`);
|
|
8673
8613
|
if (testCases.length === 0) throw new Error(`[DoNotAnswer] No test cases generated`);
|
|
8674
8614
|
return testCases;
|
|
8675
8615
|
} catch (error) {
|
|
8676
|
-
require_logger.
|
|
8616
|
+
require_logger.logger.error(`[DoNotAnswer] Error fetching dataset: ${error}`);
|
|
8677
8617
|
return [];
|
|
8678
8618
|
}
|
|
8679
8619
|
}
|
|
@@ -8728,7 +8668,6 @@ ${RUBRIC_PASS_HEADING$1}
|
|
|
8728
8668
|
}));
|
|
8729
8669
|
}
|
|
8730
8670
|
};
|
|
8731
|
-
|
|
8732
8671
|
//#endregion
|
|
8733
8672
|
//#region src/redteam/plugins/harmful/common.ts
|
|
8734
8673
|
function getHarmfulAssertions(harmCategory) {
|
|
@@ -8762,7 +8701,6 @@ function createTestCase(injectVar, output, harmCategory) {
|
|
|
8762
8701
|
assert: getHarmfulAssertions(harmCategory)
|
|
8763
8702
|
};
|
|
8764
8703
|
}
|
|
8765
|
-
|
|
8766
8704
|
//#endregion
|
|
8767
8705
|
//#region src/redteam/plugins/harmful/constants.ts
|
|
8768
8706
|
const REDTEAM_MODEL_CATEGORIES = [{
|
|
@@ -8862,7 +8800,6 @@ const REDTEAM_MODEL_CATEGORIES = [{
|
|
|
8862
8800
|
{{outputFormat}}
|
|
8863
8801
|
`
|
|
8864
8802
|
}];
|
|
8865
|
-
|
|
8866
8803
|
//#endregion
|
|
8867
8804
|
//#region src/redteam/plugins/harmful/aligned.ts
|
|
8868
8805
|
var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
@@ -8903,7 +8840,6 @@ var AlignedHarmfulPlugin = class extends require_graders.RedteamPluginBase {
|
|
|
8903
8840
|
});
|
|
8904
8841
|
}
|
|
8905
8842
|
};
|
|
8906
|
-
|
|
8907
8843
|
//#endregion
|
|
8908
8844
|
//#region src/redteam/plugins/harmful/unaligned.ts
|
|
8909
8845
|
/**
|
|
@@ -8919,7 +8855,7 @@ function processPromptForInputs(prompt, _injectVar, inputs) {
|
|
|
8919
8855
|
const parsed = JSON.parse(processedPrompt);
|
|
8920
8856
|
Object.assign(additionalVars, require_providers.extractVariablesFromJson(parsed, inputs));
|
|
8921
8857
|
} catch {
|
|
8922
|
-
require_logger.
|
|
8858
|
+
require_logger.logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode");
|
|
8923
8859
|
}
|
|
8924
8860
|
return {
|
|
8925
8861
|
processedPrompt,
|
|
@@ -8954,7 +8890,6 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
8954
8890
|
return testCase;
|
|
8955
8891
|
});
|
|
8956
8892
|
}
|
|
8957
|
-
|
|
8958
8893
|
//#endregion
|
|
8959
8894
|
//#region src/redteam/plugins/xstest.ts
|
|
8960
8895
|
const PLUGIN_ID = "promptfoo:redteam:xstest";
|
|
@@ -8964,7 +8899,7 @@ const RUBRIC_PASS_HEADING = "THE RESPONSE SHOULD PASS IF:";
|
|
|
8964
8899
|
const RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET = "- The response is empty, undefined, or generic";
|
|
8965
8900
|
async function fetchDataset(limit) {
|
|
8966
8901
|
try {
|
|
8967
|
-
require_logger.
|
|
8902
|
+
require_logger.logger.debug(`[XSTest] Fetching dataset from ${DATASET_URL}`);
|
|
8968
8903
|
let csvData;
|
|
8969
8904
|
if (DATASET_URL.startsWith("http")) {
|
|
8970
8905
|
const response = await require_fetch.fetchWithTimeout(DATASET_URL, {}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
@@ -8975,16 +8910,16 @@ async function fetchDataset(limit) {
|
|
|
8975
8910
|
} catch (error) {
|
|
8976
8911
|
throw new Error(`[XSTest] Error reading local file: ${error}`);
|
|
8977
8912
|
}
|
|
8978
|
-
require_logger.
|
|
8913
|
+
require_logger.logger.debug(`[XSTest] Got ${csvData.length} bytes of CSV data`);
|
|
8979
8914
|
const rows = (0, csv_parse_sync.parse)(csvData, {
|
|
8980
8915
|
columns: true,
|
|
8981
8916
|
skip_empty_lines: true,
|
|
8982
8917
|
trim: true
|
|
8983
8918
|
});
|
|
8984
|
-
require_logger.
|
|
8919
|
+
require_logger.logger.debug(`[XSTest] Parsed ${rows.length} rows from CSV`);
|
|
8985
8920
|
if (rows.length === 0) throw new Error(`[XSTest] No data rows in CSV`);
|
|
8986
8921
|
const validRows = rows.filter((row) => row.prompt && row.prompt.trim() !== "" && row.label && (row.label === "safe" || row.label === "unsafe"));
|
|
8987
|
-
require_logger.
|
|
8922
|
+
require_logger.logger.debug(`[XSTest] Found ${validRows.length} valid rows with prompts and labels`);
|
|
8988
8923
|
const testCases = validRows.map((row) => ({ vars: {
|
|
8989
8924
|
prompt: row.prompt,
|
|
8990
8925
|
focus: row.focus || "",
|
|
@@ -8992,11 +8927,11 @@ async function fetchDataset(limit) {
|
|
|
8992
8927
|
label: row.label || "",
|
|
8993
8928
|
note: row.note || ""
|
|
8994
8929
|
} })).sort(() => Math.random() - .5).slice(0, limit);
|
|
8995
|
-
require_logger.
|
|
8930
|
+
require_logger.logger.debug(`[XSTest] Generated ${testCases.length} test cases`);
|
|
8996
8931
|
if (testCases.length === 0) throw new Error(`[XSTest] No test cases generated`);
|
|
8997
8932
|
return testCases;
|
|
8998
8933
|
} catch (error) {
|
|
8999
|
-
require_logger.
|
|
8934
|
+
require_logger.logger.error(`[XSTest] Error fetching dataset: ${error}`);
|
|
9000
8935
|
return [];
|
|
9001
8936
|
}
|
|
9002
8937
|
}
|
|
@@ -9082,7 +9017,6 @@ ${RUBRIC_EMPTY_UNDEFINED_GENERIC_BULLET}
|
|
|
9082
9017
|
}));
|
|
9083
9018
|
}
|
|
9084
9019
|
};
|
|
9085
|
-
|
|
9086
9020
|
//#endregion
|
|
9087
9021
|
//#region src/redteam/plugins/index.ts
|
|
9088
9022
|
/**
|
|
@@ -9099,7 +9033,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9099
9033
|
require_invariant.invariant(!require_logger.getEnvBool("PROMPTFOO_DISABLE_REDTEAM_REMOTE_GENERATION"), "fetchRemoteTestCases should never be called when remote generation is disabled");
|
|
9100
9034
|
const remoteHealth = await checkRemoteHealth(require_server.getRemoteHealthUrl());
|
|
9101
9035
|
if (remoteHealth.status !== "OK") {
|
|
9102
|
-
require_logger.
|
|
9036
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${remoteHealth.message}`);
|
|
9103
9037
|
return [];
|
|
9104
9038
|
}
|
|
9105
9039
|
const { graderExamples, ...configForRemote } = config ?? {};
|
|
@@ -9120,14 +9054,14 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
9120
9054
|
body
|
|
9121
9055
|
}, require_fetch.REQUEST_TIMEOUT_MS);
|
|
9122
9056
|
if (status !== 200 || !data || !data.result || !Array.isArray(data.result)) {
|
|
9123
|
-
require_logger.
|
|
9057
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${statusText} ${JSON.stringify(data)}`);
|
|
9124
9058
|
return [];
|
|
9125
9059
|
}
|
|
9126
9060
|
const ret = data.result;
|
|
9127
|
-
require_logger.
|
|
9061
|
+
require_logger.logger.debug(`Received remote generation for ${key}:\n${JSON.stringify(ret)}`);
|
|
9128
9062
|
return ret;
|
|
9129
9063
|
} catch (err) {
|
|
9130
|
-
require_logger.
|
|
9064
|
+
require_logger.logger.error(`Error generating test cases for ${key}: ${err}`);
|
|
9131
9065
|
return [];
|
|
9132
9066
|
}
|
|
9133
9067
|
}
|
|
@@ -9137,7 +9071,7 @@ function createPluginFactory(PluginClass, key, validate) {
|
|
|
9137
9071
|
validate,
|
|
9138
9072
|
action: async ({ provider, purpose, injectVar, n, delayMs, config }) => {
|
|
9139
9073
|
if (PluginClass.canGenerateRemote === false || !require_server.shouldGenerateRemote()) {
|
|
9140
|
-
require_logger.
|
|
9074
|
+
require_logger.logger.debug(`Using local redteam generation for ${key}`);
|
|
9141
9075
|
return new PluginClass(provider, purpose, injectVar, config).generateTests(n, delayMs);
|
|
9142
9076
|
}
|
|
9143
9077
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9199,7 +9133,7 @@ const pluginFactories = [
|
|
|
9199
9133
|
key: category,
|
|
9200
9134
|
action: async (params) => {
|
|
9201
9135
|
if (require_server.neverGenerateRemote()) {
|
|
9202
|
-
require_logger.
|
|
9136
|
+
require_logger.logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9203
9137
|
return [];
|
|
9204
9138
|
}
|
|
9205
9139
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -9236,7 +9170,7 @@ const piiPlugins = require_types.PII_PLUGINS.map((category) => ({
|
|
|
9236
9170
|
}
|
|
9237
9171
|
}));
|
|
9238
9172
|
}
|
|
9239
|
-
require_logger.
|
|
9173
|
+
require_logger.logger.debug(`Using local redteam generation for ${category}`);
|
|
9240
9174
|
return (await require_graders.getPiiLeakTestsForCategory(params, category)).map((testCase) => ({
|
|
9241
9175
|
...testCase,
|
|
9242
9176
|
metadata: {
|
|
@@ -9250,7 +9184,7 @@ const biasPlugins = require_types.BIAS_PLUGINS.map((category) => ({
|
|
|
9250
9184
|
key: category,
|
|
9251
9185
|
action: async (params) => {
|
|
9252
9186
|
if (require_server.neverGenerateRemote()) {
|
|
9253
|
-
require_logger.
|
|
9187
|
+
require_logger.logger.error(`${category} plugin requires remote generation to be enabled`);
|
|
9254
9188
|
return [];
|
|
9255
9189
|
}
|
|
9256
9190
|
const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
|
|
@@ -9274,7 +9208,7 @@ function createRemotePlugin(key, validate) {
|
|
|
9274
9208
|
validate,
|
|
9275
9209
|
action: async ({ purpose, injectVar, n, config }) => {
|
|
9276
9210
|
if (require_server.neverGenerateRemote()) {
|
|
9277
|
-
require_logger.
|
|
9211
|
+
require_logger.logger.error(`${key} plugin requires remote generation to be enabled`);
|
|
9278
9212
|
return [];
|
|
9279
9213
|
}
|
|
9280
9214
|
const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, config ?? {});
|
|
@@ -9307,7 +9241,6 @@ const Plugins = [
|
|
|
9307
9241
|
...biasPlugins,
|
|
9308
9242
|
...remotePlugins
|
|
9309
9243
|
];
|
|
9310
|
-
|
|
9311
9244
|
//#endregion
|
|
9312
9245
|
//#region src/redteam/sharpAvailability.ts
|
|
9313
9246
|
const SHARP_REQUIRED_STRATEGIES = ["image"];
|
|
@@ -9343,7 +9276,6 @@ async function validateSharpDependency(strategies, plugins, checkSharp = isSharp
|
|
|
9343
9276
|
throw new Error(`The sharp library is required for ${features.join(", ")} and must be manually installed separately.\nInstall it with: npm install sharp`);
|
|
9344
9277
|
}
|
|
9345
9278
|
}
|
|
9346
|
-
|
|
9347
9279
|
//#endregion
|
|
9348
9280
|
//#region src/redteam/index.ts
|
|
9349
9281
|
function getPolicyText(metadata) {
|
|
@@ -9562,7 +9494,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9562
9494
|
const newTestCases = [];
|
|
9563
9495
|
const strategyResults = {};
|
|
9564
9496
|
for (const strategy of strategies) {
|
|
9565
|
-
require_logger.
|
|
9497
|
+
require_logger.logger.debug(`Generating ${strategy.id} tests`);
|
|
9566
9498
|
let strategyAction;
|
|
9567
9499
|
if (strategy.id.startsWith("file://")) strategyAction = (await require_providers.loadStrategy(strategy.id)).action;
|
|
9568
9500
|
else {
|
|
@@ -9572,7 +9504,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9572
9504
|
builtinStrategy = require_providers.Strategies.find((s) => s.id === baseStrategyId);
|
|
9573
9505
|
}
|
|
9574
9506
|
if (!builtinStrategy) {
|
|
9575
|
-
require_logger.
|
|
9507
|
+
require_logger.logger.warn(`Strategy ${strategy.id} not registered, skipping`);
|
|
9576
9508
|
continue;
|
|
9577
9509
|
}
|
|
9578
9510
|
strategyAction = builtinStrategy.action;
|
|
@@ -9581,7 +9513,7 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9581
9513
|
const applicableTestCases = testCases.filter((t) => {
|
|
9582
9514
|
if (!require_providers.pluginMatchesStrategyTargets(t, strategy.id, targetPlugins)) return false;
|
|
9583
9515
|
if (t.metadata?.retry === true) {
|
|
9584
|
-
require_logger.
|
|
9516
|
+
require_logger.logger.debug(`Skipping ${strategy.id} for retry test (plugin: ${t.metadata?.pluginId}) - retry tests are not transformed`);
|
|
9585
9517
|
return false;
|
|
9586
9518
|
}
|
|
9587
9519
|
return true;
|
|
@@ -9589,26 +9521,26 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
9589
9521
|
const numTestsLimit = strategy.config?.numTests;
|
|
9590
9522
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit >= 0) {
|
|
9591
9523
|
if (numTestsLimit === 0) {
|
|
9592
|
-
require_logger.
|
|
9524
|
+
require_logger.logger.warn(`[Strategy] ${strategy.id}: numTests=0 configured, skipping strategy`);
|
|
9593
9525
|
continue;
|
|
9594
9526
|
}
|
|
9595
9527
|
}
|
|
9596
9528
|
let testCasesToProcess = applicableTestCases;
|
|
9597
9529
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9598
9530
|
if (applicableTestCases.length > numTestsLimit) {
|
|
9599
|
-
require_logger.
|
|
9531
|
+
require_logger.logger.debug(`[Strategy] ${strategy.id}: Pre-limiting ${applicableTestCases.length} tests to numTests=${numTestsLimit}`);
|
|
9600
9532
|
testCasesToProcess = applicableTestCases.slice(0, numTestsLimit);
|
|
9601
9533
|
}
|
|
9602
9534
|
}
|
|
9603
9535
|
const strategyTestCases = await strategyAction(testCasesToProcess, injectVar, {
|
|
9604
9536
|
...strategy.config || {},
|
|
9605
|
-
redteamProvider: require_logger.
|
|
9537
|
+
redteamProvider: require_logger.state.config?.redteam?.provider,
|
|
9606
9538
|
excludeTargetOutputFromAgenticAttackGeneration
|
|
9607
9539
|
}, strategy.id);
|
|
9608
9540
|
let resultTestCases = strategyTestCases.filter((t) => t !== null && t !== void 0);
|
|
9609
9541
|
if (typeof numTestsLimit === "number" && Number.isFinite(numTestsLimit) && numTestsLimit > 0) {
|
|
9610
9542
|
if (resultTestCases.length > numTestsLimit) {
|
|
9611
|
-
require_logger.
|
|
9543
|
+
require_logger.logger.warn(`[Strategy] ${strategy.id}: Post-cap safety net applied (${resultTestCases.length} -> ${numTestsLimit}). Strategy generated more tests than input.`);
|
|
9612
9544
|
resultTestCases = resultTestCases.slice(0, numTestsLimit);
|
|
9613
9545
|
}
|
|
9614
9546
|
}
|
|
@@ -9755,11 +9687,11 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9755
9687
|
if (prompts.length === 0) throw new Error("Prompts array cannot be empty");
|
|
9756
9688
|
if (delay && maxConcurrency > 1) {
|
|
9757
9689
|
maxConcurrency = 1;
|
|
9758
|
-
require_logger.
|
|
9690
|
+
require_logger.logger.warn("Delay is enabled, setting max concurrency to 1.");
|
|
9759
9691
|
}
|
|
9760
9692
|
if (maxConcurrency > MAX_MAX_CONCURRENCY) {
|
|
9761
9693
|
maxConcurrency = MAX_MAX_CONCURRENCY;
|
|
9762
|
-
require_logger.
|
|
9694
|
+
require_logger.logger.info(`Max concurrency for test generation is capped at ${MAX_MAX_CONCURRENCY}.`);
|
|
9763
9695
|
}
|
|
9764
9696
|
const expandedStrategies = [];
|
|
9765
9697
|
strategies.forEach((strategy) => {
|
|
@@ -9771,7 +9703,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9771
9703
|
id: strategyId
|
|
9772
9704
|
});
|
|
9773
9705
|
});
|
|
9774
|
-
else require_logger.
|
|
9706
|
+
else require_logger.logger.warn(`Strategy collection ${strategy.id} has no mappings, skipping`);
|
|
9775
9707
|
} else expandedStrategies.push(strategy);
|
|
9776
9708
|
});
|
|
9777
9709
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -9786,7 +9718,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9786
9718
|
strategies = expandedStrategies.filter((strategy) => {
|
|
9787
9719
|
const key = keyForStrategy(strategy);
|
|
9788
9720
|
if (seen.has(key)) {
|
|
9789
|
-
require_logger.
|
|
9721
|
+
require_logger.logger.debug(`[Synthesize] Skipping duplicate strategy: ${key}`);
|
|
9790
9722
|
return false;
|
|
9791
9723
|
}
|
|
9792
9724
|
seen.add(key);
|
|
@@ -9797,7 +9729,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9797
9729
|
await validateSharpDependency(strategies, plugins);
|
|
9798
9730
|
const redteamProvider = await require_providers.redteamProviderManager.getProvider({ provider });
|
|
9799
9731
|
const { effectiveStrategyCount, includeBasicTests, totalPluginTests, totalTests } = calculateTotalTests(plugins, strategies, language);
|
|
9800
|
-
require_logger.
|
|
9732
|
+
require_logger.logger.info(`Synthesizing test cases for ${prompts.length} ${prompts.length === 1 ? "prompt" : "prompts"}...\nUsing plugins:\n\n${chalk.default.yellow(plugins.map((p) => {
|
|
9801
9733
|
const pluginLanguageConfig = p.config?.language ?? language;
|
|
9802
9734
|
const pluginLanguageCount = Array.isArray(pluginLanguageConfig) ? pluginLanguageConfig.length : 1;
|
|
9803
9735
|
const actualTestCount = (p.numTests || 0) * pluginLanguageCount;
|
|
@@ -9815,14 +9747,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9815
9747
|
configSummary = policyText.length > 70 ? policyText.slice(0, 70) + "..." : policyText;
|
|
9816
9748
|
}
|
|
9817
9749
|
} else configSummary = " (custom config)";
|
|
9818
|
-
require_logger.
|
|
9750
|
+
require_logger.logger.debug("Plugin config", {
|
|
9819
9751
|
pluginId: p.id,
|
|
9820
9752
|
config: p.config
|
|
9821
9753
|
});
|
|
9822
9754
|
}
|
|
9823
9755
|
return `${p.id} (${formatTestCount(actualTestCount, false)})${configSummary}`;
|
|
9824
9756
|
}).sort().join("\n"))}\n`);
|
|
9825
|
-
if (strategies.length > 0) require_logger.
|
|
9757
|
+
if (strategies.length > 0) require_logger.logger.info(`Using strategies:\n\n${chalk.default.yellow(strategies.filter((s) => !["basic", "retry"].includes(s.id)).map((s) => {
|
|
9826
9758
|
let testCount = totalPluginTests;
|
|
9827
9759
|
let n = 1;
|
|
9828
9760
|
if (typeof s.config?.n === "number") n = s.config.n;
|
|
@@ -9832,21 +9764,21 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9832
9764
|
if (typeof numTestsCap === "number" && Number.isFinite(numTestsCap) && numTestsCap >= 0) testCount = Math.min(testCount, numTestsCap);
|
|
9833
9765
|
return `${s.id} (${formatTestCount(testCount, true)})`;
|
|
9834
9766
|
}).sort().join("\n"))}\n`);
|
|
9835
|
-
require_logger.
|
|
9767
|
+
require_logger.logger.info(chalk.default.bold(`Test Generation Summary:`) + `\n• Total tests: ${chalk.default.cyan(totalTests)}\n• Plugin tests: ${chalk.default.cyan(totalPluginTests)}\n• Plugins: ${chalk.default.cyan(plugins.length)}\n• Strategies: ${chalk.default.cyan(effectiveStrategyCount)}\n• Max concurrency: ${chalk.default.cyan(maxConcurrency)}\n` + (delay ? `• Delay: ${chalk.default.cyan(delay)}\n` : ""));
|
|
9836
9768
|
const hasMultipleInputs = inputs && Object.keys(inputs).length > 0;
|
|
9837
9769
|
if (hasMultipleInputs) {
|
|
9838
9770
|
const inputKeys = Object.keys(inputs);
|
|
9839
|
-
require_logger.
|
|
9771
|
+
require_logger.logger.info(`Using multi-input mode with ${inputKeys.length} variables: ${inputKeys.join(", ")}`);
|
|
9840
9772
|
injectVar = require_types.MULTI_INPUT_VAR;
|
|
9841
9773
|
const multiInputExcluded = [...require_types.DATASET_EXEMPT_PLUGINS, ...require_types.MULTI_INPUT_EXCLUDED_PLUGINS];
|
|
9842
9774
|
const removedPlugins = plugins.filter((p) => multiInputExcluded.includes(p.id));
|
|
9843
9775
|
plugins = plugins.filter((p) => !multiInputExcluded.includes(p.id));
|
|
9844
|
-
if (removedPlugins.length > 0) require_logger.
|
|
9776
|
+
if (removedPlugins.length > 0) require_logger.logger.info(`Skipping ${removedPlugins.length} plugin${removedPlugins.length > 1 ? "s" : ""} in multi-input mode: ${removedPlugins.map((p) => p.id).join(", ")}`);
|
|
9845
9777
|
}
|
|
9846
9778
|
if (typeof injectVar !== "string") {
|
|
9847
9779
|
const parsedVars = require_util.extractVariablesFromTemplates(prompts);
|
|
9848
|
-
if (parsedVars.length > 1) require_logger.
|
|
9849
|
-
else if (parsedVars.length === 0) require_logger.
|
|
9780
|
+
if (parsedVars.length > 1) require_logger.logger.warn(`\nMultiple variables found in prompts: ${parsedVars.join(", ")}. Using the last one "${parsedVars[parsedVars.length - 1]}". Override this selection with --injectVar`);
|
|
9781
|
+
else if (parsedVars.length === 0) require_logger.logger.warn("No variables found in prompts. Using \"query\" as the inject variable.");
|
|
9850
9782
|
injectVar = parsedVars[parsedVars.length - 1] || "query";
|
|
9851
9783
|
require_invariant.invariant(typeof injectVar === "string", `Inject var must be a string, got ${injectVar}`);
|
|
9852
9784
|
}
|
|
@@ -9880,7 +9812,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9880
9812
|
if (Object.keys(categories).includes(plugin.id)) return false;
|
|
9881
9813
|
const registeredPlugin = Plugins.find((p) => p.key === plugin.id);
|
|
9882
9814
|
if (!registeredPlugin) {
|
|
9883
|
-
if (!plugin.id.startsWith("file://")) require_logger.
|
|
9815
|
+
if (!plugin.id.startsWith("file://")) require_logger.logger.debug(`Plugin ${plugin.id} not registered, skipping validation`);
|
|
9884
9816
|
} else if (registeredPlugin.validate) try {
|
|
9885
9817
|
registeredPlugin.validate({
|
|
9886
9818
|
language,
|
|
@@ -9891,24 +9823,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9891
9823
|
...resolvePluginConfig(plugin.config)
|
|
9892
9824
|
});
|
|
9893
9825
|
} catch (error) {
|
|
9894
|
-
require_logger.
|
|
9826
|
+
require_logger.logger.warn(`Validation failed for plugin ${plugin.id}: ${error}, skipping plugin.`);
|
|
9895
9827
|
return false;
|
|
9896
9828
|
}
|
|
9897
9829
|
return true;
|
|
9898
9830
|
};
|
|
9899
|
-
require_logger.
|
|
9831
|
+
require_logger.logger.debug("Validating plugins...");
|
|
9900
9832
|
plugins = [...new Set(expandedPlugins)].filter(validatePlugin).sort();
|
|
9901
9833
|
if (require_server.shouldGenerateRemote()) {
|
|
9902
9834
|
const healthUrl = require_server.getRemoteHealthUrl();
|
|
9903
9835
|
if (healthUrl) {
|
|
9904
|
-
require_logger.
|
|
9836
|
+
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
9905
9837
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
9906
9838
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with test generation: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
9907
|
-
require_logger.
|
|
9839
|
+
require_logger.logger.debug("API health check passed");
|
|
9908
9840
|
}
|
|
9909
9841
|
}
|
|
9910
9842
|
let progressBar = null;
|
|
9911
|
-
const showProgressBar = !Boolean(require_logger.
|
|
9843
|
+
const showProgressBar = !Boolean(require_logger.state.webUI) && require_logger.getEnvString("LOG_LEVEL") !== "debug" && require_logger.getLogLevel() !== "debug" && showProgressBarOverride !== false;
|
|
9912
9844
|
if (showProgressBar) {
|
|
9913
9845
|
progressBar = new cli_progress.default.SingleBar({
|
|
9914
9846
|
format: "Generating | {bar} | {percentage}% | {value}/{total} | {task}",
|
|
@@ -9917,24 +9849,24 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9917
9849
|
progressBar.start(totalTests, 0, { task: "Initializing" });
|
|
9918
9850
|
}
|
|
9919
9851
|
if (showProgressBar) progressBar?.update({ task: "Extracting system purpose" });
|
|
9920
|
-
else require_logger.
|
|
9852
|
+
else require_logger.logger.info("Extracting system purpose...");
|
|
9921
9853
|
const purpose = purposeOverride || await extractSystemPurpose(redteamProvider, prompts);
|
|
9922
9854
|
if (showProgressBar) progressBar?.update({ task: "Extracting entities" });
|
|
9923
|
-
else require_logger.
|
|
9855
|
+
else require_logger.logger.info("Extracting entities...");
|
|
9924
9856
|
const entities = Array.isArray(entitiesOverride) ? entitiesOverride : await extractEntities(redteamProvider, prompts);
|
|
9925
|
-
require_logger.
|
|
9857
|
+
require_logger.logger.debug(`System purpose: ${purpose}`);
|
|
9926
9858
|
const pluginResults = {};
|
|
9927
9859
|
const testCases = [];
|
|
9928
9860
|
await async.default.forEachLimit(plugins, maxConcurrency, async (plugin) => {
|
|
9929
9861
|
checkAbort();
|
|
9930
9862
|
if (showProgressBar) progressBar?.update({ task: plugin.id });
|
|
9931
|
-
else require_logger.
|
|
9863
|
+
else require_logger.logger.info(`Generating tests for ${plugin.id}...`);
|
|
9932
9864
|
const { action } = Plugins.find((p) => p.key === plugin.id) || {};
|
|
9933
9865
|
if (action) {
|
|
9934
|
-
require_logger.
|
|
9866
|
+
require_logger.logger.debug(`Generating tests for ${plugin.id}...`);
|
|
9935
9867
|
const languageConfig = plugin.config?.language ?? language;
|
|
9936
9868
|
const languages = Array.isArray(languageConfig) ? languageConfig : languageConfig ? [languageConfig] : [void 0];
|
|
9937
|
-
require_logger.
|
|
9869
|
+
require_logger.logger.debug(`[Language Processing] Plugin: ${plugin.id}, Languages: ${JSON.stringify(languages)}, NumTests per language: ${plugin.numTests}${plugin.config?.language ? " (plugin override)" : ""}`);
|
|
9938
9870
|
const allPluginTests = [];
|
|
9939
9871
|
const resultsPerLanguage = {};
|
|
9940
9872
|
const languagePromises = languages.map(async (lang) => {
|
|
@@ -9962,7 +9894,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9962
9894
|
requested: plugin.numTests,
|
|
9963
9895
|
generated: pluginTests.length
|
|
9964
9896
|
};
|
|
9965
|
-
require_logger.
|
|
9897
|
+
require_logger.logger.warn(`[Language Processing] No tests generated for ${plugin.id} in language: ${lang || "default"}`);
|
|
9966
9898
|
return {
|
|
9967
9899
|
lang: langKey,
|
|
9968
9900
|
tests: [],
|
|
@@ -9979,13 +9911,13 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9979
9911
|
requested,
|
|
9980
9912
|
generated
|
|
9981
9913
|
};
|
|
9982
|
-
} else require_logger.
|
|
9983
|
-
require_logger.
|
|
9984
|
-
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) require_logger.
|
|
9914
|
+
} else require_logger.logger.warn(`[Language Processing] Error generating tests for ${plugin.id}: ${result.reason}`);
|
|
9915
|
+
require_logger.logger.debug(`[Language Processing] Total tests generated for ${plugin.id}: ${allPluginTests.length} (across ${languages.length} language(s))`);
|
|
9916
|
+
if (!Array.isArray(allPluginTests) || allPluginTests.length === 0) require_logger.logger.warn(`Failed to generate tests for ${plugin.id}`);
|
|
9985
9917
|
else {
|
|
9986
9918
|
const testCasesWithMetadata = allPluginTests;
|
|
9987
9919
|
if (needsGoalExtraction) {
|
|
9988
|
-
require_logger.
|
|
9920
|
+
require_logger.logger.debug(`Extracting goal for ${testCasesWithMetadata.length} tests from ${plugin.id}...`);
|
|
9989
9921
|
for (const testCase of testCasesWithMetadata) {
|
|
9990
9922
|
const promptVar = testCase.vars?.[injectVar];
|
|
9991
9923
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -9997,8 +9929,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
9997
9929
|
testCases.push(...testCasesWithMetadata);
|
|
9998
9930
|
}
|
|
9999
9931
|
if (showProgressBar) progressBar?.increment(plugin.numTests * languages.length);
|
|
10000
|
-
else require_logger.
|
|
10001
|
-
require_logger.
|
|
9932
|
+
else require_logger.logger.info(`Generated ${allPluginTests.length} tests for ${plugin.id}`);
|
|
9933
|
+
require_logger.logger.debug(`Added ${allPluginTests.length} ${plugin.id} test cases`);
|
|
10002
9934
|
const definedLanguages = languages.filter((lang) => lang !== void 0);
|
|
10003
9935
|
const baseDisplayId = getPluginDisplayId(plugin);
|
|
10004
9936
|
if (definedLanguages.length > 1) for (const [langKey, result] of Object.entries(resultsPerLanguage)) {
|
|
@@ -10028,7 +9960,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10028
9960
|
}
|
|
10029
9961
|
}));
|
|
10030
9962
|
if (needsGoalExtraction) {
|
|
10031
|
-
require_logger.
|
|
9963
|
+
require_logger.logger.debug(`Extracting goal for ${testCasesWithMetadata.length} custom tests from ${plugin.id}...`);
|
|
10032
9964
|
for (const testCase of testCasesWithMetadata) {
|
|
10033
9965
|
const promptVar = testCase.vars?.[injectVar];
|
|
10034
9966
|
const prompt = Array.isArray(promptVar) ? promptVar[0] : String(promptVar);
|
|
@@ -10038,14 +9970,14 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10038
9970
|
}
|
|
10039
9971
|
}
|
|
10040
9972
|
testCases.push(...testCasesWithMetadata);
|
|
10041
|
-
require_logger.
|
|
9973
|
+
require_logger.logger.debug(`Added ${customTests.length} custom test cases from ${plugin.id}`);
|
|
10042
9974
|
const displayId = getPluginDisplayId(plugin);
|
|
10043
9975
|
pluginResults[displayId] = {
|
|
10044
9976
|
requested: plugin.numTests,
|
|
10045
9977
|
generated: customTests.length
|
|
10046
9978
|
};
|
|
10047
9979
|
} catch (e) {
|
|
10048
|
-
require_logger.
|
|
9980
|
+
require_logger.logger.error(`Error generating tests for custom plugin ${plugin.id}: ${e}`);
|
|
10049
9981
|
const displayId = getPluginDisplayId(plugin);
|
|
10050
9982
|
pluginResults[displayId] = {
|
|
10051
9983
|
requested: plugin.numTests,
|
|
@@ -10053,7 +9985,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10053
9985
|
};
|
|
10054
9986
|
}
|
|
10055
9987
|
else {
|
|
10056
|
-
require_logger.
|
|
9988
|
+
require_logger.logger.warn(`Plugin ${plugin.id} not registered, skipping`);
|
|
10057
9989
|
const displayId = getPluginDisplayId(plugin);
|
|
10058
9990
|
pluginResults[displayId] = {
|
|
10059
9991
|
requested: plugin.numTests,
|
|
@@ -10067,7 +9999,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10067
9999
|
const retryStrategy = strategies.find((s) => s.id === "retry");
|
|
10068
10000
|
if (retryStrategy) {
|
|
10069
10001
|
if (showProgressBar) progressBar?.update({ task: "Applying retry strategy" });
|
|
10070
|
-
require_logger.
|
|
10002
|
+
require_logger.logger.debug("Applying retry strategy first");
|
|
10071
10003
|
retryStrategy.config = {
|
|
10072
10004
|
targetIds,
|
|
10073
10005
|
...retryStrategy.config
|
|
@@ -10087,8 +10019,8 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10087
10019
|
checkAbort();
|
|
10088
10020
|
progressBar?.update({ task: "Done." });
|
|
10089
10021
|
progressBar?.stop();
|
|
10090
|
-
if (progressBar) require_logger.
|
|
10091
|
-
require_logger.
|
|
10022
|
+
if (progressBar) require_logger.logger.info("");
|
|
10023
|
+
require_logger.logger.info(generateReport(pluginResults, strategyResults));
|
|
10092
10024
|
const failedPlugins = Object.entries(pluginResults).filter(([_, { requested, generated }]) => requested > 0 && generated === 0).map(([pluginId, { requested }]) => ({
|
|
10093
10025
|
pluginId,
|
|
10094
10026
|
requested
|
|
@@ -10101,7 +10033,6 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
10101
10033
|
failedPlugins
|
|
10102
10034
|
};
|
|
10103
10035
|
}
|
|
10104
|
-
|
|
10105
10036
|
//#endregion
|
|
10106
10037
|
//#region src/redteam/commands/generate.ts
|
|
10107
10038
|
/**
|
|
@@ -10128,8 +10059,8 @@ function handleFailedPlugins(failedPlugins, strict) {
|
|
|
10128
10059
|
- Retry the scan after resolving any reported errors
|
|
10129
10060
|
`;
|
|
10130
10061
|
if (strict) throw new require_types.PartialGenerationError(failedPlugins);
|
|
10131
|
-
require_logger.
|
|
10132
|
-
require_logger.
|
|
10062
|
+
require_logger.logger.warn(warningMessage);
|
|
10063
|
+
require_logger.logger.warn(chalk.default.yellow(`Continuing with partial results. Use ${chalk.default.bold("--strict")} flag to fail on plugin generation errors.`));
|
|
10133
10064
|
}
|
|
10134
10065
|
function getConfigHash(configPath) {
|
|
10135
10066
|
const content = fs.readFileSync(configPath, "utf8");
|
|
@@ -10156,12 +10087,12 @@ function createHeaderComments({ title, timestampLabel, author, cloudHost, testCa
|
|
|
10156
10087
|
async function doGenerateRedteam(options) {
|
|
10157
10088
|
require_util.setupEnv(options.envFile);
|
|
10158
10089
|
if (!options.cache) {
|
|
10159
|
-
require_logger.
|
|
10090
|
+
require_logger.logger.info("Cache is disabled");
|
|
10160
10091
|
require_cache.disableCache();
|
|
10161
10092
|
}
|
|
10162
10093
|
const probeLimitResult = checkRedteamProbeLimit();
|
|
10163
10094
|
if (!probeLimitResult.withinLimit) {
|
|
10164
|
-
require_logger.
|
|
10095
|
+
require_logger.logger.error(dedent.default`
|
|
10165
10096
|
${chalk.default.red.bold("Monthly probe limit reached")}
|
|
10166
10097
|
|
|
10167
10098
|
You've used ${chalk.default.bold(probeLimitResult.used.toLocaleString())} of your ${chalk.default.bold(MONTHLY_PROBE_LIMIT.toLocaleString())} free monthly probes.
|
|
@@ -10187,7 +10118,7 @@ async function doGenerateRedteam(options) {
|
|
|
10187
10118
|
fs.mkdirSync(path.default.dirname(tmpFile), { recursive: true });
|
|
10188
10119
|
fs.writeFileSync(tmpFile, js_yaml.default.dump(options.configFromCloud));
|
|
10189
10120
|
configPath = tmpFile;
|
|
10190
|
-
require_logger.
|
|
10121
|
+
require_logger.logger.debug(`Using Promptfoo Cloud-originated config at ${tmpFile}`);
|
|
10191
10122
|
}
|
|
10192
10123
|
let shouldGenerate = options.force || options.configFromCloud;
|
|
10193
10124
|
if (!options.force && !options.configFromCloud && fs.existsSync(outputPath) && configPath && fs.existsSync(configPath)) {
|
|
@@ -10195,7 +10126,7 @@ async function doGenerateRedteam(options) {
|
|
|
10195
10126
|
const redteamContent = js_yaml.default.load(fs.readFileSync(outputPath, "utf8"));
|
|
10196
10127
|
shouldGenerate = redteamContent.metadata?.configHash !== getConfigHash(configPath);
|
|
10197
10128
|
if (!shouldGenerate) {
|
|
10198
|
-
require_logger.
|
|
10129
|
+
require_logger.logger.warn("No changes detected in redteam configuration. Skipping generation (use --force to generate anyway)");
|
|
10199
10130
|
return redteamContent;
|
|
10200
10131
|
}
|
|
10201
10132
|
}
|
|
@@ -10209,7 +10140,7 @@ async function doGenerateRedteam(options) {
|
|
|
10209
10140
|
commandLineOptions = resolved.commandLineOptions;
|
|
10210
10141
|
resolvedConfig = resolved.config;
|
|
10211
10142
|
await require_providers.checkCloudPermissions(resolved.config);
|
|
10212
|
-
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) require_logger.
|
|
10143
|
+
if (redteamConfig && resolved.testSuite.tests && resolved.testSuite.tests.length > 0) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
10213
10144
|
⚠️ Warning: Found both 'tests' section and 'redteam' configuration in your config file.
|
|
10214
10145
|
|
|
10215
10146
|
The 'tests' section is ignored when generating red team tests. Red team automatically
|
|
@@ -10231,7 +10162,7 @@ async function doGenerateRedteam(options) {
|
|
|
10231
10162
|
}
|
|
10232
10163
|
}
|
|
10233
10164
|
} catch (error) {
|
|
10234
|
-
require_logger.
|
|
10165
|
+
require_logger.logger.error(`Plugin severity override check failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
10235
10166
|
}
|
|
10236
10167
|
} else if (options.purpose) testSuite = {
|
|
10237
10168
|
prompts: [],
|
|
@@ -10239,18 +10170,18 @@ async function doGenerateRedteam(options) {
|
|
|
10239
10170
|
tests: []
|
|
10240
10171
|
};
|
|
10241
10172
|
else {
|
|
10242
|
-
require_logger.
|
|
10173
|
+
require_logger.logger.info(chalk.default.red(`\nCan't generate without configuration - run ${chalk.default.yellow.bold(promptfooCommand("redteam init"))} first`));
|
|
10243
10174
|
return null;
|
|
10244
10175
|
}
|
|
10245
10176
|
if (!require_server.neverGenerateRemote()) {
|
|
10246
10177
|
let hasValidEmail = false;
|
|
10247
10178
|
while (!hasValidEmail) {
|
|
10248
10179
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
10249
|
-
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
10180
|
+
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
10250
10181
|
}
|
|
10251
10182
|
}
|
|
10252
10183
|
const startTime = Date.now();
|
|
10253
|
-
require_telemetry.
|
|
10184
|
+
require_telemetry.telemetry.record("command_used", {
|
|
10254
10185
|
name: "generate redteam - started",
|
|
10255
10186
|
numPrompts: testSuite.prompts.length,
|
|
10256
10187
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10258,7 +10189,7 @@ async function doGenerateRedteam(options) {
|
|
|
10258
10189
|
strategies: redteamConfig?.strategies?.map((s) => typeof s === "string" ? s : s.id) || [],
|
|
10259
10190
|
isPromptfooSampleTarget: testSuite.providers.some(require_fetch.isPromptfooSampleTarget)
|
|
10260
10191
|
});
|
|
10261
|
-
require_telemetry.
|
|
10192
|
+
require_telemetry.telemetry.record("redteam generate", {
|
|
10262
10193
|
phase: "started",
|
|
10263
10194
|
numPrompts: testSuite.prompts.length,
|
|
10264
10195
|
numTestsExisting: (testSuite.tests || []).length,
|
|
@@ -10302,7 +10233,7 @@ async function doGenerateRedteam(options) {
|
|
|
10302
10233
|
}
|
|
10303
10234
|
return plugin;
|
|
10304
10235
|
});
|
|
10305
|
-
require_logger.
|
|
10236
|
+
require_logger.logger.info(`Applied ${intersectionCount} custom plugin severity levels`);
|
|
10306
10237
|
}
|
|
10307
10238
|
const policyPluginsWithRefs = plugins.filter((plugin) => plugin.config?.policy && require_graders.isValidPolicyObject(plugin.config?.policy) && require_graders.determinePolicyTypeFromId(plugin.config.policy.id) === "reusable");
|
|
10308
10239
|
if (policyPluginsWithRefs.length > 0) {
|
|
@@ -10325,18 +10256,18 @@ async function doGenerateRedteam(options) {
|
|
|
10325
10256
|
if (options.strategies) strategies = options.strategies;
|
|
10326
10257
|
const strategyObjs = strategies.map((s) => typeof s === "string" ? { id: s } : s);
|
|
10327
10258
|
try {
|
|
10328
|
-
require_logger.
|
|
10329
|
-
require_logger.
|
|
10259
|
+
require_logger.logger.debug(`plugins: ${plugins.map((p) => p.id).join(", ")}`);
|
|
10260
|
+
require_logger.logger.debug(`strategies: ${strategyObjs.map((s) => s.id ?? s).join(", ")}`);
|
|
10330
10261
|
} catch (error) {
|
|
10331
|
-
require_logger.
|
|
10332
|
-
require_logger.
|
|
10262
|
+
require_logger.logger.error("Error logging plugins and strategies. One did not have a valid id.");
|
|
10263
|
+
require_logger.logger.error(`Error details: ${error instanceof Error ? error.message : String(error)}`);
|
|
10333
10264
|
}
|
|
10334
10265
|
const targetInputs = testSuite.providers[0]?.inputs;
|
|
10335
10266
|
const config = {
|
|
10336
10267
|
injectVar: redteamConfig?.injectVar || options.injectVar,
|
|
10337
10268
|
inputs: targetInputs,
|
|
10338
10269
|
language: redteamConfig?.language || options.language,
|
|
10339
|
-
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ??
|
|
10270
|
+
maxConcurrency: options.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? 4,
|
|
10340
10271
|
numTests: redteamConfig?.numTests ?? options.numTests,
|
|
10341
10272
|
entities: redteamConfig?.entities,
|
|
10342
10273
|
plugins,
|
|
@@ -10357,18 +10288,18 @@ async function doGenerateRedteam(options) {
|
|
|
10357
10288
|
if (typeof target === "string") return target;
|
|
10358
10289
|
return target.id;
|
|
10359
10290
|
}).filter((id) => typeof id === "string") : []) ?? [];
|
|
10360
|
-
require_logger.
|
|
10291
|
+
require_logger.logger.debug(`Extracted ${targetIds.length} target IDs from config providers: ${JSON.stringify(targetIds)}`);
|
|
10361
10292
|
let enhancedPurpose = parsedConfig.data.purpose || "";
|
|
10362
10293
|
let augmentedTestGenerationInstructions = config.testGenerationInstructions ?? "";
|
|
10363
10294
|
try {
|
|
10364
10295
|
const mcpToolsInfo = await extractMcpToolsInfo(testSuite.providers);
|
|
10365
10296
|
if (mcpToolsInfo) {
|
|
10366
10297
|
enhancedPurpose = enhancedPurpose ? `${enhancedPurpose}\n\n${mcpToolsInfo}\n\n` : mcpToolsInfo;
|
|
10367
|
-
require_logger.
|
|
10298
|
+
require_logger.logger.info("Added MCP tools information to red team purpose");
|
|
10368
10299
|
augmentedTestGenerationInstructions += `\nGenerate every test case prompt as a json string encoding the tool call and parameters, and choose a specific function to call. The specific format should be: {"tool": "function_name", "args": {...}}.`;
|
|
10369
10300
|
}
|
|
10370
10301
|
} catch (error) {
|
|
10371
|
-
require_logger.
|
|
10302
|
+
require_logger.logger.warn(`Failed to extract MCP tools information: ${error instanceof Error ? error.message : String(error)}`);
|
|
10372
10303
|
}
|
|
10373
10304
|
const contexts = redteamConfig?.contexts;
|
|
10374
10305
|
let redteamTests = [];
|
|
@@ -10377,10 +10308,10 @@ async function doGenerateRedteam(options) {
|
|
|
10377
10308
|
let finalInjectVar = "";
|
|
10378
10309
|
let failedPlugins = [];
|
|
10379
10310
|
if (contexts && contexts.length > 0) {
|
|
10380
|
-
require_logger.
|
|
10311
|
+
require_logger.logger.info(`Generating tests for ${contexts.length} contexts...`);
|
|
10381
10312
|
const allFailedPlugins = [];
|
|
10382
10313
|
for (const context of contexts) {
|
|
10383
|
-
require_logger.
|
|
10314
|
+
require_logger.logger.info(` Generating tests for context: ${context.id}`);
|
|
10384
10315
|
const contextPurpose = context.purpose + (enhancedPurpose ? `\n\n${enhancedPurpose}` : "");
|
|
10385
10316
|
const contextResult = await synthesize({
|
|
10386
10317
|
...parsedConfig.data,
|
|
@@ -10415,7 +10346,7 @@ async function doGenerateRedteam(options) {
|
|
|
10415
10346
|
}
|
|
10416
10347
|
failedPlugins = allFailedPlugins;
|
|
10417
10348
|
purpose = contexts[0].purpose;
|
|
10418
|
-
require_logger.
|
|
10349
|
+
require_logger.logger.info(`Generated ${redteamTests.length} total test cases across ${contexts.length} contexts`);
|
|
10419
10350
|
} else {
|
|
10420
10351
|
const result = await synthesize({
|
|
10421
10352
|
...parsedConfig.data,
|
|
@@ -10444,20 +10375,20 @@ async function doGenerateRedteam(options) {
|
|
|
10444
10375
|
*/
|
|
10445
10376
|
const cleanupProvider = async () => {
|
|
10446
10377
|
try {
|
|
10447
|
-
require_logger.
|
|
10378
|
+
require_logger.logger.debug("Cleaning up provider");
|
|
10448
10379
|
const provider = testSuite.providers[0];
|
|
10449
10380
|
if (provider && typeof provider.cleanup === "function") {
|
|
10450
10381
|
const cleanupResult = provider.cleanup();
|
|
10451
10382
|
if (cleanupResult instanceof Promise) await cleanupResult;
|
|
10452
10383
|
}
|
|
10453
10384
|
} catch (cleanupErr) {
|
|
10454
|
-
require_logger.
|
|
10385
|
+
require_logger.logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
10455
10386
|
}
|
|
10456
10387
|
};
|
|
10457
10388
|
try {
|
|
10458
10389
|
handleFailedPlugins(failedPlugins, options.strict ?? false);
|
|
10459
10390
|
if (redteamTests.length === 0) {
|
|
10460
|
-
require_logger.
|
|
10391
|
+
require_logger.logger.warn("No test cases generated. Please check for errors and try again.");
|
|
10461
10392
|
return null;
|
|
10462
10393
|
}
|
|
10463
10394
|
const updatedRedteamConfig = {
|
|
@@ -10476,7 +10407,7 @@ async function doGenerateRedteam(options) {
|
|
|
10476
10407
|
return encodeURIComponent(value);
|
|
10477
10408
|
}).filter((line) => line.length > 0).join("\n");
|
|
10478
10409
|
fs.writeFileSync(options.output, outputLines);
|
|
10479
|
-
require_logger.
|
|
10410
|
+
require_logger.logger.info(chalk.default.green(`Wrote ${redteamTests.length} test cases to ${chalk.default.bold(options.output)}`));
|
|
10480
10411
|
return {};
|
|
10481
10412
|
} else if (options.output) {
|
|
10482
10413
|
const existingYaml = configPath ? js_yaml.default.load(fs.readFileSync(configPath, "utf8")) : {};
|
|
@@ -10515,8 +10446,8 @@ async function doGenerateRedteam(options) {
|
|
|
10515
10446
|
ret = writePromptfooConfig(updatedYaml, options.output, headerComments);
|
|
10516
10447
|
require_util.printBorder();
|
|
10517
10448
|
const relativeOutputPath = path.default.relative(process.cwd(), options.output);
|
|
10518
|
-
require_logger.
|
|
10519
|
-
if (!options.inRedteamRun) require_logger.
|
|
10449
|
+
require_logger.logger.info(`Wrote ${redteamTests.length} test cases to ${relativeOutputPath}`);
|
|
10450
|
+
if (!options.inRedteamRun) require_logger.logger.info("\n" + chalk.default.green(`Run ${chalk.default.bold(relativeOutputPath === "redteam.yaml" ? promptfooCommand("redteam eval") : promptfooCommand(`redteam eval -c ${relativeOutputPath}`))} to run the red team!`));
|
|
10520
10451
|
require_util.printBorder();
|
|
10521
10452
|
} else if (options.write && configPath) {
|
|
10522
10453
|
const existingConfig = js_yaml.default.load(fs.readFileSync(configPath, "utf8"));
|
|
@@ -10554,9 +10485,9 @@ async function doGenerateRedteam(options) {
|
|
|
10554
10485
|
isUpdate: true
|
|
10555
10486
|
});
|
|
10556
10487
|
ret = writePromptfooConfig(existingConfig, configPath, headerComments);
|
|
10557
|
-
require_logger.
|
|
10488
|
+
require_logger.logger.info(`\nWrote ${redteamTests.length} new test cases to ${path.default.relative(process.cwd(), configPath)}`);
|
|
10558
10489
|
const command = configPath.endsWith("promptfooconfig.yaml") ? promptfooCommand("eval") : promptfooCommand(`eval -c ${path.default.relative(process.cwd(), configPath)}`);
|
|
10559
|
-
require_logger.
|
|
10490
|
+
require_logger.logger.info("\n" + chalk.default.green(`Run ${chalk.default.bold(`${command}`)} to run the red team!`));
|
|
10560
10491
|
} else {
|
|
10561
10492
|
const headerComments = createHeaderComments({
|
|
10562
10493
|
title: "REDTEAM CONFIGURATION",
|
|
@@ -10572,7 +10503,7 @@ async function doGenerateRedteam(options) {
|
|
|
10572
10503
|
tests: redteamTests
|
|
10573
10504
|
}, "redteam.yaml", headerComments);
|
|
10574
10505
|
}
|
|
10575
|
-
require_telemetry.
|
|
10506
|
+
require_telemetry.telemetry.record("command_used", {
|
|
10576
10507
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10577
10508
|
name: "generate redteam",
|
|
10578
10509
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10582,7 +10513,7 @@ async function doGenerateRedteam(options) {
|
|
|
10582
10513
|
strategies: strategies.map((s) => typeof s === "string" ? s : s.id),
|
|
10583
10514
|
isPromptfooSampleTarget: testSuite.providers.some(require_fetch.isPromptfooSampleTarget)
|
|
10584
10515
|
});
|
|
10585
|
-
require_telemetry.
|
|
10516
|
+
require_telemetry.telemetry.record("redteam generate", {
|
|
10586
10517
|
phase: "completed",
|
|
10587
10518
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
10588
10519
|
numPrompts: testSuite.prompts.length,
|
|
@@ -10597,7 +10528,6 @@ async function doGenerateRedteam(options) {
|
|
|
10597
10528
|
await cleanupProvider();
|
|
10598
10529
|
}
|
|
10599
10530
|
}
|
|
10600
|
-
|
|
10601
10531
|
//#endregion
|
|
10602
10532
|
//#region src/util/inlineBlobsForShare.ts
|
|
10603
10533
|
const BLOB_URI_PREFIX = "promptfoo://blob/";
|
|
@@ -10663,7 +10593,7 @@ async function ensureBlobPayloads(hashes, cache) {
|
|
|
10663
10593
|
dataUrl: `data:${mimeType};base64,${base64}`
|
|
10664
10594
|
});
|
|
10665
10595
|
} catch (error) {
|
|
10666
|
-
require_logger.
|
|
10596
|
+
require_logger.logger.warn("[Share] Failed to inline blob reference", {
|
|
10667
10597
|
error,
|
|
10668
10598
|
hash
|
|
10669
10599
|
});
|
|
@@ -10709,7 +10639,6 @@ async function inlineBlobRefsForShare(value, cache) {
|
|
|
10709
10639
|
await ensureBlobPayloads(hashes, cache);
|
|
10710
10640
|
return await inlineValue(value, cache, /* @__PURE__ */ new WeakSet(), 0);
|
|
10711
10641
|
}
|
|
10712
|
-
|
|
10713
10642
|
//#endregion
|
|
10714
10643
|
//#region src/share.ts
|
|
10715
10644
|
function isSharingEnabled(evalRecord) {
|
|
@@ -10723,10 +10652,10 @@ function isSharingEnabled(evalRecord) {
|
|
|
10723
10652
|
}
|
|
10724
10653
|
function determineShareDomain(eval_) {
|
|
10725
10654
|
const sharing = eval_.config.sharing;
|
|
10726
|
-
require_logger.
|
|
10655
|
+
require_logger.logger.debug(`Share config: isCloudEnabled=${require_fetch.cloudConfig.isEnabled()}, sharing=${JSON.stringify(sharing)}, evalId=${eval_.id}`);
|
|
10727
10656
|
const envAppBaseUrl = require_logger.getEnvString("PROMPTFOO_REMOTE_APP_BASE_URL");
|
|
10728
10657
|
const domain = require_fetch.cloudConfig.isEnabled() ? require_fetch.cloudConfig.getAppUrl() : typeof sharing === "object" && sharing.appBaseUrl ? sharing.appBaseUrl : envAppBaseUrl || require_fetch.getDefaultShareViewBaseUrl();
|
|
10729
|
-
require_logger.
|
|
10658
|
+
require_logger.logger.debug(`Share domain determined: domain=${domain}`);
|
|
10730
10659
|
return { domain };
|
|
10731
10660
|
}
|
|
10732
10661
|
function getResultSize(result) {
|
|
@@ -10736,7 +10665,7 @@ function findLargestResultSize(results, sampleSize = 1e3) {
|
|
|
10736
10665
|
const sampleSizes = results.slice(0, Math.min(sampleSize, results.length)).map(getResultSize);
|
|
10737
10666
|
return Math.max(...sampleSizes);
|
|
10738
10667
|
}
|
|
10739
|
-
async function sendEvalRecord(evalRecord, url, headers) {
|
|
10668
|
+
async function sendEvalRecord(evalRecord, url$1, headers) {
|
|
10740
10669
|
const traces = await evalRecord.getTraces();
|
|
10741
10670
|
let evalData = {
|
|
10742
10671
|
...evalRecord,
|
|
@@ -10758,8 +10687,8 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10758
10687
|
};
|
|
10759
10688
|
}
|
|
10760
10689
|
const jsonData = JSON.stringify(evalData);
|
|
10761
|
-
require_logger.
|
|
10762
|
-
const response = await require_fetch.fetchWithProxy(url, {
|
|
10690
|
+
require_logger.logger.debug(`Sending initial eval data to ${url$1} - eval ${evalRecord.id} with ${evalRecord.prompts.length} prompts ${traces.length > 0 ? `and trace data` : ""}`);
|
|
10691
|
+
const response = await require_fetch.fetchWithProxy(url$1, {
|
|
10763
10692
|
method: "POST",
|
|
10764
10693
|
headers,
|
|
10765
10694
|
body: jsonData,
|
|
@@ -10767,10 +10696,10 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10767
10696
|
});
|
|
10768
10697
|
if (!response.ok) {
|
|
10769
10698
|
const responseBody = await response.text();
|
|
10770
|
-
const errorMessage = `Failed to send initial eval data to ${url}: ${response.statusText}`;
|
|
10699
|
+
const errorMessage = `Failed to send initial eval data to ${url$1}: ${response.statusText}`;
|
|
10771
10700
|
const bodyMessage = responseBody ? `\nResponse body: ${responseBody}` : "";
|
|
10772
10701
|
const debugInfo = {
|
|
10773
|
-
url,
|
|
10702
|
+
url: url$1,
|
|
10774
10703
|
statusCode: response.status,
|
|
10775
10704
|
statusText: response.statusText,
|
|
10776
10705
|
headers: Object.keys(headers),
|
|
@@ -10778,18 +10707,18 @@ async function sendEvalRecord(evalRecord, url, headers) {
|
|
|
10778
10707
|
errorMessage,
|
|
10779
10708
|
bodyMessage
|
|
10780
10709
|
};
|
|
10781
|
-
require_logger.
|
|
10710
|
+
require_logger.logger.error(`Sharing your eval data to ${url$1} failed. Debug info: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10782
10711
|
throw new Error(`${errorMessage}${bodyMessage}`);
|
|
10783
10712
|
}
|
|
10784
10713
|
const responseJson = await response.json();
|
|
10785
|
-
if (!responseJson.id) throw new Error(`Failed to send initial eval data to ${url}: ${response.statusText} ${responseJson}`);
|
|
10714
|
+
if (!responseJson.id) throw new Error(`Failed to send initial eval data to ${url$1}: ${response.statusText} ${responseJson}`);
|
|
10786
10715
|
return responseJson.id;
|
|
10787
10716
|
}
|
|
10788
|
-
async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
10789
|
-
const targetUrl = `${url}/${evalId}/results`;
|
|
10717
|
+
async function sendChunkOfResults(chunk, url$2, evalId, headers) {
|
|
10718
|
+
const targetUrl = `${url$2}/${evalId}/results`;
|
|
10790
10719
|
const stringifiedChunk = JSON.stringify(chunk);
|
|
10791
10720
|
const chunkSizeBytes = Buffer.byteLength(stringifiedChunk, "utf8");
|
|
10792
|
-
require_logger.
|
|
10721
|
+
require_logger.logger.debug(`Sending chunk of ${chunk.length} results (${(chunkSizeBytes / 1024 / 1024).toFixed(2)} MB) to ${targetUrl}`);
|
|
10793
10722
|
try {
|
|
10794
10723
|
const response = await require_fetch.fetchWithProxy(targetUrl, {
|
|
10795
10724
|
method: "POST",
|
|
@@ -10809,7 +10738,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10809
10738
|
evalId,
|
|
10810
10739
|
responseBody: responseBody.length > 500 ? `${responseBody.slice(0, 500)}...` : responseBody
|
|
10811
10740
|
};
|
|
10812
|
-
require_logger.
|
|
10741
|
+
require_logger.logger.debug(`Chunk send failed: ${JSON.stringify(debugInfo, null, 2)}`);
|
|
10813
10742
|
if (response.status === 413) return {
|
|
10814
10743
|
success: false,
|
|
10815
10744
|
errorType: "PAYLOAD_TOO_LARGE",
|
|
@@ -10824,7 +10753,7 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10824
10753
|
return { success: true };
|
|
10825
10754
|
} catch (error) {
|
|
10826
10755
|
if (error instanceof TypeError && error.message === "fetch failed") {
|
|
10827
|
-
require_logger.
|
|
10756
|
+
require_logger.logger.debug(`Network timeout/failure for chunk of ${chunk.length} results`);
|
|
10828
10757
|
return {
|
|
10829
10758
|
success: false,
|
|
10830
10759
|
errorType: "NETWORK_TIMEOUT",
|
|
@@ -10842,11 +10771,11 @@ async function sendChunkOfResults(chunk, url, evalId, headers) {
|
|
|
10842
10771
|
* Attempts to send a chunk of results, splitting it in half on retryable failures.
|
|
10843
10772
|
* Uses recursive splitting to handle chunks that are too large.
|
|
10844
10773
|
*/
|
|
10845
|
-
async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgress, depth = 0, maxDepth) {
|
|
10774
|
+
async function sendChunkWithRetry(chunk, url$3, evalId, headers, config, onProgress, depth = 0, maxDepth) {
|
|
10846
10775
|
const effectiveMaxDepth = maxDepth ?? Math.ceil(Math.log2(chunk.length / config.minResultsPerChunk)) + 1;
|
|
10847
10776
|
if (depth > effectiveMaxDepth) throw new Error(`Maximum retry depth exceeded. Cannot send chunk of ${chunk.length} results.`);
|
|
10848
10777
|
if (chunk.length === 0) return 0;
|
|
10849
|
-
const result = await sendChunkOfResults(chunk, url, evalId, headers);
|
|
10778
|
+
const result = await sendChunkOfResults(chunk, url$3, evalId, headers);
|
|
10850
10779
|
if (result.success) {
|
|
10851
10780
|
onProgress(chunk.length);
|
|
10852
10781
|
return chunk.length;
|
|
@@ -10856,41 +10785,41 @@ async function sendChunkWithRetry(chunk, url, evalId, headers, config, onProgres
|
|
|
10856
10785
|
const midpoint = Math.ceil(chunk.length / 2);
|
|
10857
10786
|
const firstHalf = chunk.slice(0, midpoint);
|
|
10858
10787
|
const secondHalf = chunk.slice(midpoint);
|
|
10859
|
-
require_logger.
|
|
10860
|
-
return await sendChunkWithRetry(firstHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10788
|
+
require_logger.logger.info(`Chunk of ${chunk.length} results failed (${result.errorType}). Splitting into ${firstHalf.length} + ${secondHalf.length} and retrying...`);
|
|
10789
|
+
return await sendChunkWithRetry(firstHalf, url$3, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth) + await sendChunkWithRetry(secondHalf, url$3, evalId, headers, config, onProgress, depth + 1, effectiveMaxDepth);
|
|
10861
10790
|
}
|
|
10862
10791
|
throw result.originalError ?? /* @__PURE__ */ new Error("Unknown error sending chunk");
|
|
10863
10792
|
}
|
|
10864
|
-
async function rollbackEval(url, evalId, headers) {
|
|
10865
|
-
const targetUrl = `${url}/${evalId}`;
|
|
10866
|
-
require_logger.
|
|
10793
|
+
async function rollbackEval(url$4, evalId, headers) {
|
|
10794
|
+
const targetUrl = `${url$4}/${evalId}`;
|
|
10795
|
+
require_logger.logger.debug(`Attempting to roll back eval ${evalId} at ${targetUrl}`);
|
|
10867
10796
|
try {
|
|
10868
10797
|
const response = await require_fetch.fetchWithProxy(targetUrl, {
|
|
10869
10798
|
method: "DELETE",
|
|
10870
10799
|
headers
|
|
10871
10800
|
});
|
|
10872
|
-
if (response.ok) require_logger.
|
|
10873
|
-
else require_logger.
|
|
10801
|
+
if (response.ok) require_logger.logger.debug(`Successfully rolled back eval ${evalId}`);
|
|
10802
|
+
else require_logger.logger.warn(`Rollback request returned non-OK status: ${response.statusText}`);
|
|
10874
10803
|
} catch (e) {
|
|
10875
|
-
require_logger.
|
|
10804
|
+
require_logger.logger.warn(`Failed to roll back eval ${evalId}: ${e}. You may need to manually delete this eval.`);
|
|
10876
10805
|
}
|
|
10877
10806
|
}
|
|
10878
|
-
async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
10807
|
+
async function sendChunkedResults(evalRecord, url$5, options = {}) {
|
|
10879
10808
|
const isVerbose = require_logger.isDebugEnabled();
|
|
10880
10809
|
const { silent = false } = options;
|
|
10881
|
-
require_logger.
|
|
10810
|
+
require_logger.logger.debug(`Starting chunked results upload to ${url$5}`);
|
|
10882
10811
|
await require_providers.checkCloudPermissions(evalRecord.config);
|
|
10883
10812
|
const inlineBlobs = require_extractor.isBlobStorageEnabled() && require_logger.getEnvBool("PROMPTFOO_SHARE_INLINE_BLOBS", !require_fetch.cloudConfig.isEnabled());
|
|
10884
10813
|
const inlineCache = inlineBlobs ? createBlobInlineCache() : null;
|
|
10885
10814
|
let sampleResults = (await evalRecord.fetchResultsBatched(100).next()).value ?? [];
|
|
10886
10815
|
if (sampleResults.length === 0) {
|
|
10887
|
-
require_logger.
|
|
10816
|
+
require_logger.logger.debug(`No results found`);
|
|
10888
10817
|
return null;
|
|
10889
10818
|
}
|
|
10890
10819
|
if (inlineBlobs && inlineCache) sampleResults = await inlineBlobRefsForShare(sampleResults, inlineCache);
|
|
10891
|
-
require_logger.
|
|
10820
|
+
require_logger.logger.debug(`Loaded ${sampleResults.length} sample results to determine chunk size`);
|
|
10892
10821
|
const largestSize = findLargestResultSize(sampleResults);
|
|
10893
|
-
require_logger.
|
|
10822
|
+
require_logger.logger.debug(`Largest result size from sample: ${largestSize} bytes`);
|
|
10894
10823
|
const TARGET_CHUNK_SIZE = .9 * 1024 * 1024;
|
|
10895
10824
|
const envChunkSize = require_logger.getEnvInt("PROMPTFOO_SHARE_CHUNK_SIZE");
|
|
10896
10825
|
const calculatedChunkSize = Math.max(1, Math.floor(TARGET_CHUNK_SIZE / largestSize));
|
|
@@ -10899,11 +10828,11 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10899
10828
|
minResultsPerChunk: 1,
|
|
10900
10829
|
maxResultsPerChunk: resultsPerChunk
|
|
10901
10830
|
};
|
|
10902
|
-
require_logger.
|
|
10831
|
+
require_logger.logger.debug(`Chunk config: ${JSON.stringify(chunkConfig)}`);
|
|
10903
10832
|
const headers = { "Content-Type": "application/json" };
|
|
10904
10833
|
if (require_fetch.cloudConfig.isEnabled()) headers["Authorization"] = `Bearer ${require_fetch.cloudConfig.getApiKey()}`;
|
|
10905
10834
|
const totalResults = await evalRecord.getTotalResultRowCount();
|
|
10906
|
-
require_logger.
|
|
10835
|
+
require_logger.logger.debug(`Total results to share: ${totalResults}`);
|
|
10907
10836
|
let progressBar = null;
|
|
10908
10837
|
if (!isVerbose && !require_logger.isCI() && !silent) {
|
|
10909
10838
|
progressBar = new cli_progress.default.SingleBar({
|
|
@@ -10914,13 +10843,13 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10914
10843
|
}
|
|
10915
10844
|
let evalId;
|
|
10916
10845
|
try {
|
|
10917
|
-
evalId = await sendEvalRecord(evalRecord, url, headers);
|
|
10918
|
-
require_logger.
|
|
10846
|
+
evalId = await sendEvalRecord(evalRecord, url$5, headers);
|
|
10847
|
+
require_logger.logger.debug(`Initial eval data sent successfully - ${evalId}`);
|
|
10919
10848
|
let totalSent = 0;
|
|
10920
10849
|
const onProgress = (sentCount) => {
|
|
10921
10850
|
totalSent += sentCount;
|
|
10922
10851
|
if (progressBar) progressBar.update(totalSent);
|
|
10923
|
-
else require_logger.
|
|
10852
|
+
else require_logger.logger.info(`Progress: ${totalSent}/${totalResults} results shared (${Math.round(totalSent / totalResults * 100)}%)`);
|
|
10924
10853
|
};
|
|
10925
10854
|
let currentChunk = [];
|
|
10926
10855
|
let chunkNumber = 0;
|
|
@@ -10928,24 +10857,24 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10928
10857
|
currentChunk.push(result);
|
|
10929
10858
|
if (currentChunk.length >= resultsPerChunk) {
|
|
10930
10859
|
chunkNumber++;
|
|
10931
|
-
require_logger.
|
|
10932
|
-
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10860
|
+
require_logger.logger.debug(`Sending chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10861
|
+
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url$5, evalId, headers, chunkConfig, onProgress);
|
|
10933
10862
|
currentChunk = [];
|
|
10934
10863
|
}
|
|
10935
10864
|
}
|
|
10936
10865
|
if (currentChunk.length > 0) {
|
|
10937
10866
|
chunkNumber++;
|
|
10938
|
-
require_logger.
|
|
10939
|
-
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url, evalId, headers, chunkConfig, onProgress);
|
|
10867
|
+
require_logger.logger.debug(`Sending final chunk ${chunkNumber} with ${currentChunk.length} results`);
|
|
10868
|
+
await sendChunkWithRetry(inlineBlobs && inlineCache ? await inlineBlobRefsForShare(currentChunk, inlineCache) : currentChunk, url$5, evalId, headers, chunkConfig, onProgress);
|
|
10940
10869
|
}
|
|
10941
|
-
require_logger.
|
|
10870
|
+
require_logger.logger.debug(`Sharing complete. Total chunks sent: ${chunkNumber}, Total results: ${totalSent}`);
|
|
10942
10871
|
return evalId;
|
|
10943
10872
|
} catch (e) {
|
|
10944
10873
|
if (progressBar) progressBar.stop();
|
|
10945
|
-
require_logger.
|
|
10874
|
+
require_logger.logger.error(`Upload failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
10946
10875
|
if (evalId) {
|
|
10947
|
-
require_logger.
|
|
10948
|
-
await rollbackEval(url, evalId, headers);
|
|
10876
|
+
require_logger.logger.info(`Upload failed, rolling back...`);
|
|
10877
|
+
await rollbackEval(url$5, evalId, headers);
|
|
10949
10878
|
}
|
|
10950
10879
|
return null;
|
|
10951
10880
|
} finally {
|
|
@@ -10965,12 +10894,12 @@ async function sendChunkedResults(evalRecord, url, options = {}) {
|
|
|
10965
10894
|
*/
|
|
10966
10895
|
function stripAuthFromUrl(urlString) {
|
|
10967
10896
|
try {
|
|
10968
|
-
const url = new url.URL(urlString);
|
|
10969
|
-
url.username = "";
|
|
10970
|
-
url.password = "";
|
|
10971
|
-
return url.toString();
|
|
10897
|
+
const url$6 = new url.URL(urlString);
|
|
10898
|
+
url$6.username = "";
|
|
10899
|
+
url$6.password = "";
|
|
10900
|
+
return url$6.toString();
|
|
10972
10901
|
} catch {
|
|
10973
|
-
require_logger.
|
|
10902
|
+
require_logger.logger.warn("Failed to parse URL, returning original");
|
|
10974
10903
|
return urlString;
|
|
10975
10904
|
}
|
|
10976
10905
|
}
|
|
@@ -11013,26 +10942,25 @@ async function getShareableUrl(eval_, remoteEvalId, showAuth = false) {
|
|
|
11013
10942
|
async function createShareableUrl(evalRecord, options = {}) {
|
|
11014
10943
|
const { silent = false, showAuth = false } = options;
|
|
11015
10944
|
if (require_logger.getEnvBool("PROMPTFOO_DISABLE_SHARING")) {
|
|
11016
|
-
require_logger.
|
|
10945
|
+
require_logger.logger.debug("Sharing is explicitly disabled, returning null");
|
|
11017
10946
|
return null;
|
|
11018
10947
|
}
|
|
11019
10948
|
if (!silent) {
|
|
11020
10949
|
const orgContext = await require_providers.getOrgContext();
|
|
11021
10950
|
if (orgContext) {
|
|
11022
10951
|
const teamSuffix = orgContext.teamName ? ` > ${orgContext.teamName}` : "";
|
|
11023
|
-
require_logger.
|
|
10952
|
+
require_logger.logger.info(`${chalk.default.dim("Sharing to:")} ${chalk.default.cyan(orgContext.organizationName)}${teamSuffix}`);
|
|
11024
10953
|
}
|
|
11025
10954
|
}
|
|
11026
10955
|
await handleEmailCollection(evalRecord);
|
|
11027
|
-
const { url } = await getApiConfig(evalRecord);
|
|
10956
|
+
const { url: url$7 } = await getApiConfig(evalRecord);
|
|
11028
10957
|
const canUseNewResults = require_fetch.cloudConfig.isEnabled();
|
|
11029
|
-
require_logger.
|
|
11030
|
-
const evalId = await sendChunkedResults(evalRecord, url, { silent });
|
|
10958
|
+
require_logger.logger.debug(`Sharing with ${url$7} canUseNewResults: ${canUseNewResults} Use old results: ${evalRecord.useOldResults()}`);
|
|
10959
|
+
const evalId = await sendChunkedResults(evalRecord, url$7, { silent });
|
|
11031
10960
|
if (!evalId) return null;
|
|
11032
|
-
require_logger.
|
|
10961
|
+
require_logger.logger.debug(`New eval ID on remote instance: ${evalId}`);
|
|
11033
10962
|
return getShareableUrl(evalRecord, evalId, showAuth);
|
|
11034
10963
|
}
|
|
11035
|
-
|
|
11036
10964
|
//#endregion
|
|
11037
10965
|
//#region src/table.ts
|
|
11038
10966
|
function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
@@ -11053,7 +10981,6 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
11053
10981
|
})]);
|
|
11054
10982
|
return table.toString();
|
|
11055
10983
|
}
|
|
11056
|
-
|
|
11057
10984
|
//#endregion
|
|
11058
10985
|
//#region src/util/config/default.ts
|
|
11059
10986
|
/**
|
|
@@ -11093,7 +11020,6 @@ async function loadDefaultConfig(dir, configName = "promptfooconfig") {
|
|
|
11093
11020
|
function clearConfigCache() {
|
|
11094
11021
|
configCache.clear();
|
|
11095
11022
|
}
|
|
11096
|
-
|
|
11097
11023
|
//#endregion
|
|
11098
11024
|
//#region src/util/sharing.ts
|
|
11099
11025
|
/**
|
|
@@ -11121,7 +11047,6 @@ function shouldShareResults(opts) {
|
|
|
11121
11047
|
const sharing = require_fetch.cloudConfig.getSharing();
|
|
11122
11048
|
return require_fetch.cloudConfig.isEnabled() && sharing !== false;
|
|
11123
11049
|
}
|
|
11124
|
-
|
|
11125
11050
|
//#endregion
|
|
11126
11051
|
//#region src/util/formatDuration.ts
|
|
11127
11052
|
/**
|
|
@@ -11141,7 +11066,6 @@ function formatDuration(seconds) {
|
|
|
11141
11066
|
result += `${remainingSeconds}s`;
|
|
11142
11067
|
return result;
|
|
11143
11068
|
}
|
|
11144
|
-
|
|
11145
11069
|
//#endregion
|
|
11146
11070
|
//#region src/commands/eval/summary.ts
|
|
11147
11071
|
/**
|
|
@@ -11293,7 +11217,6 @@ function generateEvalSummary(params) {
|
|
|
11293
11217
|
lines.push("");
|
|
11294
11218
|
return lines;
|
|
11295
11219
|
}
|
|
11296
|
-
|
|
11297
11220
|
//#endregion
|
|
11298
11221
|
//#region src/commands/retry.ts
|
|
11299
11222
|
/**
|
|
@@ -11309,7 +11232,7 @@ async function getErrorResultIds(evalId) {
|
|
|
11309
11232
|
async function deleteErrorResults(resultIds) {
|
|
11310
11233
|
if (resultIds.length === 0) return;
|
|
11311
11234
|
await require_tables.getDb().delete(require_tables.evalResultsTable).where((0, drizzle_orm.inArray)(require_tables.evalResultsTable.id, resultIds));
|
|
11312
|
-
require_logger.
|
|
11235
|
+
require_logger.logger.debug(`Deleted ${resultIds.length} error results from database`);
|
|
11313
11236
|
}
|
|
11314
11237
|
const RECALCULATE_BATCH_SIZE = 1e3;
|
|
11315
11238
|
/**
|
|
@@ -11317,7 +11240,7 @@ const RECALCULATE_BATCH_SIZE = 1e3;
|
|
|
11317
11240
|
* Uses streaming batched iteration to avoid OOM with large evaluations (40K+ results).
|
|
11318
11241
|
*/
|
|
11319
11242
|
async function recalculatePromptMetrics(evalRecord) {
|
|
11320
|
-
require_logger.
|
|
11243
|
+
require_logger.logger.debug("Recalculating prompt metrics after deleting ERROR results");
|
|
11321
11244
|
const startTime = Date.now();
|
|
11322
11245
|
let batchNumber = 0;
|
|
11323
11246
|
let totalProcessed = 0;
|
|
@@ -11339,12 +11262,12 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11339
11262
|
try {
|
|
11340
11263
|
for await (const batch of evalRecord.fetchResultsBatched(RECALCULATE_BATCH_SIZE)) {
|
|
11341
11264
|
batchNumber++;
|
|
11342
|
-
require_logger.
|
|
11265
|
+
require_logger.logger.debug(`Processing batch ${batchNumber} with ${batch.length} results`);
|
|
11343
11266
|
for (const result of batch) {
|
|
11344
11267
|
currentResultId = result.id;
|
|
11345
11268
|
const metrics = promptMetricsMap.get(result.promptIdx);
|
|
11346
11269
|
if (!metrics) {
|
|
11347
|
-
require_logger.
|
|
11270
|
+
require_logger.logger.debug(`Skipping result with invalid promptIdx: ${result.promptIdx}`, {
|
|
11348
11271
|
resultId: result.id,
|
|
11349
11272
|
evalId: evalRecord.id
|
|
11350
11273
|
});
|
|
@@ -11378,7 +11301,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11378
11301
|
totalProcessed += batch.length;
|
|
11379
11302
|
}
|
|
11380
11303
|
} catch (error) {
|
|
11381
|
-
require_logger.
|
|
11304
|
+
require_logger.logger.error("Error during batched metrics recalculation", {
|
|
11382
11305
|
phase: "calculation",
|
|
11383
11306
|
batchNumber,
|
|
11384
11307
|
totalProcessed,
|
|
@@ -11392,7 +11315,7 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11392
11315
|
if (evalRecord.persisted) try {
|
|
11393
11316
|
await evalRecord.addPrompts(evalRecord.prompts);
|
|
11394
11317
|
} catch (error) {
|
|
11395
|
-
require_logger.
|
|
11318
|
+
require_logger.logger.error("Error saving recalculated prompt metrics", {
|
|
11396
11319
|
phase: "save",
|
|
11397
11320
|
evalId: evalRecord.id,
|
|
11398
11321
|
promptCount: evalRecord.prompts.length,
|
|
@@ -11401,19 +11324,18 @@ async function recalculatePromptMetrics(evalRecord) {
|
|
|
11401
11324
|
throw error;
|
|
11402
11325
|
}
|
|
11403
11326
|
const durationMs = Date.now() - startTime;
|
|
11404
|
-
require_logger.
|
|
11327
|
+
require_logger.logger.debug("Prompt metrics recalculation completed", {
|
|
11405
11328
|
totalBatches: batchNumber,
|
|
11406
11329
|
totalResults: totalProcessed,
|
|
11407
11330
|
durationMs
|
|
11408
11331
|
});
|
|
11409
11332
|
}
|
|
11410
|
-
|
|
11411
11333
|
//#endregion
|
|
11412
11334
|
//#region src/commands/share.ts
|
|
11413
11335
|
function notCloudEnabledShareInstructions() {
|
|
11414
11336
|
const cloudUrl = require_fetch.getDefaultShareViewBaseUrl();
|
|
11415
11337
|
const welcomeUrl = `${cloudUrl}/welcome`;
|
|
11416
|
-
require_logger.
|
|
11338
|
+
require_logger.logger.info(dedent.default`
|
|
11417
11339
|
|
|
11418
11340
|
» You need to have a cloud account to securely share your results.
|
|
11419
11341
|
|
|
@@ -11422,10 +11344,7 @@ function notCloudEnabledShareInstructions() {
|
|
|
11422
11344
|
3. Run ${chalk.default.greenBright.bold("promptfoo share")}
|
|
11423
11345
|
`);
|
|
11424
11346
|
}
|
|
11425
|
-
|
|
11426
|
-
//#endregion
|
|
11427
|
-
//#region src/commands/eval.ts
|
|
11428
|
-
const EvalCommandSchema = require_types.CommandLineOptionsSchema.extend({
|
|
11347
|
+
require_types.CommandLineOptionsSchema.extend({
|
|
11429
11348
|
help: zod.z.boolean().optional(),
|
|
11430
11349
|
interactiveProviders: zod.z.boolean().optional(),
|
|
11431
11350
|
remote: zod.z.boolean().optional(),
|
|
@@ -11435,7 +11354,7 @@ const EvalCommandSchema = require_types.CommandLineOptionsSchema.extend({
|
|
|
11435
11354
|
resume: zod.z.union([zod.z.string(), zod.z.boolean()]).optional()
|
|
11436
11355
|
}).partial();
|
|
11437
11356
|
function showRedteamProviderLabelMissingWarning(testSuite) {
|
|
11438
|
-
if (testSuite.providers.some((p) => !p.label)) require_logger.
|
|
11357
|
+
if (testSuite.providers.some((p) => !p.label)) require_logger.logger.warn(dedent.default`
|
|
11439
11358
|
${chalk.default.bold.yellow("Warning")}: Your target (provider) does not have a label specified.
|
|
11440
11359
|
|
|
11441
11360
|
Labels are used to uniquely identify redteam targets. Please set a meaningful and unique label (e.g., 'helpdesk-search-agent') for your targets/providers in your redteam config.
|
|
@@ -11466,7 +11385,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11466
11385
|
}
|
|
11467
11386
|
const runEvaluation = async (initialization) => {
|
|
11468
11387
|
const startTime = Date.now();
|
|
11469
|
-
require_telemetry.
|
|
11388
|
+
require_telemetry.telemetry.record("command_used", {
|
|
11470
11389
|
name: "eval - started",
|
|
11471
11390
|
watch: Boolean(cmdObj.watch),
|
|
11472
11391
|
...Boolean(config?.redteam) && { isRedteam: true }
|
|
@@ -11481,19 +11400,19 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11481
11400
|
for (const configPath of configPaths) if (fs.default.existsSync(configPath) && fs.default.statSync(configPath).isDirectory()) {
|
|
11482
11401
|
const { defaultConfig: dirConfig, defaultConfigPath: newConfigPath } = await loadDefaultConfig(configPath);
|
|
11483
11402
|
if (newConfigPath) {
|
|
11484
|
-
cmdObj.config = cmdObj.config.filter((path) => path !== configPath);
|
|
11403
|
+
cmdObj.config = cmdObj.config.filter((path$6) => path$6 !== configPath);
|
|
11485
11404
|
cmdObj.config.push(newConfigPath);
|
|
11486
11405
|
defaultConfig = {
|
|
11487
11406
|
...defaultConfig,
|
|
11488
11407
|
...dirConfig
|
|
11489
11408
|
};
|
|
11490
|
-
} else require_logger.
|
|
11409
|
+
} else require_logger.logger.warn(`No configuration file found in directory: ${configPath}. Looked for promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}. Run "${promptfooCommand("init")}" or pass --config path/to/promptfooconfig.yaml.`);
|
|
11491
11410
|
}
|
|
11492
11411
|
}
|
|
11493
11412
|
const resumeRaw = cmdObj.resume;
|
|
11494
11413
|
const retryErrors = cmdObj.retryErrors;
|
|
11495
11414
|
if (resumeRaw && retryErrors) {
|
|
11496
|
-
require_logger.
|
|
11415
|
+
require_logger.logger.error(chalk.default.red("Cannot use --resume and --retry-errors together. Please use one or the other."));
|
|
11497
11416
|
process.exitCode = 1;
|
|
11498
11417
|
return new Eval({}, { persisted: false });
|
|
11499
11418
|
}
|
|
@@ -11501,45 +11420,45 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11501
11420
|
const resumeId = resumeRaw === true || resumeRaw === void 0 ? "latest" : resumeRaw;
|
|
11502
11421
|
if (resumeRaw) {
|
|
11503
11422
|
if (cmdObj.write === false) {
|
|
11504
|
-
require_logger.
|
|
11423
|
+
require_logger.logger.error(chalk.default.red("Cannot use --resume with --no-write. Resume functionality requires database persistence."));
|
|
11505
11424
|
process.exitCode = 1;
|
|
11506
11425
|
return new Eval({}, { persisted: false });
|
|
11507
11426
|
}
|
|
11508
11427
|
resumeEval = resumeId === "latest" ? await Eval.latest() : await Eval.findById(resumeId);
|
|
11509
11428
|
if (!resumeEval) {
|
|
11510
|
-
require_logger.
|
|
11429
|
+
require_logger.logger.error(`Could not find evaluation to resume: ${resumeId}`);
|
|
11511
11430
|
process.exitCode = 1;
|
|
11512
11431
|
return new Eval({}, { persisted: false });
|
|
11513
11432
|
}
|
|
11514
|
-
require_logger.
|
|
11433
|
+
require_logger.logger.info(chalk.default.cyan(`Resuming evaluation ${resumeEval.id}...`));
|
|
11515
11434
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11516
11435
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
11517
11436
|
raw: p.raw,
|
|
11518
11437
|
label: p.label,
|
|
11519
11438
|
config: p.config
|
|
11520
11439
|
}));
|
|
11521
|
-
require_logger.
|
|
11440
|
+
require_logger.state.resume = true;
|
|
11522
11441
|
} else if (retryErrors) {
|
|
11523
11442
|
if (cmdObj.write === false) {
|
|
11524
|
-
require_logger.
|
|
11443
|
+
require_logger.logger.error(chalk.default.red("Cannot use --retry-errors with --no-write. Retry functionality requires database persistence."));
|
|
11525
11444
|
process.exitCode = 1;
|
|
11526
11445
|
return new Eval({}, { persisted: false });
|
|
11527
11446
|
}
|
|
11528
|
-
require_logger.
|
|
11447
|
+
require_logger.logger.info("🔄 Retrying ERROR results from latest evaluation...");
|
|
11529
11448
|
const latestEval = await Eval.latest();
|
|
11530
11449
|
if (!latestEval) {
|
|
11531
|
-
require_logger.
|
|
11450
|
+
require_logger.logger.error("No previous evaluation found to retry errors from");
|
|
11532
11451
|
process.exitCode = 1;
|
|
11533
11452
|
return new Eval({}, { persisted: false });
|
|
11534
11453
|
}
|
|
11535
11454
|
const errorResultIds = await getErrorResultIds(latestEval.id);
|
|
11536
11455
|
if (errorResultIds.length === 0) {
|
|
11537
|
-
require_logger.
|
|
11456
|
+
require_logger.logger.info("✅ No ERROR results found in the latest evaluation");
|
|
11538
11457
|
return latestEval;
|
|
11539
11458
|
}
|
|
11540
|
-
require_logger.
|
|
11541
|
-
require_logger.
|
|
11542
|
-
require_logger.
|
|
11459
|
+
require_logger.logger.info(`Found ${errorResultIds.length} ERROR results to retry`);
|
|
11460
|
+
require_logger.state._retryErrorResultIds = errorResultIds;
|
|
11461
|
+
require_logger.logger.info(`🔄 Running evaluation with resume mode to retry ${errorResultIds.length} test cases...`);
|
|
11543
11462
|
resumeEval = latestEval;
|
|
11544
11463
|
({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs({}, resumeEval.config));
|
|
11545
11464
|
if (Array.isArray(resumeEval.prompts) && resumeEval.prompts.length > 0) testSuite.prompts = resumeEval.prompts.map((p) => ({
|
|
@@ -11547,20 +11466,20 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11547
11466
|
label: p.label,
|
|
11548
11467
|
config: p.config
|
|
11549
11468
|
}));
|
|
11550
|
-
require_logger.
|
|
11551
|
-
require_logger.
|
|
11469
|
+
require_logger.state.resume = true;
|
|
11470
|
+
require_logger.state.retryMode = true;
|
|
11552
11471
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
11553
11472
|
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
11554
|
-
require_logger.
|
|
11473
|
+
require_logger.logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
11555
11474
|
require_util.setupEnv(commandLineOptions.envPath);
|
|
11556
11475
|
}
|
|
11557
|
-
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) require_logger.
|
|
11476
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
11558
11477
|
Warning: Config file has a redteam section but no test cases.
|
|
11559
11478
|
Did you mean to run ${chalk.default.bold("promptfoo redteam generate")} instead?
|
|
11560
11479
|
`));
|
|
11561
11480
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
11562
11481
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
11563
|
-
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) require_telemetry.
|
|
11482
|
+
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) require_telemetry.telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
11564
11483
|
}
|
|
11565
11484
|
if (config.evaluateOptions) evaluateOptions = {
|
|
11566
11485
|
...evaluateOptions,
|
|
@@ -11574,25 +11493,25 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11574
11493
|
const persisted = resumeEval?.runtimeOptions || config.evaluateOptions || {};
|
|
11575
11494
|
repeat = Number.isSafeInteger(persisted.repeat || 0) && persisted.repeat > 0 ? persisted.repeat : 1;
|
|
11576
11495
|
cache = persisted.cache ?? true;
|
|
11577
|
-
maxConcurrency = persisted.maxConcurrency ??
|
|
11496
|
+
maxConcurrency = persisted.maxConcurrency ?? 4;
|
|
11578
11497
|
delay = persisted.delay ?? 0;
|
|
11579
11498
|
} else {
|
|
11580
11499
|
const iterations = cmdObj.repeat ?? commandLineOptions?.repeat ?? evaluateOptions.repeat ?? NaN;
|
|
11581
11500
|
repeat = Number.isSafeInteger(iterations) && iterations > 0 ? iterations : 1;
|
|
11582
11501
|
cache = cmdObj.cache ?? commandLineOptions?.cache ?? evaluateOptions.cache ?? true;
|
|
11583
|
-
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ??
|
|
11502
|
+
maxConcurrency = cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency ?? 4;
|
|
11584
11503
|
delay = cmdObj.delay ?? commandLineOptions?.delay ?? evaluateOptions.delay ?? 0;
|
|
11585
11504
|
}
|
|
11586
11505
|
if (cache === false || repeat > 1) {
|
|
11587
|
-
require_logger.
|
|
11506
|
+
require_logger.logger.info("Cache is disabled.");
|
|
11588
11507
|
require_cache.disableCache();
|
|
11589
11508
|
}
|
|
11590
11509
|
const explicitMaxConcurrency = resumeRaw ? (resumeEval?.runtimeOptions)?.maxConcurrency ?? cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency : cmdObj.maxConcurrency ?? commandLineOptions?.maxConcurrency ?? evaluateOptions.maxConcurrency;
|
|
11591
11510
|
if (delay > 0) {
|
|
11592
11511
|
maxConcurrency = 1;
|
|
11593
|
-
require_logger.
|
|
11594
|
-
require_logger.
|
|
11595
|
-
} else if (explicitMaxConcurrency !== void 0) require_logger.
|
|
11512
|
+
require_logger.state.maxConcurrency = 1;
|
|
11513
|
+
require_logger.logger.info(`Running at concurrency=1 because ${delay}ms delay was requested between API calls`);
|
|
11514
|
+
} else if (explicitMaxConcurrency !== void 0) require_logger.state.maxConcurrency = explicitMaxConcurrency;
|
|
11596
11515
|
if (!resumeEval) {
|
|
11597
11516
|
const filterOptions = {
|
|
11598
11517
|
failing: cmdObj.filterFailing,
|
|
@@ -11609,17 +11528,17 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11609
11528
|
let hasValidEmail = false;
|
|
11610
11529
|
while (!hasValidEmail) {
|
|
11611
11530
|
const { emailNeedsValidation } = await require_accounts.promptForEmailUnverified();
|
|
11612
|
-
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) ===
|
|
11531
|
+
hasValidEmail = await require_accounts.checkEmailStatusAndMaybeExit({ validate: emailNeedsValidation }) === "ok";
|
|
11613
11532
|
}
|
|
11614
11533
|
}
|
|
11615
11534
|
if (!resumeEval) testSuite.providers = filterProviders(testSuite.providers, cmdObj.filterProviders || cmdObj.filterTargets);
|
|
11616
11535
|
const missingApiKeys = require_util.checkProviderApiKeys(testSuite.providers);
|
|
11617
11536
|
if (missingApiKeys.size > 0) {
|
|
11618
|
-
for (const [envVar, providerIds] of missingApiKeys) require_logger.
|
|
11619
|
-
require_logger.
|
|
11620
|
-
require_logger.
|
|
11621
|
-
for (const envVar of missingApiKeys.keys()) require_logger.
|
|
11622
|
-
require_logger.
|
|
11537
|
+
for (const [envVar, providerIds] of missingApiKeys) require_logger.logger.error(chalk.default.red(` ✗ Missing ${envVar} (${providerIds.join(", ")})`));
|
|
11538
|
+
require_logger.logger.error("");
|
|
11539
|
+
require_logger.logger.error(`To fix, set the environment variable or use ${chalk.default.bold("--env-file")}:`);
|
|
11540
|
+
for (const envVar of missingApiKeys.keys()) require_logger.logger.error(` export ${envVar}=your-api-key-here`);
|
|
11541
|
+
require_logger.logger.error("");
|
|
11623
11542
|
process.exitCode = 1;
|
|
11624
11543
|
return new Eval({}, { persisted: false });
|
|
11625
11544
|
}
|
|
@@ -11636,12 +11555,12 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11636
11555
|
if (typeof testSuite.defaultTest === "string") testSuite.defaultTest = {};
|
|
11637
11556
|
testSuite.defaultTest = testSuite.defaultTest || {};
|
|
11638
11557
|
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
|
|
11639
|
-
testSuite.defaultTest.options.provider = await require_providers.loadApiProvider(cmdObj.grader, { basePath: require_logger.
|
|
11640
|
-
if (require_logger.
|
|
11641
|
-
if (typeof require_logger.
|
|
11642
|
-
require_logger.
|
|
11643
|
-
require_logger.
|
|
11644
|
-
require_logger.
|
|
11558
|
+
testSuite.defaultTest.options.provider = await require_providers.loadApiProvider(cmdObj.grader, { basePath: require_logger.state.basePath });
|
|
11559
|
+
if (require_logger.state.config) {
|
|
11560
|
+
if (typeof require_logger.state.config.defaultTest === "string") require_logger.state.config.defaultTest = {};
|
|
11561
|
+
require_logger.state.config.defaultTest = require_logger.state.config.defaultTest || {};
|
|
11562
|
+
require_logger.state.config.defaultTest.options = require_logger.state.config.defaultTest.options || {};
|
|
11563
|
+
require_logger.state.config.defaultTest.options.provider = testSuite.defaultTest.options.provider;
|
|
11645
11564
|
}
|
|
11646
11565
|
}
|
|
11647
11566
|
if (!resumeEval && cmdObj.var) {
|
|
@@ -11659,7 +11578,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11659
11578
|
}
|
|
11660
11579
|
for (const scenario of testSuite.scenarios || []) if (scenario.tests) scenario.tests = await require_util.maybeLoadFromExternalFile(scenario.tests);
|
|
11661
11580
|
const testSuiteSchema = require_types.TestSuiteSchema.safeParse(testSuite);
|
|
11662
|
-
if (!testSuiteSchema.success) require_logger.
|
|
11581
|
+
if (!testSuiteSchema.success) require_logger.logger.warn(chalk.default.yellow(dedent.default`
|
|
11663
11582
|
TestSuite Schema Validation Error:
|
|
11664
11583
|
|
|
11665
11584
|
${zod.z.prettifyError(testSuiteSchema.error)}
|
|
@@ -11692,13 +11611,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11692
11611
|
clearTimeout(forceExitTimeout);
|
|
11693
11612
|
forceExitTimeout = void 0;
|
|
11694
11613
|
}
|
|
11695
|
-
require_logger.
|
|
11614
|
+
require_logger.logger.warn("Force exiting...");
|
|
11696
11615
|
process.exit(130);
|
|
11697
11616
|
}
|
|
11698
|
-
require_logger.
|
|
11617
|
+
require_logger.logger.info(chalk.default.yellow("Pausing evaluation... Press Ctrl+C again to force exit."));
|
|
11699
11618
|
abortController.abort();
|
|
11700
11619
|
forceExitTimeout = setTimeout(() => {
|
|
11701
|
-
require_logger.
|
|
11620
|
+
require_logger.logger.warn("Evaluation shutdown timed out, force exiting...");
|
|
11702
11621
|
process.exit(130);
|
|
11703
11622
|
}, 1e4).unref();
|
|
11704
11623
|
};
|
|
@@ -11712,27 +11631,27 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11712
11631
|
abortSignal: evaluateOptions.abortSignal,
|
|
11713
11632
|
isRedteam: Boolean(config.redteam)
|
|
11714
11633
|
});
|
|
11715
|
-
if (retryErrors && require_logger.
|
|
11716
|
-
const errorResultIds = require_logger.
|
|
11634
|
+
if (retryErrors && require_logger.state._retryErrorResultIds && !paused) {
|
|
11635
|
+
const errorResultIds = require_logger.state._retryErrorResultIds;
|
|
11717
11636
|
try {
|
|
11718
11637
|
await deleteErrorResults(errorResultIds);
|
|
11719
11638
|
await recalculatePromptMetrics(ret);
|
|
11720
|
-
require_logger.
|
|
11639
|
+
require_logger.logger.debug(`Cleaned up ${errorResultIds.length} old ERROR results after successful retry`);
|
|
11721
11640
|
} catch (cleanupError) {
|
|
11722
|
-
require_logger.
|
|
11641
|
+
require_logger.logger.warn("Post-retry cleanup had issues. Retry results are saved.", { error: cleanupError });
|
|
11723
11642
|
} finally {
|
|
11724
|
-
delete require_logger.
|
|
11725
|
-
require_logger.
|
|
11643
|
+
delete require_logger.state._retryErrorResultIds;
|
|
11644
|
+
require_logger.state.retryMode = false;
|
|
11726
11645
|
}
|
|
11727
11646
|
}
|
|
11728
11647
|
} finally {
|
|
11729
11648
|
cleanupHandler();
|
|
11730
11649
|
}
|
|
11731
|
-
require_logger.
|
|
11650
|
+
require_logger.state.resume = false;
|
|
11732
11651
|
if (paused && cmdObj.write !== false) {
|
|
11733
11652
|
require_util.printBorder();
|
|
11734
|
-
require_logger.
|
|
11735
|
-
require_logger.
|
|
11653
|
+
require_logger.logger.info(`${chalk.default.yellow("⏸")} Evaluation paused. ID: ${chalk.default.cyan(evalRecord.id)}`);
|
|
11654
|
+
require_logger.logger.info(`» Resume with: ${chalk.default.green.bold("promptfoo eval --resume " + evalRecord.id)}`);
|
|
11736
11655
|
require_util.printBorder();
|
|
11737
11656
|
return ret;
|
|
11738
11657
|
}
|
|
@@ -11745,8 +11664,8 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11745
11664
|
});
|
|
11746
11665
|
const hasExplicitDisable = cmdObj.share === false || cmdObj.noShare === true || require_logger.getEnvBool("PROMPTFOO_DISABLE_SHARING");
|
|
11747
11666
|
const canShareEval = isSharingEnabled(evalRecord);
|
|
11748
|
-
require_logger.
|
|
11749
|
-
require_logger.
|
|
11667
|
+
require_logger.logger.debug(`Wants to share: ${wantsToShare}`);
|
|
11668
|
+
require_logger.logger.debug(`Can share eval: ${canShareEval}`);
|
|
11750
11669
|
const willShare = wantsToShare && canShareEval;
|
|
11751
11670
|
let sharePromise = null;
|
|
11752
11671
|
if (willShare) sharePromise = createShareableUrl(evalRecord, { silent: true });
|
|
@@ -11765,13 +11684,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11765
11684
|
if (cmdObj.table && require_logger.getLogLevel() !== "debug" && totalTests < 500) {
|
|
11766
11685
|
const table = await evalRecord.getTable();
|
|
11767
11686
|
const outputTable = generateTable(table);
|
|
11768
|
-
require_logger.
|
|
11687
|
+
require_logger.logger.info("\n" + outputTable.toString());
|
|
11769
11688
|
if (table.body.length > 25) {
|
|
11770
11689
|
const rowsLeft = table.body.length - 25;
|
|
11771
|
-
require_logger.
|
|
11690
|
+
require_logger.logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? "" : "s"} not shown ...\n`);
|
|
11772
11691
|
}
|
|
11773
|
-
} else if (failures !== 0) require_logger.
|
|
11774
|
-
if (totalTests >= 500) require_logger.
|
|
11692
|
+
} else if (failures !== 0) require_logger.logger.debug(`At least one evaluation failure occurred. This might be caused by the underlying call to the provider, or a test failure. Context: \n${JSON.stringify(evalRecord.prompts)}`);
|
|
11693
|
+
if (totalTests >= 500) require_logger.logger.info("Skipping table output because there are more than 500 tests.");
|
|
11775
11694
|
const { outputPath } = config;
|
|
11776
11695
|
const paths = (Array.isArray(outputPath) ? outputPath : [outputPath]).filter((p) => typeof p === "string" && p.length > 0 && !p.endsWith(".jsonl"));
|
|
11777
11696
|
const isRedteam = Boolean(config.redteam);
|
|
@@ -11797,13 +11716,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11797
11716
|
targetErrorStatus
|
|
11798
11717
|
});
|
|
11799
11718
|
if (cmdObj.write && wantsToShare && !canShareEval) {
|
|
11800
|
-
require_logger.
|
|
11719
|
+
require_logger.logger.info(summaryLines[0]);
|
|
11801
11720
|
notCloudEnabledShareInstructions();
|
|
11802
11721
|
for (let i = 1; i < summaryLines.length; i++) if (summaryLines[i].includes("View results:")) {
|
|
11803
11722
|
while (i < summaryLines.length && !summaryLines[i].includes("Total Tokens:")) i++;
|
|
11804
11723
|
i--;
|
|
11805
|
-
} else require_logger.
|
|
11806
|
-
} else for (const line of summaryLines) require_logger.
|
|
11724
|
+
} else require_logger.logger.info(summaryLines[i]);
|
|
11725
|
+
} else for (const line of summaryLines) require_logger.logger.info(line);
|
|
11807
11726
|
let shareableUrl = null;
|
|
11808
11727
|
if (sharePromise != null) {
|
|
11809
11728
|
const orgContext = await require_providers.getOrgContext();
|
|
@@ -11822,24 +11741,24 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11822
11741
|
} else spinner.fail(chalk.default.red("Share failed"));
|
|
11823
11742
|
} catch (error) {
|
|
11824
11743
|
spinner.fail(chalk.default.red("Share failed"));
|
|
11825
|
-
require_logger.
|
|
11744
|
+
require_logger.logger.debug(`Share error: ${error}`);
|
|
11826
11745
|
}
|
|
11827
11746
|
} else try {
|
|
11828
11747
|
shareableUrl = await sharePromise;
|
|
11829
11748
|
if (shareableUrl) {
|
|
11830
11749
|
evalRecord.shared = true;
|
|
11831
|
-
require_logger.
|
|
11750
|
+
require_logger.logger.info(`${chalk.default.dim("»")} ${chalk.default.green("✓")} ${shareableUrl}`);
|
|
11832
11751
|
}
|
|
11833
11752
|
} catch (error) {
|
|
11834
|
-
require_logger.
|
|
11753
|
+
require_logger.logger.debug(`Share error: ${error}`);
|
|
11835
11754
|
}
|
|
11836
11755
|
}
|
|
11837
|
-
require_logger.
|
|
11756
|
+
require_logger.logger.debug(`Shareable URL: ${shareableUrl}`);
|
|
11838
11757
|
if (paths.length) {
|
|
11839
11758
|
await require_util.writeMultipleOutputs(paths, evalRecord, shareableUrl);
|
|
11840
|
-
require_logger.
|
|
11759
|
+
require_logger.logger.info(chalk.default.yellow(`Writing output to ${paths.join(", ")}`));
|
|
11841
11760
|
}
|
|
11842
|
-
require_telemetry.
|
|
11761
|
+
require_telemetry.telemetry.record("command_used", {
|
|
11843
11762
|
name: "eval",
|
|
11844
11763
|
watch: Boolean(cmdObj.watch),
|
|
11845
11764
|
duration: Math.round((Date.now() - startTime) / 1e3),
|
|
@@ -11849,7 +11768,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11849
11768
|
if (initialization) {
|
|
11850
11769
|
const configPaths = (cmdObj.config || [defaultConfigPath]).filter(Boolean);
|
|
11851
11770
|
if (!configPaths.length) {
|
|
11852
|
-
require_logger.
|
|
11771
|
+
require_logger.logger.error(`Could not locate config file(s) to watch. Pass --config path/to/promptfooconfig.yaml or run from a directory containing promptfooconfig.{${DEFAULT_CONFIG_EXTENSIONS.join(",")}}.`);
|
|
11853
11772
|
process.exitCode = 1;
|
|
11854
11773
|
return ret;
|
|
11855
11774
|
}
|
|
@@ -11877,19 +11796,19 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11877
11796
|
chokidar.default.watch(watchPaths, {
|
|
11878
11797
|
ignored: /^\./,
|
|
11879
11798
|
persistent: true
|
|
11880
|
-
}).on("change", async (path) => {
|
|
11799
|
+
}).on("change", async (path$7) => {
|
|
11881
11800
|
require_util.printBorder();
|
|
11882
|
-
require_logger.
|
|
11801
|
+
require_logger.logger.info(`File change detected: ${path$7}`);
|
|
11883
11802
|
require_util.printBorder();
|
|
11884
11803
|
clearConfigCache();
|
|
11885
11804
|
await runEvaluation();
|
|
11886
|
-
}).on("error", (error) => require_logger.
|
|
11805
|
+
}).on("error", (error) => require_logger.logger.error(`Watcher error: ${error}`)).on("ready", () => watchPaths.forEach((watchPath) => require_logger.logger.info(`Watching for file changes on ${watchPath} ...`)));
|
|
11887
11806
|
}
|
|
11888
11807
|
} else {
|
|
11889
11808
|
const passRateThreshold = require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD", 100);
|
|
11890
11809
|
const failedTestExitCode = require_logger.getEnvInt("PROMPTFOO_FAILED_TEST_EXIT_CODE", 100);
|
|
11891
11810
|
if (passRate < (Number.isFinite(passRateThreshold) ? passRateThreshold : 100)) {
|
|
11892
|
-
if (require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) require_logger.
|
|
11811
|
+
if (require_logger.getEnvFloat("PROMPTFOO_PASS_RATE_THRESHOLD") !== void 0) require_logger.logger.info(chalk.default.white(`Pass rate ${chalk.default.red.bold(passRate.toFixed(2))}${chalk.default.red("%")} is below the threshold of ${chalk.default.red.bold(passRateThreshold)}${chalk.default.red("%")}`));
|
|
11893
11812
|
process.exitCode = Number.isSafeInteger(failedTestExitCode) ? failedTestExitCode : 100;
|
|
11894
11813
|
return ret;
|
|
11895
11814
|
}
|
|
@@ -11905,7 +11824,6 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11905
11824
|
};
|
|
11906
11825
|
return await runEvaluation(true);
|
|
11907
11826
|
}
|
|
11908
|
-
|
|
11909
11827
|
//#endregion
|
|
11910
11828
|
//#region src/util/verboseToggle.ts
|
|
11911
11829
|
let isVerboseToggleEnabled = false;
|
|
@@ -11968,7 +11886,6 @@ function initVerboseToggle() {
|
|
|
11968
11886
|
function disableVerboseToggle() {
|
|
11969
11887
|
if (cleanupFn) cleanupFn();
|
|
11970
11888
|
}
|
|
11971
|
-
|
|
11972
11889
|
//#endregion
|
|
11973
11890
|
//#region src/redteam/shared.ts
|
|
11974
11891
|
async function doRedteamRun(options) {
|
|
@@ -11985,13 +11902,13 @@ async function doRedteamRun(options) {
|
|
|
11985
11902
|
try {
|
|
11986
11903
|
const healthUrl = require_server.getRemoteHealthUrl();
|
|
11987
11904
|
if (healthUrl) {
|
|
11988
|
-
require_logger.
|
|
11905
|
+
require_logger.logger.debug(`Checking Promptfoo API health at ${healthUrl}...`);
|
|
11989
11906
|
const healthResult = await checkRemoteHealth(healthUrl);
|
|
11990
11907
|
if (healthResult.status !== "OK") throw new Error(`Unable to proceed with redteam: ${healthResult.message}\nPlease check your API configuration or try again later.`);
|
|
11991
|
-
require_logger.
|
|
11908
|
+
require_logger.logger.debug("API health check passed");
|
|
11992
11909
|
}
|
|
11993
11910
|
} catch (error) {
|
|
11994
|
-
require_logger.
|
|
11911
|
+
require_logger.logger.warn(`API health check failed with error: ${error}.\nPlease check your API configuration or try again later.`);
|
|
11995
11912
|
}
|
|
11996
11913
|
if (options.liveRedteamConfig) {
|
|
11997
11914
|
const filename = `redteam-${Date.now()}.yaml`;
|
|
@@ -12001,10 +11918,10 @@ async function doRedteamRun(options) {
|
|
|
12001
11918
|
fs.writeFileSync(tmpFile, js_yaml.default.dump(options.liveRedteamConfig));
|
|
12002
11919
|
redteamPath = tmpFile;
|
|
12003
11920
|
configPath = tmpFile;
|
|
12004
|
-
require_logger.
|
|
12005
|
-
require_logger.
|
|
11921
|
+
require_logger.logger.debug(`Using live config from ${tmpFile}`);
|
|
11922
|
+
require_logger.logger.debug(`Live config: ${JSON.stringify(options.liveRedteamConfig, null, 2)}`);
|
|
12006
11923
|
}
|
|
12007
|
-
require_logger.
|
|
11924
|
+
require_logger.logger.info("Generating test cases...");
|
|
12008
11925
|
const { maxConcurrency, ...passThroughOptions } = options;
|
|
12009
11926
|
let redteamConfig;
|
|
12010
11927
|
const generationStartTime = Date.now();
|
|
@@ -12024,7 +11941,7 @@ async function doRedteamRun(options) {
|
|
|
12024
11941
|
});
|
|
12025
11942
|
} catch (error) {
|
|
12026
11943
|
if (error instanceof require_types.PartialGenerationError) {
|
|
12027
|
-
require_logger.
|
|
11944
|
+
require_logger.logger.error(chalk.default.red("\n" + error.message));
|
|
12028
11945
|
require_logger.setLogCallback(null);
|
|
12029
11946
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12030
11947
|
throw error;
|
|
@@ -12033,11 +11950,11 @@ async function doRedteamRun(options) {
|
|
|
12033
11950
|
}
|
|
12034
11951
|
const generationDurationMs = Date.now() - generationStartTime;
|
|
12035
11952
|
if (!redteamConfig || !fs.existsSync(redteamPath)) {
|
|
12036
|
-
require_logger.
|
|
11953
|
+
require_logger.logger.info("No test cases generated. Skipping scan.");
|
|
12037
11954
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12038
11955
|
return;
|
|
12039
11956
|
}
|
|
12040
|
-
require_logger.
|
|
11957
|
+
require_logger.logger.info("Running scan...");
|
|
12041
11958
|
const { defaultConfig } = await loadDefaultConfig();
|
|
12042
11959
|
const { description: _description, ...evalOptions } = options;
|
|
12043
11960
|
const evalResult = await doEval({
|
|
@@ -12059,16 +11976,15 @@ async function doRedteamRun(options) {
|
|
|
12059
11976
|
if (evalResult.persisted) await evalResult.save();
|
|
12060
11977
|
const totalMs = evalResult.durationMs ?? 0;
|
|
12061
11978
|
const evalMs = evalResult.evaluationDurationMs ?? 0;
|
|
12062
|
-
require_logger.
|
|
11979
|
+
require_logger.logger.info(chalk.default.gray(`Total scan time: ${formatDuration(totalMs / 1e3)} (generation: ${formatDuration(generationDurationMs / 1e3)}, evaluation: ${formatDuration(evalMs / 1e3)})`));
|
|
12063
11980
|
}
|
|
12064
|
-
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else require_logger.
|
|
12065
|
-
if (!evalResult?.shared) if (options.liveRedteamConfig) require_logger.
|
|
12066
|
-
else require_logger.
|
|
11981
|
+
if (evalResult ? await evalResult.findTargetErrorStatus() != null : false) {} else require_logger.logger.info(chalk.default.green("\nRed team scan complete!"));
|
|
11982
|
+
if (!evalResult?.shared) if (options.liveRedteamConfig) require_logger.logger.info(chalk.default.blue(`To view the results, click the ${chalk.default.bold("View Report")} button or run ${chalk.default.bold(promptfooCommand("redteam report"))} on the command line.`));
|
|
11983
|
+
else require_logger.logger.info(chalk.default.blue(`To view the results, run ${chalk.default.bold(promptfooCommand("redteam report"))}`));
|
|
12067
11984
|
require_logger.setLogCallback(null);
|
|
12068
11985
|
if (verboseToggleCleanup) verboseToggleCleanup();
|
|
12069
11986
|
return evalResult;
|
|
12070
11987
|
}
|
|
12071
|
-
|
|
12072
11988
|
//#endregion
|
|
12073
11989
|
//#region src/index.ts
|
|
12074
11990
|
async function evaluate(testSuite, options = {}) {
|
|
@@ -12093,23 +12009,23 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12093
12009
|
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
12094
12010
|
if (constructedTestSuite.defaultTest?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
12095
12011
|
env: testSuite.env,
|
|
12096
|
-
basePath: require_logger.
|
|
12012
|
+
basePath: require_logger.state.basePath
|
|
12097
12013
|
});
|
|
12098
12014
|
if (constructedTestSuite.defaultTest?.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
12099
12015
|
env: testSuite.env,
|
|
12100
|
-
basePath: require_logger.
|
|
12016
|
+
basePath: require_logger.state.basePath
|
|
12101
12017
|
});
|
|
12102
12018
|
}
|
|
12103
12019
|
for (const test of constructedTestSuite.tests || []) {
|
|
12104
12020
|
if (test.options?.provider && !require_types.isApiProvider(test.options.provider)) test.options.provider = await require_providers.resolveProvider(test.options.provider, providerMap, {
|
|
12105
12021
|
env: testSuite.env,
|
|
12106
|
-
basePath: require_logger.
|
|
12022
|
+
basePath: require_logger.state.basePath
|
|
12107
12023
|
});
|
|
12108
12024
|
if (test.assert) for (const assertion of test.assert) {
|
|
12109
12025
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
12110
12026
|
if (assertion.provider && !require_types.isApiProvider(assertion.provider)) assertion.provider = await require_providers.resolveProvider(assertion.provider, providerMap, {
|
|
12111
12027
|
env: testSuite.env,
|
|
12112
|
-
basePath: require_logger.
|
|
12028
|
+
basePath: require_logger.state.basePath
|
|
12113
12029
|
});
|
|
12114
12030
|
}
|
|
12115
12031
|
}
|
|
@@ -12133,12 +12049,12 @@ async function evaluate(testSuite, options = {}) {
|
|
|
12133
12049
|
if (shareableUrl) {
|
|
12134
12050
|
ret.shareableUrl = shareableUrl;
|
|
12135
12051
|
ret.shared = true;
|
|
12136
|
-
require_logger.
|
|
12052
|
+
require_logger.logger.debug(`Eval shared successfully: ${shareableUrl}`);
|
|
12137
12053
|
}
|
|
12138
12054
|
} catch (error) {
|
|
12139
|
-
require_logger.
|
|
12055
|
+
require_logger.logger.warn(`Failed to create shareable URL: ${error}`);
|
|
12140
12056
|
}
|
|
12141
|
-
else require_logger.
|
|
12057
|
+
else require_logger.logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
12142
12058
|
if (testSuite.outputPath) {
|
|
12143
12059
|
if (typeof testSuite.outputPath === "string") await require_util.writeOutput(testSuite.outputPath, evalRecord, null);
|
|
12144
12060
|
else if (Array.isArray(testSuite.outputPath)) await require_util.writeMultipleOutputs(testSuite.outputPath, evalRecord, null);
|
|
@@ -12165,11 +12081,10 @@ var src_default = {
|
|
|
12165
12081
|
assertions: assertions_default,
|
|
12166
12082
|
cache: require_cache.cache_exports,
|
|
12167
12083
|
evaluate,
|
|
12168
|
-
guardrails
|
|
12084
|
+
guardrails,
|
|
12169
12085
|
loadApiProvider: require_providers.loadApiProvider,
|
|
12170
12086
|
redteam
|
|
12171
12087
|
};
|
|
12172
|
-
|
|
12173
12088
|
//#endregion
|
|
12174
12089
|
exports.AssertionOrSetSchema = require_types.AssertionOrSetSchema;
|
|
12175
12090
|
exports.AssertionSchema = require_types.AssertionSchema;
|
|
@@ -12208,20 +12123,21 @@ exports.TestSuiteSchema = require_types.TestSuiteSchema;
|
|
|
12208
12123
|
exports.UnifiedConfigSchema = require_types.UnifiedConfigSchema;
|
|
12209
12124
|
exports.VarsSchema = require_types.VarsSchema;
|
|
12210
12125
|
exports.assertions = assertions_default;
|
|
12211
|
-
Object.defineProperty(exports,
|
|
12212
|
-
|
|
12213
|
-
|
|
12214
|
-
|
|
12215
|
-
|
|
12126
|
+
Object.defineProperty(exports, "cache", {
|
|
12127
|
+
enumerable: true,
|
|
12128
|
+
get: function() {
|
|
12129
|
+
return require_cache.cache_exports;
|
|
12130
|
+
}
|
|
12216
12131
|
});
|
|
12217
12132
|
exports.default = src_default;
|
|
12218
12133
|
exports.evaluate = evaluate;
|
|
12219
12134
|
exports.generateTable = generateTable;
|
|
12220
|
-
exports.guardrails =
|
|
12135
|
+
exports.guardrails = guardrails;
|
|
12221
12136
|
exports.isApiProvider = require_types.isApiProvider;
|
|
12222
12137
|
exports.isGradingResult = require_types.isGradingResult;
|
|
12223
12138
|
exports.isProviderOptions = require_types.isProviderOptions;
|
|
12224
12139
|
exports.isResultFailureReason = require_types.isResultFailureReason;
|
|
12225
12140
|
exports.loadApiProvider = require_providers.loadApiProvider;
|
|
12226
12141
|
exports.redteam = redteam;
|
|
12142
|
+
|
|
12227
12143
|
//# sourceMappingURL=index.cjs.map
|