promptfoo 0.120.27 → 0.121.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/src/{ListApp-8WOe2nT6.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-B0pgC1oV.js +206 -0
- package/dist/src/{accounts-DVINui-2.js → accounts-Bm2D8Db9.js} +39 -34
- package/dist/src/{accounts-CPDRAMND.js → accounts-CiBLOnA7.js} +38 -33
- package/dist/src/{accounts-Fl2J3_Fu.cjs → accounts-gtkH-5KX.cjs} +77 -78
- package/dist/src/{agentic-utils-D922n6mm.js → agentic-utils-DS1g3GLF.js} +9 -10
- package/dist/src/{agents-BcsN_BgB.js → agents-9qiOy0ho.js} +16 -12
- package/dist/src/{agents-BXLmVsxR.js → agents-CBr9A01V.js} +37 -37
- package/dist/src/{agents-pMfppv9Z.js → agents-CmvBq8LV.js} +16 -18
- package/dist/src/{agents-hqgSV-3o.js → agents-D__IdAlg.js} +39 -40
- package/dist/src/{agents-BO2n8Z0d.cjs → agents-DbRtpYxR.cjs} +37 -40
- package/dist/src/{agents-BdUTAwi-.js → agents-DgF2zDag.js} +37 -42
- package/dist/src/{agents-DgJf2-ez.cjs → agents-Di9DKPzn.cjs} +16 -17
- package/dist/src/{agents-DNvSH78i.js → agents-cLXA8a_8.js} +17 -19
- package/dist/src/{aimlapi-DtgPI0nE.js → aimlapi-B4rcnZgv.js} +15 -17
- package/dist/src/{aimlapi-BE_Tg9Fl.cjs → aimlapi-BvlNH0gr.cjs} +15 -16
- package/dist/src/{aimlapi-DOib86oE.js → aimlapi-CnkC2HqE.js} +16 -18
- package/dist/src/{aimlapi-DTPACCB1.js → aimlapi-DHJU_kcV.js} +15 -4
- package/dist/src/app/assets/index-4LKxG2CG.js +439 -0
- package/dist/src/app/assets/{index-NCn4eVBv.css → index-C3zcsZFQ.css} +1 -1
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/index.html +3 -3
- package/dist/src/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/{audio-BnRUGAm_.js → audio-Bkv46et0.js} +6 -5
- package/dist/src/{audio-Cwo68yZS.cjs → audio-CGMyULza.cjs} +6 -7
- package/dist/src/{audio-MSRki4JU.js → audio-ClI_AFre.js} +6 -8
- package/dist/src/{audio-BRYU0BFo.js → audio-Dz3z7s3J.js} +7 -9
- package/dist/src/{base-pGVmXNl4.cjs → base-CGrhspbK.cjs} +36 -38
- package/dist/src/{base-h961VXYk.js → base-CpjcHe4e.js} +11 -13
- package/dist/src/base-DLKtKMFh.js +193 -0
- package/dist/src/{base-XB2tDJrB.js → base-Dy1V8--Z.js} +11 -13
- package/dist/src/blobs-BDbfYdrJ.js +236 -0
- package/dist/src/{blobs-CR5C4Ihh.js → blobs-CBO20krR.js} +9 -12
- package/dist/src/{blobs-BM_e6hCa.js → blobs-CMHN0Qcz.js} +9 -12
- package/dist/src/{blobs-B-KQAFhX.cjs → blobs-D23XLin-.cjs} +34 -37
- package/dist/src/{cache-jsiwsAJv.js → cache-BVeDlD87.js} +132 -117
- package/dist/src/{cache-CIpsoBZR.js → cache-C4Nxf52C.js} +132 -118
- package/dist/src/cache-CeUpFm3M.cjs +5 -0
- package/dist/src/{cache-BTVYfbka.cjs → cache-Dh5WtQps.cjs} +182 -168
- package/dist/src/cache-i1P6crbO.js +756 -0
- package/dist/src/cache-n-RCJ-hL.js +6 -0
- package/dist/src/{chat-BcPjZXIp.js → chat-BiKyneZl.js} +45 -46
- package/dist/src/{chat-D31K7C4u.cjs → chat-C1Qst7jL.cjs} +20 -21
- package/dist/src/{chat-B84t99NW.js → chat-C2jrdPMx.js} +20 -9
- package/dist/src/{chat-BE44YOc6.cjs → chat-CgF-J-Jj.cjs} +65 -66
- package/dist/src/{chat-DwWifjxi.js → chat-CzkrVDfz.js} +20 -22
- package/dist/src/chat-DJIw17u0.js +766 -0
- package/dist/src/{chat-CcUCysjU.js → chat-DqxYYtWA.js} +45 -46
- package/dist/src/{chat-DZM2GUHO.js → chat-qmatte1u.js} +21 -23
- package/dist/src/{chatkit-D67HS_0b.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DAB_qfzI.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-Biqb_wsD.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-PGG4ZYIn.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-SVM6AdBu.js → claude-agent-sdk-Apiy0iaz.js} +31 -31
- package/dist/src/{claude-agent-sdk-C-IOTPfo.js → claude-agent-sdk-D2bJee9S.js} +31 -29
- package/dist/src/{claude-agent-sdk-C9SiaQub.cjs → claude-agent-sdk-D9Z5Pr9X.cjs} +31 -28
- package/dist/src/{claude-agent-sdk-CiluSyW1.js → claude-agent-sdk-DfCoW0E6.js} +33 -20
- package/dist/src/cloud-BBh91EUK.js +4 -0
- package/dist/src/{cloud-CZ-q9Ier.js → cloud-C0dlstV_.js} +7 -9
- package/dist/src/{cloudflare-ai-BahKHyhh.js → cloudflare-ai-8TDxHR0x.js} +16 -18
- package/dist/src/{cloudflare-ai-v_qZD6_q.js → cloudflare-ai-BxAGvfju.js} +17 -19
- package/dist/src/{cloudflare-ai-Dfahv5SY.cjs → cloudflare-ai-CknbZ5LJ.cjs} +16 -17
- package/dist/src/{cloudflare-ai-Dxyt50Nl.js → cloudflare-ai-g7PB6VHR.js} +16 -4
- package/dist/src/{cloudflare-gateway-Bi_FpOFy.js → cloudflare-gateway-B9HWA5wf.js} +23 -23
- package/dist/src/{cloudflare-gateway-BPWoZIzJ.cjs → cloudflare-gateway-BSnDmHYo.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-C0guUNwk.js → cloudflare-gateway-CKDb4dJ8.js} +26 -14
- package/dist/src/{cloudflare-gateway-btS7h1OZ.js → cloudflare-gateway-CP9QEWYS.js} +21 -25
- package/dist/src/{codex-sdk-DSxAnbfT.js → codex-sdk-C6UMlxwV.js} +28 -29
- package/dist/src/{codex-sdk-IYVi9fuM.js → codex-sdk-DUwKWezN.js} +28 -27
- package/dist/src/{codex-sdk-DulY0ZRq.js → codex-sdk-GGAw0qbD.js} +28 -29
- package/dist/src/{codex-sdk-DFKMtAyf.cjs → codex-sdk-fAO0c3yA.cjs} +28 -29
- package/dist/src/{cometapi-DzrR3SR_.js → cometapi-BL9yvj_f.js} +16 -4
- package/dist/src/{cometapi-DIO64tf4.cjs → cometapi-C4xSqeID.cjs} +21 -22
- package/dist/src/{cometapi-C9EEpJzT.js → cometapi-CUQq3H_a.js} +21 -24
- package/dist/src/{cometapi-DkNBMk0G.js → cometapi-DFNiKmSz.js} +17 -19
- package/dist/src/{completion-CG29bfKX.js → completion-5MzrpJxT.js} +11 -13
- package/dist/src/{completion-CCRT4kX1.cjs → completion-CM6oK8PS.cjs} +21 -23
- package/dist/src/{completion-Bgf1VJoq.js → completion-DZ083F31.js} +11 -13
- package/dist/src/completion-qRoZAYRB.js +120 -0
- package/dist/src/{createHash-Dw_iLu31.js → createHash-CTQmL3G2.js} +2 -3
- package/dist/src/{createHash-CYQy4YeL.cjs → createHash-CfZSc0b4.cjs} +13 -14
- package/dist/src/{createHash-CJcfskIZ.js → createHash-Da8fMwqB.js} +2 -3
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-D-ayp2FW.js → docker-Bb5dcxr8.js} +18 -20
- package/dist/src/{docker-B81N0t4e.js → docker-BvfL2BrW.js} +19 -21
- package/dist/src/{docker-DNcLR4Ig.cjs → docker-DcF2pRrj.cjs} +18 -19
- package/dist/src/{docker-egERKxCF.js → docker-ExVyLp0S.js} +18 -7
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-B9dPm_BF.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-D2pZ87fL.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-Ct-Joyue.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-B3r2CVXr.js +15 -0
- package/dist/src/{eval-C-Nr6wX_.js → eval-Dg2nG4v2.js} +47 -54
- package/dist/src/evalResult-5xwYnECe.js +12 -0
- package/dist/src/evalResult-71lY93Kj.cjs +10 -0
- package/dist/src/{evalResult-DXMWJ3sx.js → evalResult-BBRNtX4I.js} +10 -11
- package/dist/src/{evalResult-4BzI2tmj.js → evalResult-BDMqrapS.js} +16 -12
- package/dist/src/evalResult-Dx5P5cIv.js +10 -0
- package/dist/src/{evalResult-CX8wQecI.cjs → evalResult-fuaI8HkH.cjs} +20 -21
- package/dist/src/{evaluator-8aGyV12L.js → evaluator-BhoWwp5b.js} +211 -235
- package/dist/src/evaluator-Jx6bRZV6.js +36 -0
- package/dist/src/{extractor-V5x_m1i0.js → extractor-C0EVHewb.js} +22 -24
- package/dist/src/extractor-D25qpmGX.js +374 -0
- package/dist/src/{extractor-CD5yKL-G.js → extractor-DReVID0K.js} +22 -24
- package/dist/src/{extractor-C031XmTA.cjs → extractor-pYLLi3wS.cjs} +37 -39
- package/dist/src/{fetch-BmbD-v1L.cjs → fetch-BPkYtG8K.cjs} +244 -277
- package/dist/src/fetch-BxNb_Lp3.js +5 -0
- package/dist/src/{fetch-D3OHf-lV.js → fetch-Cwxnd8zz.js} +36 -44
- package/dist/src/{fetch-CXZI9RRr.js → fetch-Dxpd4_sr.js} +23 -35
- package/dist/src/fetch-HaqdX7U1.js +780 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-CdevI3An.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-Dres3qrN.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/{graders--1y2u9HO.js → graders-BTeBGqjJ.js} +349 -397
- package/dist/src/graders-B_pgMLS2.js +34 -0
- package/dist/src/{graders-DTeBrzWp.js → graders-Bj_Odv7c.js} +349 -397
- package/dist/src/graders-DErokPDO.cjs +32 -0
- package/dist/src/graders-DP7KFFo-.js +13466 -0
- package/dist/src/graders-DR_uNe54.js +32 -0
- package/dist/src/{graders-DohM2dir.cjs → graders-DU49_J8Y.cjs} +684 -732
- package/dist/src/graders-w3176Wz-.js +32 -0
- package/dist/src/{image-B0U4Hqll.js → image-B02ogr_b.js} +7 -9
- package/dist/src/{image-DmE-niFE.js → image-B0h9VEMc.js} +6 -5
- package/dist/src/{image-CuKHuccK.cjs → image-BLmROtN3.cjs} +29 -30
- package/dist/src/{image-DNEIf_aI.js → image-Bb4vWQLM.js} +6 -8
- package/dist/src/{image-DpKl2F15.cjs → image-C1madmKh.cjs} +6 -7
- package/dist/src/{image-C3wHC9_h.js → image-CHfWvljl.js} +9 -10
- package/dist/src/{image-O1u4bCFg.js → image-DS-o-0ph.js} +9 -10
- package/dist/src/image-Dpxa1Jt6.js +257 -0
- package/dist/src/index.cjs +615 -695
- package/dist/src/index.d.cts +271 -7
- package/dist/src/index.d.ts +271 -3
- package/dist/src/index.js +580 -664
- package/dist/src/{interactiveCheck-Bxj1Swex.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-CEzQobWX.js → knowledgeBase-B3OoKIej.js} +14 -9
- package/dist/src/{knowledgeBase-Be_zyW4L.js → knowledgeBase-CYTLHOt1.js} +16 -16
- package/dist/src/{knowledgeBase-BZ41IFwq.js → knowledgeBase-D33Ty2l6.js} +14 -18
- package/dist/src/{knowledgeBase-D-5BMXlr.cjs → knowledgeBase-DOO_BM9b.cjs} +14 -15
- package/dist/src/{litellm-DnbRJ2if.js → litellm-AaeZcZQF.js} +18 -19
- package/dist/src/{litellm-hUSNM_M2.cjs → litellm-I_hbp_dc.cjs} +17 -17
- package/dist/src/{litellm-CRDqPhNI.js → litellm-NbjknEh6.js} +17 -18
- package/dist/src/{litellm-9vR8zpfU.js → litellm-TrljxD9G.js} +17 -5
- package/dist/src/{logger-CG1uZPbQ.js → logger-CT3IKMKA.js} +10 -29
- package/dist/src/{logger-B7sBeGa0.cjs → logger-Cp1GPUjj.cjs} +152 -180
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-LSBxlt7a.js → logger-KkObSCzq.js} +13 -31
- package/dist/src/{luma-ray-4blv9iZ2.js → luma-ray-BS2_tY8L.js} +22 -21
- package/dist/src/{luma-ray-drvgdpP9.js → luma-ray-DDsjcgZZ.js} +20 -13
- package/dist/src/{luma-ray-Hm3d6VJE.cjs → luma-ray-Due0n7di.cjs} +20 -21
- package/dist/src/{luma-ray-B2__8lYH.js → luma-ray-f6I2fft-.js} +20 -23
- package/dist/src/main.js +1170 -1321
- package/dist/src/{messages-Uee41Mj5.js → messages-BS17jdMx.js} +22 -24
- package/dist/src/{messages-XhiwCbi4.cjs → messages-Bs1kC7P4.cjs} +32 -34
- package/dist/src/{messages-CGPPidQr.js → messages-D0lx5qK7.js} +22 -24
- package/dist/src/messages-ZJk778GH.js +240 -0
- package/dist/src/{meteor-BYykdXrV.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-CsopaHrH.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-e-E-2vVl.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-C8lGP6P4.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/{modelslab-yKz-ZNB4.js → modelslab-Bmni6skY.js} +17 -10
- package/dist/src/{modelslab-E9gO-bYd.js → modelslab-Bx9IrZfS.js} +18 -20
- package/dist/src/{modelslab-lUVW0cmB.cjs → modelslab-CoUX6Jc_.cjs} +17 -18
- package/dist/src/{modelslab-ClBkr8_9.js → modelslab-DRb74SP4.js} +17 -19
- package/dist/src/{nova-reel-Dk8jNpId.js → nova-reel-BfPq-0Yk.js} +20 -13
- package/dist/src/{nova-reel-D8CuO6QH.cjs → nova-reel-C_QM18Xn.cjs} +20 -21
- package/dist/src/{nova-reel-u2eF2Cxm.js → nova-reel-D_W1tjMH.js} +22 -21
- package/dist/src/{nova-reel-P9bwvtYX.js → nova-reel-bgjxilYW.js} +20 -23
- package/dist/src/{nova-sonic-CK2rAiKi.js → nova-sonic-CFb5GYhg.js} +30 -26
- package/dist/src/{nova-sonic-BaqWlkds.js → nova-sonic-DIGQNR07.js} +30 -31
- package/dist/src/{nova-sonic-yZapPLv7.js → nova-sonic-De1HW5fD.js} +31 -32
- package/dist/src/{nova-sonic-Ds1C-dpm.cjs → nova-sonic-zfcljeRp.cjs} +30 -31
- package/dist/src/{openai-DUFopMrH.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-PblZ3jUE.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CcN1B8Sb.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-B6qqDr_u.cjs → openclaw-CSugPYAr.cjs} +188 -130
- package/dist/src/{openclaw-A-3_loM7.js → openclaw-DiSz3I5L.js} +180 -109
- package/dist/src/{openclaw-a3lylB-V.js → openclaw-DuvJKEW5.js} +178 -124
- package/dist/src/{openclaw-COn6QzDi.js → openclaw-tiVYRtr-.js} +178 -122
- package/dist/src/opencode-sdk-0j6rTWNb.js +562 -0
- package/dist/src/opencode-sdk-B3CWY9h_.js +560 -0
- package/dist/src/opencode-sdk-BL764Jdi.cjs +564 -0
- package/dist/src/opencode-sdk-C2y6UkP2.js +560 -0
- package/dist/src/{otlpReceiver-oyf5wLGC.js → otlpReceiver-C99PPb48.js} +53 -51
- package/dist/src/{otlpReceiver-lXsYVbpj.cjs → otlpReceiver-CGq6LspY.cjs} +53 -55
- package/dist/src/{otlpReceiver-94URx7UW.js → otlpReceiver-CdNBdbsk.js} +53 -55
- package/dist/src/{otlpReceiver-BmmTiMjA.js → otlpReceiver-D89fR-rC.js} +53 -55
- package/dist/src/{providerRegistry-Cq_JK_CJ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-DSSHjMKf.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-CvHEVJad.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B7V0njNs.js +32 -0
- package/dist/src/providers-BEwbhv0X.js +30 -0
- package/dist/src/{providers-Iil64vk9.js → providers-BlqUifFg.js} +1543 -1676
- package/dist/src/providers-CH3C7zf7.js +30 -0
- package/dist/src/{providers-DHbjzW2e.cjs → providers-CgKOSgTR.cjs} +1896 -2029
- package/dist/src/providers-D8lF1sqW.js +33246 -0
- package/dist/src/{providers-BnFpbY_s.js → providers-Dk_6ocUX.js} +1536 -1669
- package/dist/src/providers-zyB6k_38.cjs +31 -0
- package/dist/src/{pythonUtils-CcT5LH1M.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DBbuI3QJ.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-hZ8LeQLv.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/{quiverai-BuI0tE39.js → quiverai-BbOUOn2L.js} +8 -7
- package/dist/src/{quiverai-DCGSZt4U.js → quiverai-CIaELU_m.js} +8 -10
- package/dist/src/{quiverai-DiMVJQDz.cjs → quiverai-PdShCPox.cjs} +8 -9
- package/dist/src/{quiverai-fQNkExW4.js → quiverai-uH-dcTIr.js} +9 -11
- package/dist/src/{render-Dj1smHEb.js → render-Drod8m7K.js} +4 -5
- package/dist/src/responses-CB2jwoAr.js +660 -0
- package/dist/src/{responses-ghR3IOfy.cjs → responses-D8SBTL64.cjs} +39 -42
- package/dist/src/{responses-DOAFFENS.js → responses-DIR9Ud3j.js} +24 -27
- package/dist/src/{responses-CxzoQoBe.js → responses-WNGNYe3K.js} +24 -27
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-CwbGmgYN.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-DudlFZed.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-C8MhKGHb.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-gmskuyre.js → sagemaker-CNBxx5CJ.js} +75 -70
- package/dist/src/{sagemaker-CcxhlOAR.js → sagemaker-CemTFp2h.js} +75 -79
- package/dist/src/{sagemaker-77zbJ2Q2.cjs → sagemaker-Cl28mZU2.cjs} +75 -76
- package/dist/src/{sagemaker-DuM71dVU.js → sagemaker-YSyBXQQh.js} +77 -77
- package/dist/src/{scanner-DJYiSXQj.js → scanner-BsBlNXNn.js} +100 -121
- package/dist/src/server/index.js +5520 -67427
- package/dist/src/{server-B5v33lvE.cjs → server-C_7Ax-hA.cjs} +57 -67
- package/dist/src/{server-BJ4m4f1D.js → server-CqzrVGpF.js} +26 -29
- package/dist/src/server-CuxBbeSY.js +229 -0
- package/dist/src/server-DA4Cyrrq.js +7 -0
- package/dist/src/server-Dulb-4-K.cjs +5 -0
- package/dist/src/{server-RV_i_YX5.js → server-VWgWb00X.js} +19 -24
- package/dist/src/{signal-BW33JuId.js → signal-4U3mfRvL.js} +9 -11
- package/dist/src/{slack-DEURelTy.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-BQYeW9L3.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-BB6yuZzp.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-2pRrhhgJ.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-CXGFv4aR.js +228 -0
- package/dist/src/store-CXS-Q_91.js +6 -0
- package/dist/src/{store-D7CgQzAR.cjs → store-DLlFCC4h.cjs} +44 -45
- package/dist/src/{store-DJNsD1iC.js → store-DXilxTl-.js} +40 -36
- package/dist/src/{store-s3SftUwF.js → store-Dim__MDd.js} +34 -35
- package/dist/src/store-eYkaKMwq.cjs +5 -0
- package/dist/src/{tables-DfTsNN7X.js → tables-6YKwjN9-.js} +19 -21
- package/dist/src/tables-DLJPUdUE.js +288 -0
- package/dist/src/{tables-BKTmd6u7.cjs → tables-DPi7wKeM.cjs} +89 -91
- package/dist/src/{tables-DMegD0Xf.js → tables-gftXzE9I.js} +21 -23
- package/dist/src/telemetry-BpMfhthR.cjs +5 -0
- package/dist/src/{telemetry--WAdAfVi.js → telemetry-CMrFgtPB.js} +11 -13
- package/dist/src/telemetry-Cps3mIU-.js +171 -0
- package/dist/src/{telemetry-DQgVBCAb.cjs → telemetry-DaX14Chu.cjs} +21 -24
- package/dist/src/{telemetry-BedSm-bZ.js → telemetry-Dthj_BbD.js} +17 -14
- package/dist/src/telemetry-Dw38hanS.js +7 -0
- package/dist/src/{text-oiSbwSOI.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-oKzCBnK6.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-B_IrO4GZ.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-mYS9vd5v.js → transcription-BNYURcXg.js} +14 -7
- package/dist/src/{transcription-X2-B4vkX.js → transcription-B_OdaHp7.js} +14 -16
- package/dist/src/{transcription-BO1AHegO.cjs → transcription-NLVG9MT1.cjs} +14 -15
- package/dist/src/{transcription-lzBLiTFJ.js → transcription-s6A-bNrZ.js} +15 -17
- package/dist/src/{transform-B1Hi5lWS.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-DeGlxb0D.js → transform-D5HsjduX.js} +39 -47
- package/dist/src/{transform-CYDILYDe.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-BEgStbHK.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/{transform-D5PjiWiZ.cjs → transform-DuHvhZpj.cjs} +179 -187
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/transform-uAytVuyX.js +1506 -0
- package/dist/src/{transform-Dfl89yi4.js → transform-vNucnNr0.js} +39 -47
- package/dist/src/{transformersAvailability-SZnTS3pJ.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-D-glmEy7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-CjeFXhuJ.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-DWNf48sT.cjs → types-C_7nyzr1.cjs} +538 -574
- package/dist/src/{types-CXQduE9o.js → types-Cbd8uOMq.js} +68 -100
- package/dist/src/types-CzW2QFyi.js +3288 -0
- package/dist/src/{types-C5hEkb-x.js → types-DmyIJ-sR.js} +63 -99
- package/dist/src/{util-CoQjmE3u.js → util-B3xGByQh.js} +4 -5
- package/dist/src/{util-aLhtl3fe.cjs → util-B9vlHIIh.cjs} +208 -223
- package/dist/src/{util-Du96oyYS.js → util-BHGHw5G1.js} +4 -5
- package/dist/src/{util-DQ984syk.js → util-BRYkYPTd.js} +36 -51
- package/dist/src/{util-D9eLdGfa.js → util-BV4XUC0n.js} +5 -6
- package/dist/src/util-Bv6uGDfH.js +293 -0
- package/dist/src/{util-1wWM599Z.cjs → util-BzMcevZc.cjs} +50 -51
- package/dist/src/{util-_h4pVqrz.js → util-C1CeHl-P.js} +36 -51
- package/dist/src/{util-Bm_-UMD_.js → util-CMy69ZgQ.js} +5 -6
- package/dist/src/{util-CyUdMzV0.cjs → util-DGNOS1db.cjs} +34 -35
- package/dist/src/util-Dnmk2mBQ.js +599 -0
- package/dist/src/util-ZzmqNPlg.js +1426 -0
- package/dist/src/{utils-BjLy-Q72.cjs → utils-Cz9qXqII.cjs} +29 -32
- package/dist/src/{utils-CFMn2yHW.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-DvWMzuMx.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-B2D0bCSI.js +0 -439
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/cache-ChPcurj7.js +0 -6
- package/dist/src/cache-VVu_W-yg.js +0 -8
- package/dist/src/cache-YLNCFEM2.cjs +0 -6
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-BndfXy4H.js +0 -5
- package/dist/src/eval-BhHvMY82.js +0 -17
- package/dist/src/evalResult-Dq2gFNQY.js +0 -12
- package/dist/src/evalResult-nmcP5VKH.cjs +0 -12
- package/dist/src/evalResult-trqZjVYh.js +0 -14
- package/dist/src/evaluator-CnfPstzT.js +0 -39
- package/dist/src/fetch-IDPDue6F.cjs +0 -4
- package/dist/src/fetch-hKJ-It8q.js +0 -6
- package/dist/src/fetch-ouKnrWK-.js +0 -4
- package/dist/src/graders-CQn7WUsd.cjs +0 -34
- package/dist/src/graders-DC6QAbpW.js +0 -35
- package/dist/src/graders-DUWz3Y7j.js +0 -37
- package/dist/src/opencode-sdk-4bL9n-Gk.js +0 -382
- package/dist/src/opencode-sdk-BfC2zWcR.js +0 -376
- package/dist/src/opencode-sdk-DMJyuwMg.js +0 -380
- package/dist/src/opencode-sdk-Da-9adza.cjs +0 -383
- package/dist/src/providers-CsXB2Ix-.js +0 -35
- package/dist/src/providers-DO8ltjLC.js +0 -33
- package/dist/src/providers-Dtq-xnXd.cjs +0 -33
- package/dist/src/rubyUtils-BUbcND2f.js +0 -6
- package/dist/src/rubyUtils-Cr55X_KE.js +0 -5
- package/dist/src/rubyUtils-DlIiqoYo.cjs +0 -5
- package/dist/src/server-C2eQH4Gu.js +0 -6
- package/dist/src/server-CXWycu7H.cjs +0 -6
- package/dist/src/server-Q6OGlxxT.js +0 -8
- package/dist/src/store-B3EDO9Q3.js +0 -7
- package/dist/src/store-Dl9F8aw5.js +0 -6
- package/dist/src/store-SnrGrlt9.cjs +0 -6
- package/dist/src/telemetry-BGhiPZtl.js +0 -8
- package/dist/src/telemetry-CFfiYan6.cjs +0 -6
- package/dist/src/telemetry-DHzEduxX.js +0 -6
- package/dist/src/transform-C1x1ZlMQ.cjs +0 -6
- package/dist/src/transform-DYHjFmQu.js +0 -8
- package/dist/src/transform-rmwJT5JQ.js +0 -7
- package/dist/src/transformersAvailability-eJooj0gX.js +0 -35
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { C as getEnvString, E as isCI, O as
|
|
3
|
-
import {
|
|
4
|
-
import { t as invariant } from "./invariant-
|
|
5
|
-
import { r as
|
|
6
|
-
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-
|
|
7
|
-
import { c as promptYesNo } from "./server-
|
|
8
|
-
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-
|
|
9
|
-
import { o as getCache } from "./cache-
|
|
10
|
-
import { n as isNonTransientHttpStatus } from "./errors-
|
|
11
|
-
import { i as isJavascriptFile } from "./fileExtensions-
|
|
12
|
-
import { E as parseFileUrl, I as isAnthropicProvider, L as isGoogleProvider, R as isOpenAiProvider, T as loadFunction, g as maybeLoadToolsFromExternalFile, w as getNunjucksEngine, z as isProviderAllowed } from "./util-
|
|
13
|
-
import { r as runPython } from "./pythonUtils-
|
|
14
|
-
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-
|
|
15
|
-
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as DefaultSuggestionsProvider, dt as getFinalTest, et as matchesSelectBest, ft as loadFromJavaScriptFile, it as getDefaultProviders, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, mt as resolveContext, n as getGraderById, nt as selectMaxScore, pt as processFileReference, q as matchesContextRelevance, tt as matchesSimilarity, ut as coerceString, z as fail } from "./graders-
|
|
16
|
-
import { i as generateIdFromPrompt } from "./utils-
|
|
17
|
-
import { t as OpenAiChatCompletionProvider } from "./chat-
|
|
18
|
-
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-
|
|
19
|
-
import { m as validateFunctionCall } from "./transform-
|
|
20
|
-
import { l as validateFunctionCall$1 } from "./util-
|
|
21
|
-
import { t as extractAndStoreBinaryData } from "./extractor-
|
|
22
|
-
import { n as getTraceStore } from "./store-
|
|
23
|
-
import { t as providerRegistry } from "./providerRegistry-
|
|
24
|
-
import { n as runRuby } from "./rubyUtils-
|
|
25
|
-
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-
|
|
2
|
+
import { C as getEnvString, E as isCI, O as state, S as getEnvInt, T as getMaxEvalTimeMs, _ as summarizeEvaluateResultForLogging, b as getEnvBool, f as extractJsonObjects, g as safeJsonStringify, o as logger, p as getAjv, w as getEvalTimeoutMs } from "./logger-KkObSCzq.js";
|
|
3
|
+
import { N as VERSION, P as FILE_METADATA_KEY, g as isPromptfooSampleTarget, l as sleep, r as fetchWithRetries, y as parseChatPrompt } from "./fetch-Dxpd4_sr.js";
|
|
4
|
+
import { t as invariant } from "./invariant-BtWWVVhl.js";
|
|
5
|
+
import { r as telemetry } from "./telemetry-CMrFgtPB.js";
|
|
6
|
+
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-Cbd8uOMq.js";
|
|
7
|
+
import { c as promptYesNo } from "./server-CqzrVGpF.js";
|
|
8
|
+
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-BlqUifFg.js";
|
|
9
|
+
import { o as getCache } from "./cache-BVeDlD87.js";
|
|
10
|
+
import { n as isNonTransientHttpStatus } from "./errors-P6ll7XSJ.js";
|
|
11
|
+
import { i as isJavascriptFile } from "./fileExtensions-Ds-foDzt.js";
|
|
12
|
+
import { E as parseFileUrl, I as isAnthropicProvider, L as isGoogleProvider, R as isOpenAiProvider, T as loadFunction, g as maybeLoadToolsFromExternalFile, w as getNunjucksEngine, z as isProviderAllowed } from "./util-C1CeHl-P.js";
|
|
13
|
+
import { r as runPython } from "./pythonUtils-C3py6GC1.js";
|
|
14
|
+
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-DilY9wbS.js";
|
|
15
|
+
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as DefaultSuggestionsProvider, dt as getFinalTest, et as matchesSelectBest, ft as loadFromJavaScriptFile, it as getDefaultProviders, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, mt as resolveContext, n as getGraderById, nt as selectMaxScore, pt as processFileReference, q as matchesContextRelevance, tt as matchesSimilarity, ut as coerceString, z as fail } from "./graders-Bj_Odv7c.js";
|
|
16
|
+
import { i as generateIdFromPrompt } from "./utils-f2-Moju7.js";
|
|
17
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-BiKyneZl.js";
|
|
18
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-DflFMjS0.js";
|
|
19
|
+
import { m as validateFunctionCall } from "./transform-D5HsjduX.js";
|
|
20
|
+
import { l as validateFunctionCall$1 } from "./util-CMy69ZgQ.js";
|
|
21
|
+
import { t as extractAndStoreBinaryData } from "./extractor-DReVID0K.js";
|
|
22
|
+
import { n as getTraceStore } from "./store-Dim__MDd.js";
|
|
23
|
+
import { t as providerRegistry } from "./providerRegistry-B0RUOLI_.js";
|
|
24
|
+
import { n as runRuby } from "./rubyUtils-Boc4HZzX.js";
|
|
25
|
+
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-4U3mfRvL.js";
|
|
26
26
|
import chalk from "chalk";
|
|
27
27
|
import fs, { createWriteStream } from "fs";
|
|
28
28
|
import path from "path";
|
|
@@ -43,7 +43,6 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
|
43
43
|
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
44
44
|
import { BatchSpanProcessor, NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
45
45
|
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
|
|
46
|
-
|
|
47
46
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
48
47
|
var ConversationRelevancyTemplate = class {
|
|
49
48
|
static generateVerdicts(slidingWindow) {
|
|
@@ -115,7 +114,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
115
114
|
JSON:`;
|
|
116
115
|
}
|
|
117
116
|
};
|
|
118
|
-
|
|
119
117
|
//#endregion
|
|
120
118
|
//#region src/external/matchers/deepeval.ts
|
|
121
119
|
const nunjucks$1 = getNunjucksEngine(void 0, false, true);
|
|
@@ -165,7 +163,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
165
163
|
return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
166
164
|
}
|
|
167
165
|
}
|
|
168
|
-
|
|
169
166
|
//#endregion
|
|
170
167
|
//#region src/external/assertions/deepeval.ts
|
|
171
168
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -220,7 +217,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
220
217
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
221
218
|
};
|
|
222
219
|
};
|
|
223
|
-
|
|
224
220
|
//#endregion
|
|
225
221
|
//#region src/tracing/evaluatorTracing.ts
|
|
226
222
|
let otlpReceiverStarted = false;
|
|
@@ -253,28 +249,28 @@ function isOtlpReceiverStarted() {
|
|
|
253
249
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
254
250
|
*/
|
|
255
251
|
async function startOtlpReceiverIfNeeded(testSuite) {
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
252
|
+
logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(testSuite.tracing)}`);
|
|
253
|
+
logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
|
|
254
|
+
logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(testSuite.tracing, null, 2)}`);
|
|
259
255
|
if (testSuite.tracing?.enabled && testSuite.tracing?.otlp?.http?.enabled && !otlpReceiverStarted) {
|
|
260
|
-
|
|
256
|
+
telemetry.record("feature_used", { feature: "tracing" });
|
|
261
257
|
try {
|
|
262
|
-
|
|
263
|
-
const { startOTLPReceiver } = await import("./otlpReceiver-
|
|
258
|
+
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
259
|
+
const { startOTLPReceiver } = await import("./otlpReceiver-D89fR-rC.js");
|
|
264
260
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
265
261
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
266
|
-
|
|
262
|
+
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
267
263
|
await startOTLPReceiver(port, host);
|
|
268
264
|
otlpReceiverStarted = true;
|
|
269
|
-
|
|
265
|
+
logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
|
|
270
266
|
} catch (error) {
|
|
271
|
-
|
|
267
|
+
logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
|
|
272
268
|
}
|
|
273
|
-
} else if (otlpReceiverStarted)
|
|
269
|
+
} else if (otlpReceiverStarted) logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
|
|
274
270
|
else {
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
271
|
+
logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
|
|
272
|
+
logger.debug(`[EvaluatorTracing] tracing.enabled: ${testSuite.tracing?.enabled}`);
|
|
273
|
+
logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${testSuite.tracing?.otlp?.http?.enabled}`);
|
|
278
274
|
}
|
|
279
275
|
}
|
|
280
276
|
/**
|
|
@@ -282,13 +278,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
282
278
|
*/
|
|
283
279
|
async function stopOtlpReceiverIfNeeded() {
|
|
284
280
|
if (otlpReceiverStarted) try {
|
|
285
|
-
|
|
286
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver-
|
|
281
|
+
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
282
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver-D89fR-rC.js");
|
|
287
283
|
await stopOTLPReceiver();
|
|
288
284
|
otlpReceiverStarted = false;
|
|
289
|
-
|
|
285
|
+
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
290
286
|
} catch (error) {
|
|
291
|
-
|
|
287
|
+
logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
|
|
292
288
|
}
|
|
293
289
|
}
|
|
294
290
|
/**
|
|
@@ -304,7 +300,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
304
300
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
305
301
|
const envEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
306
302
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
307
|
-
|
|
303
|
+
logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
308
304
|
return result;
|
|
309
305
|
}
|
|
310
306
|
/**
|
|
@@ -313,25 +309,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
313
309
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
314
310
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
315
311
|
if (tracingEnabled) {
|
|
316
|
-
|
|
317
|
-
|
|
312
|
+
logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
313
|
+
logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
318
314
|
}
|
|
319
315
|
if (!tracingEnabled) return null;
|
|
320
|
-
|
|
321
|
-
const { getTraceStore } = await import("./store-
|
|
316
|
+
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
317
|
+
const { getTraceStore } = await import("./store-CXS-Q_91.js");
|
|
322
318
|
const traceStore = getTraceStore();
|
|
323
319
|
const traceId = generateTraceId();
|
|
324
320
|
const spanId = generateSpanId();
|
|
325
321
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
326
|
-
|
|
322
|
+
logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
327
323
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
328
324
|
if (!evaluationId) {
|
|
329
|
-
|
|
325
|
+
logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
330
326
|
evaluationId = `eval-${Date.now()}`;
|
|
331
327
|
}
|
|
332
328
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
333
329
|
try {
|
|
334
|
-
|
|
330
|
+
logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
335
331
|
await traceStore.createTrace({
|
|
336
332
|
traceId,
|
|
337
333
|
evaluationId: evaluationId || "",
|
|
@@ -342,18 +338,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
342
338
|
vars: test.vars
|
|
343
339
|
}
|
|
344
340
|
});
|
|
345
|
-
|
|
341
|
+
logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
346
342
|
} catch (error) {
|
|
347
|
-
|
|
343
|
+
logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
348
344
|
}
|
|
349
|
-
|
|
345
|
+
logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
350
346
|
return {
|
|
351
347
|
traceparent,
|
|
352
348
|
evaluationId,
|
|
353
349
|
testCaseId
|
|
354
350
|
};
|
|
355
351
|
}
|
|
356
|
-
|
|
357
352
|
//#endregion
|
|
358
353
|
//#region src/assertions/answerRelevance.ts
|
|
359
354
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -364,7 +359,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
364
359
|
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
365
360
|
};
|
|
366
361
|
};
|
|
367
|
-
|
|
368
362
|
//#endregion
|
|
369
363
|
//#region src/assertions/assertionsResult.ts
|
|
370
364
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -470,7 +464,6 @@ var AssertionsResult = class {
|
|
|
470
464
|
return this.result;
|
|
471
465
|
}
|
|
472
466
|
};
|
|
473
|
-
|
|
474
467
|
//#endregion
|
|
475
468
|
//#region src/assertions/ngrams.ts
|
|
476
469
|
/**
|
|
@@ -486,7 +479,6 @@ function getNGrams(words, n) {
|
|
|
486
479
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
487
480
|
return ngrams;
|
|
488
481
|
}
|
|
489
|
-
|
|
490
482
|
//#endregion
|
|
491
483
|
//#region src/assertions/bleu.ts
|
|
492
484
|
/**
|
|
@@ -582,7 +574,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
582
574
|
assertion
|
|
583
575
|
};
|
|
584
576
|
}
|
|
585
|
-
|
|
586
577
|
//#endregion
|
|
587
578
|
//#region src/assertions/classifier.ts
|
|
588
579
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -597,9 +588,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
597
588
|
...classificationResult
|
|
598
589
|
};
|
|
599
590
|
}
|
|
600
|
-
|
|
601
591
|
//#endregion
|
|
602
592
|
//#region src/assertions/contains.ts
|
|
593
|
+
function parseCommaSeparatedValues(value) {
|
|
594
|
+
const results = [];
|
|
595
|
+
let i = 0;
|
|
596
|
+
while (i < value.length) {
|
|
597
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
598
|
+
if (i >= value.length) break;
|
|
599
|
+
if (value[i] === ",") {
|
|
600
|
+
i++;
|
|
601
|
+
continue;
|
|
602
|
+
}
|
|
603
|
+
if (value[i] === "\"") {
|
|
604
|
+
i++;
|
|
605
|
+
let field = "";
|
|
606
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
607
|
+
field += value[i + 1];
|
|
608
|
+
i += 2;
|
|
609
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
610
|
+
field += "\"";
|
|
611
|
+
i += 2;
|
|
612
|
+
} else if (value[i] === "\"") {
|
|
613
|
+
i++;
|
|
614
|
+
break;
|
|
615
|
+
} else {
|
|
616
|
+
field += value[i];
|
|
617
|
+
i++;
|
|
618
|
+
}
|
|
619
|
+
results.push(field);
|
|
620
|
+
} else {
|
|
621
|
+
const start = i;
|
|
622
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
623
|
+
results.push(value.substring(start, i).trim());
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
return results;
|
|
627
|
+
}
|
|
603
628
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
604
629
|
const value = valueFromScript ?? renderedValue;
|
|
605
630
|
invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -627,7 +652,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
627
652
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
628
653
|
let value = valueFromScript ?? renderedValue;
|
|
629
654
|
invariant(value, "\"contains-any\" assertion type must have a value");
|
|
630
|
-
if (typeof value === "string") value = value
|
|
655
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
631
656
|
invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
632
657
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
633
658
|
return {
|
|
@@ -640,7 +665,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
640
665
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
641
666
|
let value = valueFromScript ?? renderedValue;
|
|
642
667
|
invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
643
|
-
if (typeof value === "string") value = value
|
|
668
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
644
669
|
invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
645
670
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
646
671
|
return {
|
|
@@ -653,7 +678,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
653
678
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
654
679
|
let value = valueFromScript ?? renderedValue;
|
|
655
680
|
invariant(value, "\"contains-all\" assertion type must have a value");
|
|
656
|
-
if (typeof value === "string") value = value
|
|
681
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
657
682
|
invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
658
683
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
659
684
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -667,7 +692,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
667
692
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
668
693
|
let value = valueFromScript ?? renderedValue;
|
|
669
694
|
invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
670
|
-
if (typeof value === "string") value = value
|
|
695
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
671
696
|
invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
672
697
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
673
698
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -678,7 +703,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
678
703
|
assertion
|
|
679
704
|
};
|
|
680
705
|
};
|
|
681
|
-
|
|
682
706
|
//#endregion
|
|
683
707
|
//#region src/assertions/contextFaithfulness.ts
|
|
684
708
|
/**
|
|
@@ -702,7 +726,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
702
726
|
metadata: { context }
|
|
703
727
|
};
|
|
704
728
|
}
|
|
705
|
-
|
|
706
729
|
//#endregion
|
|
707
730
|
//#region src/assertions/contextRecall.ts
|
|
708
731
|
/**
|
|
@@ -729,7 +752,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
729
752
|
}
|
|
730
753
|
};
|
|
731
754
|
};
|
|
732
|
-
|
|
733
755
|
//#endregion
|
|
734
756
|
//#region src/assertions/contextRelevance.ts
|
|
735
757
|
/**
|
|
@@ -756,7 +778,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
756
778
|
}
|
|
757
779
|
};
|
|
758
780
|
};
|
|
759
|
-
|
|
760
781
|
//#endregion
|
|
761
782
|
//#region src/assertions/cost.ts
|
|
762
783
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -770,7 +791,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
770
791
|
assertion
|
|
771
792
|
};
|
|
772
793
|
};
|
|
773
|
-
|
|
774
794
|
//#endregion
|
|
775
795
|
//#region src/assertions/equals.ts
|
|
776
796
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -790,7 +810,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
790
810
|
assertion
|
|
791
811
|
};
|
|
792
812
|
};
|
|
793
|
-
|
|
794
813
|
//#endregion
|
|
795
814
|
//#region src/assertions/factuality.ts
|
|
796
815
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -801,7 +820,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
801
820
|
...await matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
802
821
|
};
|
|
803
822
|
};
|
|
804
|
-
|
|
805
823
|
//#endregion
|
|
806
824
|
//#region src/assertions/finishReason.ts
|
|
807
825
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -821,7 +839,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
821
839
|
assertion
|
|
822
840
|
};
|
|
823
841
|
}
|
|
824
|
-
|
|
825
842
|
//#endregion
|
|
826
843
|
//#region src/assertions/functionToolCall.ts
|
|
827
844
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -844,7 +861,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
844
861
|
};
|
|
845
862
|
}
|
|
846
863
|
};
|
|
847
|
-
|
|
848
864
|
//#endregion
|
|
849
865
|
//#region src/assertions/geval.ts
|
|
850
866
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -870,7 +886,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
870
886
|
...await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
871
887
|
};
|
|
872
888
|
};
|
|
873
|
-
|
|
874
889
|
//#endregion
|
|
875
890
|
//#region src/assertions/gleu.ts
|
|
876
891
|
/**
|
|
@@ -969,7 +984,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
969
984
|
assertion
|
|
970
985
|
};
|
|
971
986
|
}
|
|
972
|
-
|
|
973
987
|
//#endregion
|
|
974
988
|
//#region src/assertions/guardrails.ts
|
|
975
989
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -978,7 +992,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
978
992
|
flaggedInput: false,
|
|
979
993
|
flaggedOutput: false
|
|
980
994
|
};
|
|
981
|
-
|
|
995
|
+
logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
982
996
|
if (guardrails) if (guardrails.flagged) {
|
|
983
997
|
let reason = "Content failed safety checks";
|
|
984
998
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1007,7 +1021,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1007
1021
|
assertion
|
|
1008
1022
|
};
|
|
1009
1023
|
};
|
|
1010
|
-
|
|
1011
1024
|
//#endregion
|
|
1012
1025
|
//#region src/assertions/html.ts
|
|
1013
1026
|
const HTML_PATTERNS = {
|
|
@@ -1216,7 +1229,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1216
1229
|
assertion
|
|
1217
1230
|
};
|
|
1218
1231
|
};
|
|
1219
|
-
|
|
1220
1232
|
//#endregion
|
|
1221
1233
|
//#region src/assertions/javascript.ts
|
|
1222
1234
|
/**
|
|
@@ -1357,7 +1369,6 @@ ${renderedValue}`,
|
|
|
1357
1369
|
assertion
|
|
1358
1370
|
};
|
|
1359
1371
|
};
|
|
1360
|
-
|
|
1361
1372
|
//#endregion
|
|
1362
1373
|
//#region src/assertions/json.ts
|
|
1363
1374
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1369,7 +1380,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1369
1380
|
} catch {
|
|
1370
1381
|
pass = inverse;
|
|
1371
1382
|
}
|
|
1372
|
-
if (
|
|
1383
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1373
1384
|
let validate;
|
|
1374
1385
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1375
1386
|
const schema = valueFromScript;
|
|
@@ -1381,11 +1392,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1381
1392
|
}
|
|
1382
1393
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1383
1394
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1384
|
-
|
|
1395
|
+
const valid = validate(parsedJson);
|
|
1396
|
+
pass = inverse ? !valid : valid;
|
|
1385
1397
|
if (!pass) return {
|
|
1386
1398
|
pass,
|
|
1387
1399
|
score: 0,
|
|
1388
|
-
reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1400
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1389
1401
|
assertion
|
|
1390
1402
|
};
|
|
1391
1403
|
}
|
|
@@ -1412,9 +1424,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1412
1424
|
}
|
|
1413
1425
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1414
1426
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1427
|
+
const valid = validate(jsonObject);
|
|
1428
|
+
pass = inverse ? !valid : valid;
|
|
1429
|
+
if (valid) {
|
|
1430
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1431
|
+
break;
|
|
1432
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
|
|
1418
1433
|
}
|
|
1419
1434
|
return {
|
|
1420
1435
|
pass,
|
|
@@ -1423,7 +1438,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1423
1438
|
assertion
|
|
1424
1439
|
};
|
|
1425
1440
|
}
|
|
1426
|
-
|
|
1427
1441
|
//#endregion
|
|
1428
1442
|
//#region src/assertions/latency.ts
|
|
1429
1443
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1437,7 +1451,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1437
1451
|
assertion
|
|
1438
1452
|
};
|
|
1439
1453
|
};
|
|
1440
|
-
|
|
1441
1454
|
//#endregion
|
|
1442
1455
|
//#region src/assertions/levenshtein.ts
|
|
1443
1456
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1452,7 +1465,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1452
1465
|
assertion
|
|
1453
1466
|
};
|
|
1454
1467
|
}
|
|
1455
|
-
|
|
1456
1468
|
//#endregion
|
|
1457
1469
|
//#region src/assertions/llmRubric.ts
|
|
1458
1470
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1461,7 +1473,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1461
1473
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1462
1474
|
return matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1463
1475
|
};
|
|
1464
|
-
|
|
1465
1476
|
//#endregion
|
|
1466
1477
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1467
1478
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1472,7 +1483,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1472
1483
|
...await matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1473
1484
|
};
|
|
1474
1485
|
};
|
|
1475
|
-
|
|
1476
1486
|
//#endregion
|
|
1477
1487
|
//#region src/assertions/moderation.ts
|
|
1478
1488
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1495,7 +1505,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1495
1505
|
assertion
|
|
1496
1506
|
};
|
|
1497
1507
|
};
|
|
1498
|
-
|
|
1499
1508
|
//#endregion
|
|
1500
1509
|
//#region src/assertions/openai.ts
|
|
1501
1510
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1556,7 +1565,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1556
1565
|
};
|
|
1557
1566
|
}
|
|
1558
1567
|
};
|
|
1559
|
-
|
|
1560
1568
|
//#endregion
|
|
1561
1569
|
//#region src/assertions/perplexity.ts
|
|
1562
1570
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1583,7 +1591,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1583
1591
|
assertion
|
|
1584
1592
|
};
|
|
1585
1593
|
}
|
|
1586
|
-
|
|
1587
1594
|
//#endregion
|
|
1588
1595
|
//#region src/assertions/pi.ts
|
|
1589
1596
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1591,7 +1598,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1591
1598
|
invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1592
1599
|
return matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1593
1600
|
};
|
|
1594
|
-
|
|
1595
1601
|
//#endregion
|
|
1596
1602
|
//#region src/python/wrapper.ts
|
|
1597
1603
|
/**
|
|
@@ -1607,17 +1613,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1607
1613
|
fs.writeFileSync(tempFilePath, code);
|
|
1608
1614
|
return await runPython(tempFilePath, method, args);
|
|
1609
1615
|
} catch (error) {
|
|
1610
|
-
|
|
1616
|
+
logger.error(`Error executing Python code: ${error}`);
|
|
1611
1617
|
throw error;
|
|
1612
1618
|
} finally {
|
|
1613
1619
|
try {
|
|
1614
1620
|
fs.unlinkSync(tempFilePath);
|
|
1615
1621
|
} catch (error) {
|
|
1616
|
-
|
|
1622
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1617
1623
|
}
|
|
1618
1624
|
}
|
|
1619
1625
|
}
|
|
1620
|
-
|
|
1621
1626
|
//#endregion
|
|
1622
1627
|
//#region src/util/caseMapping.ts
|
|
1623
1628
|
/**
|
|
@@ -1641,7 +1646,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1641
1646
|
});
|
|
1642
1647
|
return result;
|
|
1643
1648
|
}
|
|
1644
|
-
|
|
1645
1649
|
//#endregion
|
|
1646
1650
|
//#region src/assertions/python.ts
|
|
1647
1651
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1711,7 +1715,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1711
1715
|
assertion
|
|
1712
1716
|
};
|
|
1713
1717
|
};
|
|
1714
|
-
|
|
1715
1718
|
//#endregion
|
|
1716
1719
|
//#region src/assertions/redteam.ts
|
|
1717
1720
|
/**
|
|
@@ -1792,7 +1795,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1792
1795
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1793
1796
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1794
1797
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1795
|
-
|
|
1798
|
+
logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1796
1799
|
error: errorMessage,
|
|
1797
1800
|
strategyId: test.metadata.strategyId,
|
|
1798
1801
|
pluginId: test.metadata.pluginId
|
|
@@ -1812,7 +1815,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1812
1815
|
throw error;
|
|
1813
1816
|
}
|
|
1814
1817
|
};
|
|
1815
|
-
|
|
1816
1818
|
//#endregion
|
|
1817
1819
|
//#region src/assertions/refusal.ts
|
|
1818
1820
|
function handleIsRefusal(params) {
|
|
@@ -1840,7 +1842,6 @@ function handleIsRefusal(params) {
|
|
|
1840
1842
|
assertion
|
|
1841
1843
|
};
|
|
1842
1844
|
}
|
|
1843
|
-
|
|
1844
1845
|
//#endregion
|
|
1845
1846
|
//#region src/assertions/regex.ts
|
|
1846
1847
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1865,7 +1866,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1865
1866
|
assertion
|
|
1866
1867
|
};
|
|
1867
1868
|
};
|
|
1868
|
-
|
|
1869
1869
|
//#endregion
|
|
1870
1870
|
//#region src/assertions/rouge.ts
|
|
1871
1871
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1881,7 +1881,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1881
1881
|
assertion
|
|
1882
1882
|
};
|
|
1883
1883
|
}
|
|
1884
|
-
|
|
1885
1884
|
//#endregion
|
|
1886
1885
|
//#region src/ruby/wrapper.ts
|
|
1887
1886
|
/**
|
|
@@ -1897,17 +1896,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1897
1896
|
fs.writeFileSync(tempFilePath, code);
|
|
1898
1897
|
return await runRuby(tempFilePath, method, args);
|
|
1899
1898
|
} catch (error) {
|
|
1900
|
-
|
|
1899
|
+
logger.error(`Error executing Ruby code: ${error}`);
|
|
1901
1900
|
throw error;
|
|
1902
1901
|
} finally {
|
|
1903
1902
|
try {
|
|
1904
1903
|
fs.unlinkSync(tempFilePath);
|
|
1905
1904
|
} catch (error) {
|
|
1906
|
-
|
|
1905
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1907
1906
|
}
|
|
1908
1907
|
}
|
|
1909
1908
|
}
|
|
1910
|
-
|
|
1911
1909
|
//#endregion
|
|
1912
1910
|
//#region src/assertions/ruby.ts
|
|
1913
1911
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1978,7 +1976,6 @@ end
|
|
|
1978
1976
|
assertion
|
|
1979
1977
|
};
|
|
1980
1978
|
};
|
|
1981
|
-
|
|
1982
1979
|
//#endregion
|
|
1983
1980
|
//#region src/assertions/searchRubric.ts
|
|
1984
1981
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -1990,7 +1987,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
1990
1987
|
}
|
|
1991
1988
|
return result;
|
|
1992
1989
|
}
|
|
1993
|
-
|
|
1994
1990
|
//#endregion
|
|
1995
1991
|
//#region src/assertions/similar.ts
|
|
1996
1992
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2033,7 +2029,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2033
2029
|
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2034
2030
|
};
|
|
2035
2031
|
};
|
|
2036
|
-
|
|
2037
2032
|
//#endregion
|
|
2038
2033
|
//#region src/assertions/sql.ts
|
|
2039
2034
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2125,7 +2120,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2125
2120
|
}
|
|
2126
2121
|
return handleIsSql(assertionParams);
|
|
2127
2122
|
};
|
|
2128
|
-
|
|
2129
2123
|
//#endregion
|
|
2130
2124
|
//#region src/assertions/startsWith.ts
|
|
2131
2125
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2139,7 +2133,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2139
2133
|
assertion
|
|
2140
2134
|
};
|
|
2141
2135
|
};
|
|
2142
|
-
|
|
2143
2136
|
//#endregion
|
|
2144
2137
|
//#region src/assertions/toolCallF1.ts
|
|
2145
2138
|
/**
|
|
@@ -2268,7 +2261,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2268
2261
|
assertion
|
|
2269
2262
|
};
|
|
2270
2263
|
};
|
|
2271
|
-
|
|
2272
2264
|
//#endregion
|
|
2273
2265
|
//#region src/assertions/traceUtils.ts
|
|
2274
2266
|
/**
|
|
@@ -2286,7 +2278,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2286
2278
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2287
2279
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2288
2280
|
}
|
|
2289
|
-
|
|
2290
2281
|
//#endregion
|
|
2291
2282
|
//#region src/assertions/traceErrorSpans.ts
|
|
2292
2283
|
function isErrorSpan(span) {
|
|
@@ -2364,7 +2355,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2364
2355
|
assertion
|
|
2365
2356
|
};
|
|
2366
2357
|
};
|
|
2367
|
-
|
|
2368
2358
|
//#endregion
|
|
2369
2359
|
//#region src/assertions/traceSpanCount.ts
|
|
2370
2360
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2399,7 +2389,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2399
2389
|
assertion
|
|
2400
2390
|
};
|
|
2401
2391
|
};
|
|
2402
|
-
|
|
2403
2392
|
//#endregion
|
|
2404
2393
|
//#region src/assertions/traceSpanDuration.ts
|
|
2405
2394
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2457,7 +2446,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2457
2446
|
assertion
|
|
2458
2447
|
};
|
|
2459
2448
|
};
|
|
2460
|
-
|
|
2461
2449
|
//#endregion
|
|
2462
2450
|
//#region src/assertions/webhook.ts
|
|
2463
2451
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2494,7 +2482,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2494
2482
|
};
|
|
2495
2483
|
}
|
|
2496
2484
|
}
|
|
2497
|
-
|
|
2498
2485
|
//#endregion
|
|
2499
2486
|
//#region src/assertions/wordCount.ts
|
|
2500
2487
|
/**
|
|
@@ -2557,7 +2544,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2557
2544
|
assertion
|
|
2558
2545
|
};
|
|
2559
2546
|
};
|
|
2560
|
-
|
|
2561
2547
|
//#endregion
|
|
2562
2548
|
//#region src/assertions/xml.ts
|
|
2563
2549
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2632,7 +2618,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2632
2618
|
assertion
|
|
2633
2619
|
};
|
|
2634
2620
|
};
|
|
2635
|
-
|
|
2636
2621
|
//#endregion
|
|
2637
2622
|
//#region src/assertions/index.ts
|
|
2638
2623
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2686,7 +2671,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2686
2671
|
"llm-rubric": handleLlmRubric,
|
|
2687
2672
|
meteor: async (params) => {
|
|
2688
2673
|
try {
|
|
2689
|
-
const { handleMeteorAssertion } = await import("./meteor-
|
|
2674
|
+
const { handleMeteorAssertion } = await import("./meteor-44VjEACX.js");
|
|
2690
2675
|
return handleMeteorAssertion(params);
|
|
2691
2676
|
} catch (error) {
|
|
2692
2677
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2732,10 +2717,10 @@ function renderMetricName(metric, vars) {
|
|
|
2732
2717
|
if (!metric) return metric;
|
|
2733
2718
|
try {
|
|
2734
2719
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2735
|
-
if (rendered === "" && metric !== "")
|
|
2720
|
+
if (rendered === "" && metric !== "") logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2736
2721
|
return rendered;
|
|
2737
2722
|
} catch (error) {
|
|
2738
|
-
|
|
2723
|
+
logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2739
2724
|
return metric;
|
|
2740
2725
|
}
|
|
2741
2726
|
}
|
|
@@ -2786,12 +2771,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2786
2771
|
spans: traceData.spans || []
|
|
2787
2772
|
};
|
|
2788
2773
|
} catch (error) {
|
|
2789
|
-
|
|
2774
|
+
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2790
2775
|
}
|
|
2791
2776
|
let renderedValue = assertion.value;
|
|
2792
2777
|
let valueFromScript;
|
|
2793
2778
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2794
|
-
const basePath =
|
|
2779
|
+
const basePath = state.basePath || "";
|
|
2795
2780
|
const fileRef = renderedValue.slice(7);
|
|
2796
2781
|
let filePath = fileRef;
|
|
2797
2782
|
let functionName;
|
|
@@ -2803,10 +2788,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2803
2788
|
filePath = path.resolve(basePath, filePath);
|
|
2804
2789
|
if (isJavascriptFile(filePath)) {
|
|
2805
2790
|
valueFromScript = await loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2806
|
-
|
|
2791
|
+
logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2807
2792
|
} else if (filePath.endsWith(".py")) try {
|
|
2808
2793
|
valueFromScript = await runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2809
|
-
|
|
2794
|
+
logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2810
2795
|
} catch (error) {
|
|
2811
2796
|
return {
|
|
2812
2797
|
pass: false,
|
|
@@ -2816,9 +2801,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2816
2801
|
};
|
|
2817
2802
|
}
|
|
2818
2803
|
else if (filePath.endsWith(".rb")) try {
|
|
2819
|
-
const { runRuby } = await import("./rubyUtils-
|
|
2804
|
+
const { runRuby } = await import("./rubyUtils-BUHu6PhO.js");
|
|
2820
2805
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2821
|
-
|
|
2806
|
+
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2822
2807
|
} catch (error) {
|
|
2823
2808
|
return {
|
|
2824
2809
|
pass: false,
|
|
@@ -2829,7 +2814,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2829
2814
|
}
|
|
2830
2815
|
else renderedValue = processFileReference(renderedValue);
|
|
2831
2816
|
} else if (isPackagePath(renderedValue)) {
|
|
2832
|
-
const basePath =
|
|
2817
|
+
const basePath = state.basePath || "";
|
|
2833
2818
|
const requiredModule = await loadFromPackage(renderedValue, basePath);
|
|
2834
2819
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2835
2820
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -2990,7 +2975,6 @@ var assertions_default = {
|
|
|
2990
2975
|
matchesModeration,
|
|
2991
2976
|
matchesConversationRelevance
|
|
2992
2977
|
};
|
|
2993
|
-
|
|
2994
2978
|
//#endregion
|
|
2995
2979
|
//#region src/util/promptMatching.ts
|
|
2996
2980
|
/**
|
|
@@ -3028,7 +3012,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3028
3012
|
if (allowedPrompts.length === 0) return false;
|
|
3029
3013
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3030
3014
|
}
|
|
3031
|
-
|
|
3032
3015
|
//#endregion
|
|
3033
3016
|
//#region src/progress/ciProgressReporter.ts
|
|
3034
3017
|
var CIProgressReporter = class {
|
|
@@ -3050,7 +3033,7 @@ var CIProgressReporter = class {
|
|
|
3050
3033
|
}
|
|
3051
3034
|
start() {
|
|
3052
3035
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3053
|
-
|
|
3036
|
+
logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3054
3037
|
this.intervalId = setInterval(() => {
|
|
3055
3038
|
this.logPeriodicUpdate();
|
|
3056
3039
|
}, this.updateIntervalMs);
|
|
@@ -3081,14 +3064,14 @@ var CIProgressReporter = class {
|
|
|
3081
3064
|
this.intervalId = null;
|
|
3082
3065
|
}
|
|
3083
3066
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3084
|
-
|
|
3067
|
+
logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3085
3068
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3086
3069
|
}
|
|
3087
3070
|
error(message) {
|
|
3088
3071
|
const now = Date.now();
|
|
3089
3072
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3090
3073
|
this.lastErrorTime = now;
|
|
3091
|
-
|
|
3074
|
+
logger.error(`[Evaluation Error] ${message}`);
|
|
3092
3075
|
if (process.env.GITHUB_ACTIONS) {
|
|
3093
3076
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3094
3077
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3107,12 +3090,12 @@ var CIProgressReporter = class {
|
|
|
3107
3090
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3108
3091
|
}
|
|
3109
3092
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3110
|
-
|
|
3111
|
-
|
|
3093
|
+
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3094
|
+
logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3112
3095
|
}
|
|
3113
3096
|
logMilestone(percentage) {
|
|
3114
3097
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3115
|
-
|
|
3098
|
+
logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3116
3099
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3117
3100
|
}
|
|
3118
3101
|
formatElapsedTime(ms) {
|
|
@@ -3123,7 +3106,6 @@ var CIProgressReporter = class {
|
|
|
3123
3106
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3124
3107
|
}
|
|
3125
3108
|
};
|
|
3126
|
-
|
|
3127
3109
|
//#endregion
|
|
3128
3110
|
//#region src/providers/azure/warnings.ts
|
|
3129
3111
|
/**
|
|
@@ -3137,13 +3119,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3137
3119
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3138
3120
|
if (modelGradedAsserts.length > 0) {
|
|
3139
3121
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3140
|
-
|
|
3122
|
+
logger.warn(chalk.yellow(`You are using model-graded assertions of types ${chalk.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3141
3123
|
return true;
|
|
3142
3124
|
}
|
|
3143
3125
|
}
|
|
3144
3126
|
return false;
|
|
3145
3127
|
}
|
|
3146
|
-
|
|
3147
3128
|
//#endregion
|
|
3148
3129
|
//#region src/suggestions.ts
|
|
3149
3130
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3174,7 +3155,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3174
3155
|
};
|
|
3175
3156
|
}
|
|
3176
3157
|
}
|
|
3177
|
-
|
|
3178
3158
|
//#endregion
|
|
3179
3159
|
//#region src/tracing/otelConfig.ts
|
|
3180
3160
|
/**
|
|
@@ -3200,7 +3180,6 @@ function getDefaultOtelConfig() {
|
|
|
3200
3180
|
enabled: true
|
|
3201
3181
|
};
|
|
3202
3182
|
}
|
|
3203
|
-
|
|
3204
3183
|
//#endregion
|
|
3205
3184
|
//#region src/tracing/localSpanExporter.ts
|
|
3206
3185
|
/**
|
|
@@ -3220,7 +3199,7 @@ var LocalSpanExporter = class {
|
|
|
3220
3199
|
});
|
|
3221
3200
|
else resultCallback({ code: ExportResultCode.SUCCESS });
|
|
3222
3201
|
}).catch((error) => {
|
|
3223
|
-
|
|
3202
|
+
logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3224
3203
|
resultCallback({
|
|
3225
3204
|
code: ExportResultCode.FAILED,
|
|
3226
3205
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3234,7 +3213,7 @@ var LocalSpanExporter = class {
|
|
|
3234
3213
|
async exportAsync(spans) {
|
|
3235
3214
|
if (spans.length === 0) return;
|
|
3236
3215
|
const traceStore = getTraceStore();
|
|
3237
|
-
|
|
3216
|
+
logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3238
3217
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3239
3218
|
for (const span of spans) {
|
|
3240
3219
|
const traceId = span.spanContext().traceId;
|
|
@@ -3245,12 +3224,12 @@ var LocalSpanExporter = class {
|
|
|
3245
3224
|
let firstError;
|
|
3246
3225
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3247
3226
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3248
|
-
if (result.stored)
|
|
3249
|
-
else
|
|
3227
|
+
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3228
|
+
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3250
3229
|
} catch (error) {
|
|
3251
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY"))
|
|
3230
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3252
3231
|
else {
|
|
3253
|
-
|
|
3232
|
+
logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3254
3233
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3255
3234
|
}
|
|
3256
3235
|
}
|
|
@@ -3287,7 +3266,7 @@ var LocalSpanExporter = class {
|
|
|
3287
3266
|
* Shutdown the exporter. No-op for local storage.
|
|
3288
3267
|
*/
|
|
3289
3268
|
shutdown() {
|
|
3290
|
-
|
|
3269
|
+
logger.debug("[LocalSpanExporter] Shutting down");
|
|
3291
3270
|
return Promise.resolve();
|
|
3292
3271
|
}
|
|
3293
3272
|
/**
|
|
@@ -3297,7 +3276,6 @@ var LocalSpanExporter = class {
|
|
|
3297
3276
|
return Promise.resolve();
|
|
3298
3277
|
}
|
|
3299
3278
|
};
|
|
3300
|
-
|
|
3301
3279
|
//#endregion
|
|
3302
3280
|
//#region src/tracing/otelSdk.ts
|
|
3303
3281
|
let provider = null;
|
|
@@ -3325,21 +3303,21 @@ function getHandlers() {
|
|
|
3325
3303
|
*/
|
|
3326
3304
|
function initializeOtel(config) {
|
|
3327
3305
|
if (initialized) {
|
|
3328
|
-
|
|
3306
|
+
logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3329
3307
|
return;
|
|
3330
3308
|
}
|
|
3331
3309
|
if (!config.enabled) {
|
|
3332
|
-
|
|
3310
|
+
logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3333
3311
|
return;
|
|
3334
3312
|
}
|
|
3335
|
-
|
|
3313
|
+
logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3336
3314
|
serviceName: config.serviceName,
|
|
3337
3315
|
endpoint: config.endpoint,
|
|
3338
3316
|
localExport: config.localExport
|
|
3339
3317
|
});
|
|
3340
3318
|
if (config.debug) diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);
|
|
3341
3319
|
propagation.setGlobalPropagator(new W3CTraceContextPropagator());
|
|
3342
|
-
|
|
3320
|
+
logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3343
3321
|
const resource = resourceFromAttributes({
|
|
3344
3322
|
[ATTR_SERVICE_NAME]: config.serviceName,
|
|
3345
3323
|
[ATTR_SERVICE_VERSION]: VERSION
|
|
@@ -3348,12 +3326,12 @@ function initializeOtel(config) {
|
|
|
3348
3326
|
if (config.localExport) {
|
|
3349
3327
|
const localExporter = new LocalSpanExporter();
|
|
3350
3328
|
spanProcessors.push(new BatchSpanProcessor(localExporter));
|
|
3351
|
-
|
|
3329
|
+
logger.debug("[OtelSdk] Added local span exporter");
|
|
3352
3330
|
}
|
|
3353
3331
|
if (config.endpoint) {
|
|
3354
3332
|
const otlpExporter = new OTLPTraceExporter({ url: config.endpoint });
|
|
3355
3333
|
spanProcessors.push(new BatchSpanProcessor(otlpExporter));
|
|
3356
|
-
|
|
3334
|
+
logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3357
3335
|
}
|
|
3358
3336
|
provider = new NodeTracerProvider({
|
|
3359
3337
|
resource,
|
|
@@ -3361,7 +3339,7 @@ function initializeOtel(config) {
|
|
|
3361
3339
|
});
|
|
3362
3340
|
provider.register();
|
|
3363
3341
|
initialized = true;
|
|
3364
|
-
|
|
3342
|
+
logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3365
3343
|
setupShutdownHandlers();
|
|
3366
3344
|
}
|
|
3367
3345
|
/**
|
|
@@ -3370,12 +3348,12 @@ function initializeOtel(config) {
|
|
|
3370
3348
|
*/
|
|
3371
3349
|
async function shutdownOtel() {
|
|
3372
3350
|
if (!initialized || !provider) return;
|
|
3373
|
-
|
|
3351
|
+
logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3374
3352
|
try {
|
|
3375
3353
|
await provider.shutdown();
|
|
3376
|
-
|
|
3354
|
+
logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3377
3355
|
} catch (error) {
|
|
3378
|
-
|
|
3356
|
+
logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3379
3357
|
} finally {
|
|
3380
3358
|
provider = null;
|
|
3381
3359
|
initialized = false;
|
|
@@ -3388,12 +3366,12 @@ async function shutdownOtel() {
|
|
|
3388
3366
|
*/
|
|
3389
3367
|
async function flushOtel() {
|
|
3390
3368
|
if (!initialized || !provider) return;
|
|
3391
|
-
|
|
3369
|
+
logger.debug("[OtelSdk] Flushing pending spans");
|
|
3392
3370
|
try {
|
|
3393
3371
|
await provider.forceFlush();
|
|
3394
|
-
|
|
3372
|
+
logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3395
3373
|
} catch (error) {
|
|
3396
|
-
|
|
3374
|
+
logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3397
3375
|
}
|
|
3398
3376
|
}
|
|
3399
3377
|
/**
|
|
@@ -3405,7 +3383,7 @@ function setupShutdownHandlers() {
|
|
|
3405
3383
|
const handlers = getHandlers();
|
|
3406
3384
|
if (handlers.registered) return;
|
|
3407
3385
|
const shutdown = async (signal) => {
|
|
3408
|
-
|
|
3386
|
+
logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3409
3387
|
await shutdownOtel();
|
|
3410
3388
|
};
|
|
3411
3389
|
handlers.sigTermHandler = () => {
|
|
@@ -3442,7 +3420,6 @@ function cleanupShutdownHandlers() {
|
|
|
3442
3420
|
}
|
|
3443
3421
|
handlers.registered = false;
|
|
3444
3422
|
}
|
|
3445
|
-
|
|
3446
3423
|
//#endregion
|
|
3447
3424
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3448
3425
|
var JsonlFileWriter = class {
|
|
@@ -3466,7 +3443,6 @@ var JsonlFileWriter = class {
|
|
|
3466
3443
|
});
|
|
3467
3444
|
}
|
|
3468
3445
|
};
|
|
3469
|
-
|
|
3470
3446
|
//#endregion
|
|
3471
3447
|
//#region src/evaluator.ts
|
|
3472
3448
|
/**
|
|
@@ -3658,7 +3634,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3658
3634
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3659
3635
|
else {
|
|
3660
3636
|
const activeProvider = isApiProvider(test.provider) ? test.provider : provider;
|
|
3661
|
-
|
|
3637
|
+
logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3662
3638
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3663
3639
|
const callApiContext = {
|
|
3664
3640
|
vars,
|
|
@@ -3669,7 +3645,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3669
3645
|
filters,
|
|
3670
3646
|
originalProvider: provider,
|
|
3671
3647
|
test,
|
|
3672
|
-
logger
|
|
3648
|
+
logger,
|
|
3673
3649
|
getCache,
|
|
3674
3650
|
repeatIndex
|
|
3675
3651
|
};
|
|
@@ -3686,8 +3662,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3686
3662
|
const sanitizedMetadata = safeJsonStringify(response.metadata);
|
|
3687
3663
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3688
3664
|
}
|
|
3689
|
-
|
|
3690
|
-
|
|
3665
|
+
logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3666
|
+
logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3691
3667
|
}
|
|
3692
3668
|
latencyMs = Date.now() - startTime;
|
|
3693
3669
|
let conversationLastInput = void 0;
|
|
@@ -3704,12 +3680,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3704
3680
|
metadata: response.metadata
|
|
3705
3681
|
});
|
|
3706
3682
|
}
|
|
3707
|
-
|
|
3708
|
-
|
|
3683
|
+
logger.debug("Evaluator response", { responsePreview: (safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3684
|
+
logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3709
3685
|
if (!response.cached && provider.delay > 0) {
|
|
3710
|
-
|
|
3686
|
+
logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3711
3687
|
await sleep(provider.delay);
|
|
3712
|
-
} else if (response.cached)
|
|
3688
|
+
} else if (response.cached) logger.debug(`Skipping delay because response is cached`);
|
|
3713
3689
|
const ret = {
|
|
3714
3690
|
...setup,
|
|
3715
3691
|
response,
|
|
@@ -3812,7 +3788,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3812
3788
|
promptIdx,
|
|
3813
3789
|
testIdx
|
|
3814
3790
|
});
|
|
3815
|
-
if (!(err instanceof Error && err.name === "AbortError"))
|
|
3791
|
+
if (!(err instanceof Error && err.name === "AbortError")) logger.error("Provider call failed during eval", logContext);
|
|
3816
3792
|
return [{
|
|
3817
3793
|
...setup,
|
|
3818
3794
|
error: errorWithStack,
|
|
@@ -3895,7 +3871,7 @@ function generateVarCombinations(vars) {
|
|
|
3895
3871
|
let values = [];
|
|
3896
3872
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
3897
3873
|
const filePath = vars[key].slice(7);
|
|
3898
|
-
const basePath =
|
|
3874
|
+
const basePath = state.basePath || "";
|
|
3899
3875
|
values = (globSync(filePath, {
|
|
3900
3876
|
cwd: basePath || process.cwd(),
|
|
3901
3877
|
windowsPathsNoEscape: true
|
|
@@ -3935,28 +3911,28 @@ var Evaluator = class {
|
|
|
3935
3911
|
this.conversations = {};
|
|
3936
3912
|
this.registers = {};
|
|
3937
3913
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
3938
|
-
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
3914
|
+
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
3939
3915
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
3940
|
-
|
|
3916
|
+
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
3941
3917
|
retryAfterMs: data.retryAfterMs,
|
|
3942
3918
|
resetAt: data.resetAt,
|
|
3943
3919
|
concurrencyChange: data.concurrencyChange
|
|
3944
3920
|
});
|
|
3945
3921
|
});
|
|
3946
3922
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
3947
|
-
|
|
3923
|
+
logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
3948
3924
|
requestLimit: data.requestLimit,
|
|
3949
3925
|
tokenLimit: data.tokenLimit
|
|
3950
3926
|
});
|
|
3951
3927
|
});
|
|
3952
3928
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
3953
|
-
|
|
3929
|
+
logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
3954
3930
|
previous: data.previous,
|
|
3955
3931
|
current: data.current
|
|
3956
3932
|
});
|
|
3957
3933
|
});
|
|
3958
3934
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
3959
|
-
|
|
3935
|
+
logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
3960
3936
|
previous: data.previous,
|
|
3961
3937
|
current: data.current
|
|
3962
3938
|
});
|
|
@@ -4013,7 +3989,7 @@ var Evaluator = class {
|
|
|
4013
3989
|
const checkAbort = () => {
|
|
4014
3990
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4015
3991
|
};
|
|
4016
|
-
if (!options.silent)
|
|
3992
|
+
if (!options.silent) logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4017
3993
|
checkAbort();
|
|
4018
3994
|
const prompts = [];
|
|
4019
3995
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4025,32 +4001,32 @@ var Evaluator = class {
|
|
|
4025
4001
|
}
|
|
4026
4002
|
testSuite = (await runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4027
4003
|
if (options.generateSuggestions) {
|
|
4028
|
-
|
|
4004
|
+
logger.info(`Generating prompt variations...`);
|
|
4029
4005
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4030
4006
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4031
|
-
|
|
4007
|
+
logger.info(chalk.blue("Generated prompts:"));
|
|
4032
4008
|
let numAdded = 0;
|
|
4033
4009
|
for (const prompt of newPrompts) {
|
|
4034
|
-
|
|
4035
|
-
|
|
4036
|
-
|
|
4010
|
+
logger.info("--------------------------------------------------------");
|
|
4011
|
+
logger.info(`${prompt}`);
|
|
4012
|
+
logger.info("--------------------------------------------------------");
|
|
4037
4013
|
if (await promptYesNo("Do you want to test this prompt?", false)) {
|
|
4038
4014
|
testSuite.prompts.push({
|
|
4039
4015
|
raw: prompt,
|
|
4040
4016
|
label: prompt
|
|
4041
4017
|
});
|
|
4042
4018
|
numAdded++;
|
|
4043
|
-
} else
|
|
4019
|
+
} else logger.info("Skipping this prompt.");
|
|
4044
4020
|
}
|
|
4045
4021
|
if (numAdded < 1) {
|
|
4046
|
-
|
|
4022
|
+
logger.info(chalk.red("No prompts selected. Aborting."));
|
|
4047
4023
|
process.exitCode = 1;
|
|
4048
4024
|
return this.evalRecord;
|
|
4049
4025
|
}
|
|
4050
4026
|
}
|
|
4051
4027
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4052
|
-
if (
|
|
4053
|
-
|
|
4028
|
+
if (state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4029
|
+
logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4054
4030
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4055
4031
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4056
4032
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4088,7 +4064,7 @@ var Evaluator = class {
|
|
|
4088
4064
|
await this.evalRecord.addPrompts(prompts);
|
|
4089
4065
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4090
4066
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4091
|
-
|
|
4067
|
+
telemetry.record("feature_used", { feature: "scenarios" });
|
|
4092
4068
|
let scenarioIndex = 0;
|
|
4093
4069
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4094
4070
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4152,7 +4128,7 @@ var Evaluator = class {
|
|
|
4152
4128
|
}
|
|
4153
4129
|
const runEvalOptions = [];
|
|
4154
4130
|
let testIdx = 0;
|
|
4155
|
-
let concurrency = options.maxConcurrency ||
|
|
4131
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4156
4132
|
for (let index = 0; index < tests.length; index++) {
|
|
4157
4133
|
const testCase = tests[index];
|
|
4158
4134
|
invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4172,7 +4148,7 @@ var Evaluator = class {
|
|
|
4172
4148
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4173
4149
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4174
4150
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4175
|
-
const { loadApiProvider } = await import("./providers-
|
|
4151
|
+
const { loadApiProvider } = await import("./providers-B7V0njNs.js");
|
|
4176
4152
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4177
4153
|
} else testCase.provider = defaultProvider;
|
|
4178
4154
|
}
|
|
@@ -4199,7 +4175,7 @@ var Evaluator = class {
|
|
|
4199
4175
|
const promptId = generateIdFromPrompt(prompt);
|
|
4200
4176
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4201
4177
|
if (promptIdx === void 0) {
|
|
4202
|
-
|
|
4178
|
+
logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4203
4179
|
continue;
|
|
4204
4180
|
}
|
|
4205
4181
|
runEvalOptions.push({
|
|
@@ -4222,7 +4198,7 @@ var Evaluator = class {
|
|
|
4222
4198
|
options: testOptions
|
|
4223
4199
|
};
|
|
4224
4200
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4225
|
-
|
|
4201
|
+
logger.debug(`[Evaluator] Tracing check: env=${getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4226
4202
|
if (tracingEnabled) return {
|
|
4227
4203
|
...baseTest,
|
|
4228
4204
|
metadata: {
|
|
@@ -4255,27 +4231,27 @@ var Evaluator = class {
|
|
|
4255
4231
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4256
4232
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4257
4233
|
}
|
|
4258
|
-
if (
|
|
4259
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
4260
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors:
|
|
4234
|
+
if (state.resume && this.evalRecord.persisted) try {
|
|
4235
|
+
const { default: EvalResult } = await import("./evalResult-5xwYnECe.js");
|
|
4236
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4261
4237
|
const originalCount = runEvalOptions.length;
|
|
4262
4238
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4263
4239
|
const step = runEvalOptions[i];
|
|
4264
4240
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4265
4241
|
}
|
|
4266
4242
|
const skipped = originalCount - runEvalOptions.length;
|
|
4267
|
-
if (skipped > 0)
|
|
4243
|
+
if (skipped > 0) logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4268
4244
|
} catch (err) {
|
|
4269
|
-
|
|
4245
|
+
logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4270
4246
|
}
|
|
4271
4247
|
if (concurrency > 1) {
|
|
4272
4248
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4273
4249
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4274
4250
|
if (usesConversation) {
|
|
4275
|
-
|
|
4251
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan("_conversation")} variable is used.`);
|
|
4276
4252
|
concurrency = 1;
|
|
4277
4253
|
} else if (usesStoreOutputAs) {
|
|
4278
|
-
|
|
4254
|
+
logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4279
4255
|
concurrency = 1;
|
|
4280
4256
|
}
|
|
4281
4257
|
}
|
|
@@ -4306,14 +4282,14 @@ var Evaluator = class {
|
|
|
4306
4282
|
await this.evalRecord.addResult(row);
|
|
4307
4283
|
} catch (error) {
|
|
4308
4284
|
const resultSummary = summarizeEvaluateResultForLogging(row);
|
|
4309
|
-
|
|
4285
|
+
logger.error(`Error saving result: ${error} ${safeJsonStringify(resultSummary)}`);
|
|
4310
4286
|
}
|
|
4311
4287
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4312
4288
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4313
4289
|
if (typeof httpStatus === "number" && isNonTransientHttpStatus(httpStatus)) {
|
|
4314
4290
|
targetUnavailable = true;
|
|
4315
4291
|
targetErrorStatus = httpStatus;
|
|
4316
|
-
|
|
4292
|
+
logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4317
4293
|
targetErrorAbortController.abort();
|
|
4318
4294
|
break;
|
|
4319
4295
|
}
|
|
@@ -4333,7 +4309,7 @@ var Evaluator = class {
|
|
|
4333
4309
|
if (testSuite.derivedMetrics) {
|
|
4334
4310
|
const math = await import("mathjs");
|
|
4335
4311
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4336
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count"))
|
|
4312
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4337
4313
|
const evalContext = {
|
|
4338
4314
|
...metrics.namedScores,
|
|
4339
4315
|
__count: promptEvalCount
|
|
@@ -4348,7 +4324,7 @@ var Evaluator = class {
|
|
|
4348
4324
|
}
|
|
4349
4325
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4350
4326
|
} catch (error) {
|
|
4351
|
-
|
|
4327
|
+
logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4352
4328
|
}
|
|
4353
4329
|
}
|
|
4354
4330
|
}
|
|
@@ -4387,7 +4363,7 @@ var Evaluator = class {
|
|
|
4387
4363
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4388
4364
|
evalStep.provider.cleanup();
|
|
4389
4365
|
} catch (cleanupErr) {
|
|
4390
|
-
|
|
4366
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4391
4367
|
}
|
|
4392
4368
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4393
4369
|
}, timeoutMs);
|
|
@@ -4451,8 +4427,8 @@ var Evaluator = class {
|
|
|
4451
4427
|
}
|
|
4452
4428
|
};
|
|
4453
4429
|
const originalProgressCallback = this.options.progressCallback;
|
|
4454
|
-
const isWebUI = Boolean(
|
|
4455
|
-
|
|
4430
|
+
const isWebUI = Boolean(state.webUI);
|
|
4431
|
+
logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4456
4432
|
if (isCI() && !isWebUI) {
|
|
4457
4433
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4458
4434
|
ciProgressReporter.start();
|
|
@@ -4462,20 +4438,20 @@ var Evaluator = class {
|
|
|
4462
4438
|
if (isWebUI) {
|
|
4463
4439
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4464
4440
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4465
|
-
|
|
4441
|
+
logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4466
4442
|
} else if (progressBarManager) {
|
|
4467
4443
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4468
4444
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4469
4445
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4470
|
-
else
|
|
4446
|
+
else logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4471
4447
|
};
|
|
4472
4448
|
const serialRunEvalOptions = [];
|
|
4473
4449
|
const concurrentRunEvalOptions = [];
|
|
4474
4450
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4475
4451
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4476
4452
|
if (!this.options.silent) {
|
|
4477
|
-
if (serialRunEvalOptions.length > 0)
|
|
4478
|
-
if (concurrentRunEvalOptions.length > 0)
|
|
4453
|
+
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4454
|
+
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4479
4455
|
}
|
|
4480
4456
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4481
4457
|
try {
|
|
@@ -4484,7 +4460,7 @@ var Evaluator = class {
|
|
|
4484
4460
|
if (isWebUI) {
|
|
4485
4461
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4486
4462
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4487
|
-
|
|
4463
|
+
logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4488
4464
|
}
|
|
4489
4465
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4490
4466
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4499,9 +4475,9 @@ var Evaluator = class {
|
|
|
4499
4475
|
});
|
|
4500
4476
|
} catch (err) {
|
|
4501
4477
|
if (combinedAbortSignal.aborted) {
|
|
4502
|
-
if (evalTimedOut)
|
|
4478
|
+
if (evalTimedOut) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4503
4479
|
else if (!targetUnavailable) {
|
|
4504
|
-
|
|
4480
|
+
logger.info("Evaluation interrupted, saving progress...");
|
|
4505
4481
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4506
4482
|
if (progressBarManager) progressBarManager.stop();
|
|
4507
4483
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4531,10 +4507,10 @@ var Evaluator = class {
|
|
|
4531
4507
|
let compareCount = 0;
|
|
4532
4508
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4533
4509
|
compareCount++;
|
|
4534
|
-
if (isWebUI)
|
|
4510
|
+
if (isWebUI) logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4535
4511
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4536
4512
|
if (resultsToCompare.length === 0) {
|
|
4537
|
-
|
|
4513
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4538
4514
|
continue;
|
|
4539
4515
|
}
|
|
4540
4516
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4596,16 +4572,16 @@ var Evaluator = class {
|
|
|
4596
4572
|
}
|
|
4597
4573
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4598
4574
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4599
|
-
else if (!isWebUI)
|
|
4575
|
+
else if (!isWebUI) logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4600
4576
|
}
|
|
4601
4577
|
}
|
|
4602
4578
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4603
4579
|
if (maxScoreRowsCount > 0) {
|
|
4604
|
-
|
|
4580
|
+
logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4605
4581
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4606
4582
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4607
4583
|
if (resultsToCompare.length === 0) {
|
|
4608
|
-
|
|
4584
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4609
4585
|
continue;
|
|
4610
4586
|
}
|
|
4611
4587
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4613,7 +4589,7 @@ var Evaluator = class {
|
|
|
4613
4589
|
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4614
4590
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4615
4591
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4616
|
-
else if (!isWebUI)
|
|
4592
|
+
else if (!isWebUI) logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4617
4593
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4618
4594
|
const result = resultsToCompare[index];
|
|
4619
4595
|
const maxScoreGradingResult = {
|
|
@@ -4657,7 +4633,7 @@ var Evaluator = class {
|
|
|
4657
4633
|
progressBarManager.stop();
|
|
4658
4634
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4659
4635
|
} catch (cleanupErr) {
|
|
4660
|
-
|
|
4636
|
+
logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4661
4637
|
}
|
|
4662
4638
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4663
4639
|
if (evalTimedOut) {
|
|
@@ -4730,7 +4706,7 @@ var Evaluator = class {
|
|
|
4730
4706
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4731
4707
|
})));
|
|
4732
4708
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4733
|
-
|
|
4709
|
+
telemetry.record("eval_ran", {
|
|
4734
4710
|
numPrompts: prompts.length,
|
|
4735
4711
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4736
4712
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4778,26 +4754,26 @@ var Evaluator = class {
|
|
|
4778
4754
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4779
4755
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4780
4756
|
if (tracingEnabled) {
|
|
4781
|
-
|
|
4757
|
+
logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4782
4758
|
initializeOtel(getDefaultOtelConfig());
|
|
4783
4759
|
}
|
|
4784
4760
|
try {
|
|
4785
4761
|
return await this._runEvaluation();
|
|
4786
4762
|
} finally {
|
|
4787
4763
|
if (tracingEnabled) {
|
|
4788
|
-
|
|
4764
|
+
logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4789
4765
|
await flushOtel();
|
|
4790
4766
|
await shutdownOtel();
|
|
4791
4767
|
}
|
|
4792
4768
|
if (isOtlpReceiverStarted()) {
|
|
4793
|
-
|
|
4769
|
+
logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4794
4770
|
await sleep(3e3);
|
|
4795
4771
|
}
|
|
4796
4772
|
await stopOtlpReceiverIfNeeded();
|
|
4797
4773
|
await providerRegistry.shutdownAll();
|
|
4798
4774
|
if (this.rateLimitRegistry) {
|
|
4799
4775
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4800
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0)
|
|
4776
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4801
4777
|
totalRequests: m.totalRequests,
|
|
4802
4778
|
completedRequests: m.completedRequests,
|
|
4803
4779
|
failedRequests: m.failedRequests,
|
|
@@ -4810,14 +4786,14 @@ var Evaluator = class {
|
|
|
4810
4786
|
}
|
|
4811
4787
|
this.rateLimitRegistry?.dispose();
|
|
4812
4788
|
redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4813
|
-
|
|
4789
|
+
state.maxConcurrency = void 0;
|
|
4814
4790
|
}
|
|
4815
4791
|
}
|
|
4816
4792
|
};
|
|
4817
4793
|
function evaluate(testSuite, evalRecord, options) {
|
|
4818
4794
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4819
4795
|
}
|
|
4820
|
-
|
|
4821
4796
|
//#endregion
|
|
4822
4797
|
export { runEval as a, readAssertions as c, isAllowedPrompt as i, renderMetricName as l, formatVarsForDisplay as n, doesPromptRefMatch as o, generateVarCombinations as r, assertions_default as s, evaluate as t, runAssertions as u };
|
|
4823
|
-
|
|
4798
|
+
|
|
4799
|
+
//# sourceMappingURL=evaluator-BhoWwp5b.js.map
|