promptfoo 0.120.26 → 0.121.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/drizzle/0023_wooden_mandrill.sql +2 -0
- package/dist/drizzle/meta/0023_snapshot.json +1496 -0
- package/dist/drizzle/meta/_journal.json +7 -0
- package/dist/src/{ListApp-D3DG0F8h.js → ListApp-Du7YVwj5.js} +2 -4
- package/dist/src/accounts-BgNJDBE6.js +206 -0
- package/dist/src/{accounts-BzEY8H3v.cjs → accounts-Bx-x3bmW.cjs} +99 -80
- package/dist/src/{accounts-DHHiXsy6.js → accounts-CMqkzrVf.js} +61 -36
- package/dist/src/{accounts-R3gfCR_g.js → accounts-xrUGFA6n.js} +60 -35
- package/dist/src/{agentic-utils-D6_gzOUF.js → agentic-utils-BKIN5PKu.js} +9 -10
- package/dist/src/{agents-CwM7re15.cjs → agents-B0f4HICh.cjs} +37 -40
- package/dist/src/{agents-Cnph5GLD.js → agents-C-dDThPK.js} +37 -37
- package/dist/src/{agents-C7BiinFI.cjs → agents-CErsqg5U.cjs} +19 -27
- package/dist/src/{agents-v4cW_ZgC.js → agents-CVIn-Utx.js} +19 -22
- package/dist/src/{agents-GiUianme.js → agents-CXknwsFX.js} +37 -40
- package/dist/src/{agents-DETIQHqF.js → agents-DeH4Gu94.js} +21 -28
- package/dist/src/{agents-DYIT-hQy.js → agents-Dy2YpZpa.js} +38 -41
- package/dist/src/{agents-Cao4i7AX.js → agents-aF4-T121.js} +19 -30
- package/dist/src/{aimlapi-DMF6v_vb.js → aimlapi-BAGZDo5G.js} +16 -18
- package/dist/src/{aimlapi-CMJpKK-B.js → aimlapi-BNfTBexL.js} +15 -17
- package/dist/src/{aimlapi-DtSf1ykJ.js → aimlapi-DHRKlBEA.js} +15 -4
- package/dist/src/{aimlapi-DoGLcQW_.cjs → aimlapi-tg0Gkcvr.cjs} +15 -16
- package/dist/src/app/assets/index-BFCZg7hQ.js +439 -0
- package/dist/src/app/assets/index-NCn4eVBv.css +1 -0
- package/dist/src/app/assets/{vendor-charts-CYyo8R8v.js → vendor-charts-CCl15Imd.js} +1 -1
- package/dist/src/app/assets/{vendor-markdown-DSmzq4Jh.js → vendor-markdown-0tekx3KX.js} +1 -1
- package/dist/src/app/index.html +4 -34
- package/dist/src/{audio-DUH4q0Xq.js → audio-BRODU0UK.js} +7 -9
- package/dist/src/{audio-BWjyvHn9.cjs → audio-BWeaWovU.cjs} +6 -7
- package/dist/src/{audio-U580w8jM.js → audio-CHQ4r-RV.js} +6 -5
- package/dist/src/{audio-BrJBFN2b.js → audio-tf_NBjlC.js} +6 -8
- package/dist/src/base-B0tcrnq_.js +193 -0
- package/dist/src/base-B4QJRyFS.js +194 -0
- package/dist/src/base-DBtwl2FR.cjs +222 -0
- package/dist/src/base-fEDN28WM.js +193 -0
- package/dist/src/{blobs-kt8v3UyH.js → blobs-BAU-dXan.js} +9 -12
- package/dist/src/{blobs-C9J2mVgo.js → blobs-Bpg5rH6i.js} +9 -12
- package/dist/src/{blobs-673H0jCl.cjs → blobs-DvS-O6be.cjs} +34 -37
- package/dist/src/blobs-qTYm-1PY.js +236 -0
- package/dist/src/{cache-BLLayYEN.js → cache-8XhNqPKW.js} +64 -67
- package/dist/src/cache-Bbn1Nyrd.cjs +5 -0
- package/dist/src/cache-BwsMSda7.js +6 -0
- package/dist/src/{cache-mIszOnuz.js → cache-CG0SlR1d.js} +64 -66
- package/dist/src/{cache-7xULbvt3.cjs → cache-COish3-W.cjs} +114 -117
- package/dist/src/cache-D3eqDYGU.js +739 -0
- package/dist/src/{chat-Fl6TZJRS.cjs → chat-2K608PeQ.cjs} +20 -21
- package/dist/src/chat-BKm79wib.js +764 -0
- package/dist/src/{chat-XPN9YHhr.js → chat-CM_kyI8B.js} +20 -9
- package/dist/src/{chat-DIywASPG.js → chat-CRWNNq73.js} +49 -49
- package/dist/src/{chat-C8Ei4f87.js → chat-CznLWr_D.js} +49 -49
- package/dist/src/{chat-CgyGj2hC.js → chat-DHMH-N64.js} +20 -22
- package/dist/src/{chat-C4zqjObh.cjs → chat-DaqekjFr.cjs} +69 -69
- package/dist/src/{chat-Cpz3O-Xl.js → chat-DxysjBvt.js} +21 -23
- package/dist/src/{chatkit-Dpxrq4eD.js → chatkit-65VXf5SR.js} +58 -58
- package/dist/src/{chatkit-DIrJX8xk.js → chatkit-Be-Q-a9F.js} +58 -60
- package/dist/src/{chatkit-DEls11hE.js → chatkit-BxFvW8KY.js} +58 -60
- package/dist/src/{chatkit-e25Ziu17.cjs → chatkit-DKyPi1Gs.cjs} +58 -60
- package/dist/src/chunk-DEq-mXcV.js +15 -0
- package/dist/src/chunk-DRamLcfz.js +16 -0
- package/dist/src/{claude-agent-sdk-6-xTaLwM.js → claude-agent-sdk-BLTu0WBO.js} +45 -31
- package/dist/src/{claude-agent-sdk-BzxF6NIJ.cjs → claude-agent-sdk-CJH22shf.cjs} +44 -29
- package/dist/src/{claude-agent-sdk-CmkTnKGH.js → claude-agent-sdk-D6_k9FKA.js} +45 -33
- package/dist/src/{claude-agent-sdk-rXCBLK_o.js → claude-agent-sdk-Dy5lT-Tx.js} +46 -21
- package/dist/src/{cloud-BMbRVJFw.js → cloud-Bc9526yV.js} +32 -12
- package/dist/src/cloud-DmE0EwsY.js +4 -0
- package/dist/src/{cloudflare-ai-CUg4BTcj.js → cloudflare-ai-C9r2sRhw.js} +16 -18
- package/dist/src/{cloudflare-ai-Z9X219gp.js → cloudflare-ai-CWWJCRim.js} +16 -4
- package/dist/src/{cloudflare-ai-BAQ0u_dg.cjs → cloudflare-ai-ClWSdor4.cjs} +16 -17
- package/dist/src/{cloudflare-ai-CobxMTR4.js → cloudflare-ai-ICsOuD-z.js} +17 -19
- package/dist/src/{cloudflare-gateway-C0sgfr_z.cjs → cloudflare-gateway-C2_-KG5o.cjs} +21 -22
- package/dist/src/{cloudflare-gateway-_itGuXry.js → cloudflare-gateway-D6O7AlYb.js} +23 -23
- package/dist/src/{cloudflare-gateway-D2_yi-Fh.js → cloudflare-gateway-D6xFc5pa.js} +21 -25
- package/dist/src/{cloudflare-gateway-Djf3F3_H.js → cloudflare-gateway-pXGHxJ47.js} +26 -14
- package/dist/src/{codex-sdk-ibXwdglL.js → codex-sdk-C6UMlxwV.js} +49 -32
- package/dist/src/{codex-sdk-BASDNkIl.js → codex-sdk-DUwKWezN.js} +49 -30
- package/dist/src/{codex-sdk-dSnGdgIp.js → codex-sdk-GGAw0qbD.js} +49 -32
- package/dist/src/{codex-sdk-wTEpMM_X.cjs → codex-sdk-fAO0c3yA.cjs} +49 -32
- package/dist/src/{cometapi-B01btbfb.js → cometapi-BasUi7-_.js} +17 -19
- package/dist/src/{cometapi-DHUAH6nK.js → cometapi-Bbjp5V4x.js} +16 -4
- package/dist/src/{cometapi-ChAaRjg5.js → cometapi-DkXrKi5z.js} +21 -24
- package/dist/src/{cometapi-JbvOJSCO.cjs → cometapi-vY6aDZgo.cjs} +21 -22
- package/dist/src/{completion-D9_MDlnd.js → completion-6Mx_iXxK.js} +11 -13
- package/dist/src/{completion-BBJ6zmG3.js → completion-C5rtR_9P.js} +11 -13
- package/dist/src/{completion-DDyL3Cb2.cjs → completion-CDOouNzq.cjs} +21 -23
- package/dist/src/completion-C_P3ypkJ.js +120 -0
- package/dist/src/createHash-CTQmL3G2.js +15 -0
- package/dist/src/createHash-CfZSc0b4.cjs +27 -0
- package/dist/src/createHash-Da8fMwqB.js +16 -0
- package/dist/src/createHash-DmPQkvBh.js +15 -0
- package/dist/src/{docker-JAAubMw3.js → docker-5KcG-_86.js} +18 -20
- package/dist/src/{docker-Ckw-j7Rr.cjs → docker-BwsKwxFs.cjs} +18 -19
- package/dist/src/{docker-vnOg96gi.js → docker-CZnqU1XV.js} +18 -7
- package/dist/src/{docker-BuButc4D.js → docker-DzxyDPIj.js} +19 -21
- package/dist/src/entrypoint.js +2 -3
- package/dist/src/{errors-DnGCbnx8.js → errors-P6ll7XSJ.js} +2 -2
- package/dist/src/{esm-CYhseqj4.js → esm-C03C-mv3.js} +17 -20
- package/dist/src/{esm-rDtG_2rg.js → esm-CaIwzWR5.js} +18 -21
- package/dist/src/esm-Cd1AjG1D.js +379 -0
- package/dist/src/{esm-BQkx5roy.cjs → esm-CnNt7sI4.cjs} +47 -49
- package/dist/src/eval-17JizQIv.js +15 -0
- package/dist/src/{eval-CYrbG57o.js → eval-DmFyWU7i.js} +49 -55
- package/dist/src/{evalResult-COsVttMA.js → evalResult-CDQiuUuf.js} +16 -12
- package/dist/src/{evalResult-6JaUIStC.js → evalResult-CTG2AHOS.js} +10 -11
- package/dist/src/evalResult-Cqj8pldJ.js +12 -0
- package/dist/src/{evalResult-DlRfu_Rq.cjs → evalResult-Dap2CekP.cjs} +20 -21
- package/dist/src/evalResult-DvcJAWJU.cjs +10 -0
- package/dist/src/evalResult-Hftn-S_i.js +10 -0
- package/dist/src/evaluator-B2CFNt-P.js +36 -0
- package/dist/src/{evaluator-3EJCMTs0.js → evaluator-DPFRbFIL.js} +210 -232
- package/dist/src/{extractor-LSYjrhK0.js → extractor-CFG6bcWJ.js} +23 -38
- package/dist/src/{extractor-DbhlYEeo.cjs → extractor-DX36oYEv.cjs} +37 -64
- package/dist/src/{extractor-Hs7la_19.js → extractor-M67RUtg6.js} +23 -38
- package/dist/src/extractor-YMU_Gvt8.js +374 -0
- package/dist/src/fetch-4M3YRaqL.js +780 -0
- package/dist/src/{fetch-18MuNu9i.js → fetch-60Gzydls.js} +60 -46
- package/dist/src/{fetch-SRsE6Ctl.js → fetch-BMv0O527.js} +41 -35
- package/dist/src/{fetch-ZMn_oemb.cjs → fetch-BxUk8odA.cjs} +268 -279
- package/dist/src/fetch-KV5kNASw.js +5 -0
- package/dist/src/{fileExtensions-ePDqouxn.js → fileExtensions-DnqA1y9x.js} +2 -2
- package/dist/src/{fileExtensions-BpuMmaFL.js → fileExtensions-Ds-foDzt.js} +2 -2
- package/dist/src/fileExtensions-LcDYkU4v.js +85 -0
- package/dist/src/{fileExtensions-DkJYkWUy.cjs → fileExtensions-bYh77CN8.cjs} +27 -28
- package/dist/src/{formatDuration-Doo0xq-z.js → formatDuration-DgBVMN65.js} +2 -2
- package/dist/src/{genaiTracer-Ce19n68P.js → genaiTracer-70Z8BIuV.js} +2 -3
- package/dist/src/{genaiTracer-CqNnnXrE.js → genaiTracer-C1rxGO8Q.js} +2 -3
- package/dist/src/genaiTracer-D3fD9dNV.js +256 -0
- package/dist/src/{genaiTracer-CQlpZkrp.cjs → genaiTracer-DN4dQywX.cjs} +13 -14
- package/dist/src/graders-Bu0H9nXi.js +32 -0
- package/dist/src/{graders-BaMCwIKp.js → graders-CHO8EPM4.js} +385 -417
- package/dist/src/graders-Cfhkvx-e.js +34 -0
- package/dist/src/{graders-QsALpIdy.js → graders-CpdqD9PI.js} +385 -417
- package/dist/src/graders-DClJVpGP.cjs +32 -0
- package/dist/src/{graders-DzUUnUjC.cjs → graders-DOXycdlG.cjs} +721 -753
- package/dist/src/graders-DcnJsrMO.js +32 -0
- package/dist/src/graders-R9rYUM0d.js +13466 -0
- package/dist/src/{image-BiEVdpdP.js → image-BmEZqVmk.js} +57 -18
- package/dist/src/{image-mhAGP07h.js → image-CBBVXWuT.js} +57 -18
- package/dist/src/{image-D10zEe1f.cjs → image-CDLQOcqT.cjs} +6 -7
- package/dist/src/{image-COCWy5dX.js → image-DJEvKveK.js} +6 -5
- package/dist/src/{image-C3BjJUAU.cjs → image-DTedmQPg.cjs} +77 -32
- package/dist/src/{image-DB4sHxdJ.js → image-gvmivTEe.js} +7 -9
- package/dist/src/image-pAX56tPG.js +257 -0
- package/dist/src/{image-BXt_7u0v.js → image-tL5hIOFh.js} +6 -8
- package/dist/src/index.cjs +696 -693
- package/dist/src/index.d.cts +113 -10
- package/dist/src/index.d.ts +113 -6
- package/dist/src/index.js +657 -658
- package/dist/src/{interactiveCheck-DU-MAhp5.js → interactiveCheck-BgLZUIt3.js} +7 -8
- package/dist/src/{invariant-DT20jrBd.js → invariant-BtWWVVhl.js} +2 -2
- package/dist/src/{invariant-1pAf2CD1.js → invariant-Ddh24eXh.js} +2 -2
- package/dist/src/{invariant-CKcJAQ6M.cjs → invariant-kfQ8Bu82.cjs} +7 -8
- package/dist/src/invariant-vgHWClmd.js +25 -0
- package/dist/src/{knowledgeBase-DotRBzUE.js → knowledgeBase-CLJybhnF.js} +19 -34
- package/dist/src/{knowledgeBase-XJQ0Qyez.js → knowledgeBase-CoU-UQBg.js} +17 -41
- package/dist/src/{knowledgeBase-CMvMlLZR.js → knowledgeBase-DjWPVqSb.js} +17 -43
- package/dist/src/{knowledgeBase-Bnb00xKs.cjs → knowledgeBase-wkxuRFhA.cjs} +17 -40
- package/dist/src/{litellm-CHrRmPAe.js → litellm-B9Hysuri.js} +16 -18
- package/dist/src/{litellm-CrLJrPIm.js → litellm-CTfa0hqi.js} +15 -17
- package/dist/src/{litellm-BrnZhMcL.cjs → litellm-NYpQ8RQu.cjs} +15 -16
- package/dist/src/{litellm-BECdjOTx.js → litellm-ePxtr9F1.js} +15 -4
- package/dist/src/{logger-w8Ozp0Td.js → logger-CT3IKMKA.js} +24 -41
- package/dist/src/{logger-BdZ-IqBc.cjs → logger-Cp1GPUjj.cjs} +166 -192
- package/dist/src/logger-DLcq4dWf.js +713 -0
- package/dist/src/{logger-BotXmWKW.js → logger-KkObSCzq.js} +27 -43
- package/dist/src/{luma-ray-C0RkI3lt.cjs → luma-ray-B0GGNRc1.cjs} +20 -21
- package/dist/src/{luma-ray-C-w6EsJm.js → luma-ray-BE2mOt6N.js} +20 -13
- package/dist/src/{luma-ray-BOeX-h0M.js → luma-ray-BW9IRGIc.js} +22 -21
- package/dist/src/{luma-ray-DgKLS0BF.js → luma-ray-Cm1KZBhs.js} +20 -23
- package/dist/src/main.js +1985 -2055
- package/dist/src/{messages-DXV3Qh8_.cjs → messages-1JrJs91T.cjs} +35 -34
- package/dist/src/{messages-D61tPFQo.js → messages-1x9atZmP.js} +25 -24
- package/dist/src/{messages-CDZYGNlS.js → messages-BLbWdsyt.js} +25 -24
- package/dist/src/messages-D8EA0oDc.js +240 -0
- package/dist/src/{meteor-P2rUE-Uz.js → meteor-44VjEACX.js} +3 -4
- package/dist/src/{meteor-SLNTgmXm.js → meteor-D-SotUw9.js} +3 -4
- package/dist/src/{meteor-odmwVbyG.cjs → meteor-DLZZ3osF.cjs} +3 -4
- package/dist/src/{meteor-Dj8cTkU_.js → meteor-DUiCJRC-.js} +3 -4
- package/dist/src/modelslab-C1OLRmVX.js +166 -0
- package/dist/src/modelslab-CqXBy3U8.js +168 -0
- package/dist/src/modelslab-DcOSFwKh.cjs +166 -0
- package/dist/src/modelslab-X5-4LroM.js +166 -0
- package/dist/src/{nova-reel-C2LFfVTf.js → nova-reel-BgS1ZWuK.js} +20 -13
- package/dist/src/{nova-reel-DtCjbD5O.js → nova-reel-D2ZkOSyr.js} +22 -21
- package/dist/src/{nova-reel-D9FXq3Mt.cjs → nova-reel-D9xfaMBs.cjs} +20 -21
- package/dist/src/{nova-reel-Bk5npr2q.js → nova-reel-DihqLeol.js} +20 -23
- package/dist/src/{nova-sonic-BoRSY_U6.cjs → nova-sonic-DVu3mMIy.cjs} +30 -31
- package/dist/src/{nova-sonic-D_qERM-K.js → nova-sonic-DezhVUYT.js} +30 -26
- package/dist/src/{nova-sonic-CgaWLDM1.js → nova-sonic-P-CdUMlV.js} +30 -31
- package/dist/src/{nova-sonic-BXRfQyF-.js → nova-sonic-Q3BOJeig.js} +31 -32
- package/dist/src/{openai-Bigwjgo1.cjs → openai-Cuif0GEt.cjs} +8 -9
- package/dist/src/{openai-Dz3surb_.js → openai-DElQ-fPX.js} +3 -4
- package/dist/src/{openai-CT5fwbve.js → openai-DhbB7eWK.js} +3 -4
- package/dist/src/openai-j-sE2O7r.js +44 -0
- package/dist/src/{openclaw-dHLcXUWZ.js → openclaw-BiSZPL7J.js} +20 -14
- package/dist/src/{openclaw-CpPrXwf6.js → openclaw-Bv1DINsX.js} +20 -27
- package/dist/src/{openclaw-B6XY2kUf.js → openclaw-D1D_ej1z.js} +21 -28
- package/dist/src/{openclaw-DDSfq5fp.cjs → openclaw-DAfWQn-o.cjs} +33 -39
- package/dist/src/opencode-sdk-C7m-wRfI.js +560 -0
- package/dist/src/opencode-sdk-CfaLN8PY.cjs +564 -0
- package/dist/src/opencode-sdk-D95s6SnR.js +562 -0
- package/dist/src/opencode-sdk-DxUPkLT7.js +560 -0
- package/dist/src/{otlpReceiver-DmRb0NBj.js → otlpReceiver--AIRW_S4.js} +53 -51
- package/dist/src/{otlpReceiver-Dg817agV.js → otlpReceiver-Bn5wGB1v.js} +53 -55
- package/dist/src/{otlpReceiver-B6Xo4KZM.cjs → otlpReceiver-Diec4cln.cjs} +53 -55
- package/dist/src/{otlpReceiver-BO0rbDzh.js → otlpReceiver-g3ByGaXs.js} +53 -55
- package/dist/src/{providerRegistry-Xf0qdqGQ.js → providerRegistry-B0RUOLI_.js} +7 -8
- package/dist/src/{providerRegistry-wCWd7sKQ.js → providerRegistry-CD8MEar9.js} +7 -8
- package/dist/src/{providerRegistry-lc7a7utN.cjs → providerRegistry-Civky8Ar.cjs} +12 -13
- package/dist/src/providerRegistry-DM8rZYol.js +45 -0
- package/dist/src/providers-B3HvufyI.js +33246 -0
- package/dist/src/{providers-BiNq_Iyc.js → providers-BKRJTjBz.js} +1743 -1795
- package/dist/src/providers-C1rOSHiR.js +32 -0
- package/dist/src/{providers-BlEhY5mi.js → providers-CFLy1_ji.js} +1750 -1802
- package/dist/src/{providers-BNKVY53V.cjs → providers-CFu-TZl-.cjs} +2111 -2163
- package/dist/src/providers-CxmDwEFf.cjs +31 -0
- package/dist/src/providers-Dodakqr0.js +30 -0
- package/dist/src/providers-GIQ2TcsA.js +30 -0
- package/dist/src/{pythonUtils-r1uBuA0n.js → pythonUtils-C3py6GC1.js} +18 -19
- package/dist/src/{pythonUtils-DZ6EbdY4.cjs → pythonUtils-CTU3Y3lw.cjs} +42 -43
- package/dist/src/{pythonUtils-vMlk9Qp5.js → pythonUtils-D5nxkQ0P.js} +18 -19
- package/dist/src/pythonUtils-D6fwaDSg.js +249 -0
- package/dist/src/quiverai-C2jVwbH1.js +213 -0
- package/dist/src/quiverai-CI6gYJVI.js +213 -0
- package/dist/src/quiverai-CLkWkyZc.cjs +213 -0
- package/dist/src/quiverai-MHSxbmmZ.js +215 -0
- package/dist/src/{render-CAZvKKkB.js → render-Drod8m7K.js} +4 -5
- package/dist/src/{responses-DLLjADw5.js → responses-BKqJmhhc.js} +34 -27
- package/dist/src/{responses-TsdODUpm.js → responses-CGw0DCzh.js} +34 -27
- package/dist/src/responses-jxdehPkC.js +660 -0
- package/dist/src/{responses-zOtKtnY_.cjs → responses-tD4Bd4dc.cjs} +49 -42
- package/dist/src/rubyUtils-BUHu6PhO.js +5 -0
- package/dist/src/{rubyUtils-Cs35SDYa.js → rubyUtils-BUVePouc.js} +27 -20
- package/dist/src/rubyUtils-BcuGX77l.js +222 -0
- package/dist/src/{rubyUtils-BtjlqyXt.js → rubyUtils-Boc4HZzX.js} +18 -19
- package/dist/src/rubyUtils-CP42kMvq.cjs +4 -0
- package/dist/src/{rubyUtils-DCVaJ3mc.cjs → rubyUtils-DhCAlxZr.cjs} +48 -50
- package/dist/src/{sagemaker-Du4LIR97.js → sagemaker-BK4Zb993.js} +75 -70
- package/dist/src/{sagemaker-CLdUAv5z.js → sagemaker-BfiWTmvn.js} +77 -77
- package/dist/src/{sagemaker-DwNnEVYt.cjs → sagemaker-CcQHM1jV.cjs} +75 -76
- package/dist/src/{sagemaker-BcgLu0U4.js → sagemaker-D2Q1c-sD.js} +75 -79
- package/dist/src/{scanner-Dyw21Wg_.js → scanner-J8CA3LsV.js} +149 -122
- package/dist/src/server/index.js +5620 -67302
- package/dist/src/{server-CgUQ25qW.cjs → server-B0PPuDw-.cjs} +57 -67
- package/dist/src/server-B1vi21hA.js +7 -0
- package/dist/src/{server-CbMTRQkg.js → server-BC7XJFgr.js} +19 -24
- package/dist/src/server-Cm9Kai_h.cjs +5 -0
- package/dist/src/{server-DWmZLfCy.js → server-DbFphssR.js} +26 -29
- package/dist/src/server-OAs3nBRT.js +229 -0
- package/dist/src/{signal-Bl32q42d.js → signal-BOTbd53Z.js} +9 -11
- package/dist/src/{slack-BtMkB6xP.cjs → slack-BmVAVGaK.cjs} +7 -8
- package/dist/src/{slack-OZYxoVON.js → slack-DCUPTzS2.js} +8 -8
- package/dist/src/{slack-DPqj42Ts.js → slack-DOdy_kyv.js} +7 -8
- package/dist/src/{slack-BfdBx2tO.js → slack-DXMKtA-f.js} +7 -9
- package/dist/src/store-BNmZ1KAz.cjs +5 -0
- package/dist/src/{store-BqwfFEyF.cjs → store-BSc-TF2w.cjs} +44 -45
- package/dist/src/store-BltJg2cd.js +6 -0
- package/dist/src/{store-D4gdn9ih.js → store-D1tv90v3.js} +34 -35
- package/dist/src/{store-2ocbYY9D.js → store-DQLEjuEO.js} +40 -36
- package/dist/src/store-Ub2vaGJ1.js +228 -0
- package/dist/src/{tables-D-NSwNIb.js → tables-5EvT_Bwn.js} +23 -23
- package/dist/src/{tables-B9E1kRp-.cjs → tables-C7K-XKWp.cjs} +93 -93
- package/dist/src/{tables-C7TT2XVn.js → tables-D36WTqKX.js} +25 -25
- package/dist/src/tables-xKANLRBD.js +288 -0
- package/dist/src/telemetry-5BCRNBbe.cjs +5 -0
- package/dist/src/{telemetry-DZ_7PaVq.js → telemetry-C15ziL8u.js} +17 -14
- package/dist/src/{telemetry-BXyVqyAg.js → telemetry-C2YDkUQH.js} +11 -13
- package/dist/src/{telemetry-D0_yFdtU.cjs → telemetry-CbrnxHp_.cjs} +21 -24
- package/dist/src/telemetry-D4W5hboe.js +7 -0
- package/dist/src/telemetry-DMb2Mpfm.js +171 -0
- package/dist/src/{text-Dm78AVGG.js → text-B_UCRPp2.js} +2 -2
- package/dist/src/{text-DF2hMKdg.cjs → text-CW1cyrwj.cjs} +12 -13
- package/dist/src/{text-DgMr_tiM.js → text-Db-Wt2u2.js} +2 -2
- package/dist/src/text-TIv0QYnd.js +22 -0
- package/dist/src/{tokenUsageUtils-FZd5O_4A.js → tokenUsageUtils-BDGe-iyI.js} +2 -2
- package/dist/src/{tokenUsageUtils-DmZSD2eU.js → tokenUsageUtils-DflFMjS0.js} +2 -2
- package/dist/src/tokenUsageUtils-NYT-WKS6.js +138 -0
- package/dist/src/{tokenUsageUtils-CXhxVj72.cjs → tokenUsageUtils-bVa1ga6f.cjs} +32 -33
- package/dist/src/{transcription-FNIz3YOe.cjs → transcription-CL78qbOU.cjs} +14 -15
- package/dist/src/{transcription-C-M81iDA.js → transcription-DAtxHhAM.js} +14 -7
- package/dist/src/{transcription-CYuY5sFO.js → transcription-LNZTNUUL.js} +14 -16
- package/dist/src/{transcription-Ch7S-LWw.js → transcription-QHh3AH6Z.js} +15 -17
- package/dist/src/{transform-CoP2bJ7P.js → transform-Cgi24fJ7.js} +94 -66
- package/dist/src/{transform-Kd6u-oNm.cjs → transform-CzK1Q0zl.cjs} +24 -26
- package/dist/src/{transform-D8dILpfZ.js → transform-DECvGmzp.js} +15 -13
- package/dist/src/{transform-DMaxQwDx.js → transform-DGLazrMm.js} +94 -66
- package/dist/src/transform-DGxXocjk.js +1506 -0
- package/dist/src/{transform-ivxEY4f7.cjs → transform-DOcQeLld.cjs} +234 -206
- package/dist/src/transform-DTGDnAzW.js +6 -0
- package/dist/src/{transform-CqTFr7KR.js → transform-DilY9wbS.js} +10 -12
- package/dist/src/transform-aa6tmVpZ.js +216 -0
- package/dist/src/transform-m3qNw4KP.cjs +5 -0
- package/dist/src/{transformersAvailability-DEU2naS1.js → transformersAvailability-CEVM2GNQ.js} +2 -2
- package/dist/src/{transformersAvailability-Bkep3ka7.cjs → transformersAvailability-CwayUSlh.cjs} +2 -3
- package/dist/src/{transformersAvailability-DwmezkVe.js → transformersAvailability-D6c6ROpT.js} +2 -2
- package/dist/src/{types-t52w-XsS.js → types-CH3Ge2sE.js} +103 -92
- package/dist/src/{types-DMVjYLpx.js → types-CLKiCBW3.js} +98 -91
- package/dist/src/types-CN_TZ2GJ.js +3260 -0
- package/dist/src/{types-BIfttHrT.cjs → types-LJ0r3wbR.cjs} +573 -566
- package/dist/src/util-5cB-L7U3.js +1430 -0
- package/dist/src/util-6-GqIvzS.js +599 -0
- package/dist/src/{util-vjscpUzy.js → util-B7T3SiBS.js} +5 -6
- package/dist/src/{util-Cl0zfT3V.js → util-Betm42rL.js} +44 -17
- package/dist/src/{util-CUEt0Vum.js → util-C-PPYSMq.js} +44 -17
- package/dist/src/{util-DkFTvieG.cjs → util-CchiqXh_.cjs} +35 -36
- package/dist/src/{util-mJ58qbbw.js → util-DaWTWKBK.js} +5 -6
- package/dist/src/{util-C08Kns6-.cjs → util-Db0a0AFH.cjs} +89 -62
- package/dist/src/{util-DiCePfDu.js → util-Dlz_Wvgm.js} +102 -53
- package/dist/src/{util-BSh4a_Q8.js → util-YT5HPZaS.js} +102 -53
- package/dist/src/{util-DUYOvxAy.cjs → util-Yz-1aEhW.cjs} +274 -219
- package/dist/src/util-ZZH-3QZz.js +293 -0
- package/dist/src/{utils-DFaZa6Rf.cjs → utils-Cz9qXqII.cjs} +32 -35
- package/dist/src/{utils-CVzb4YiI.js → utils-XiOAgly5.js} +4 -7
- package/dist/src/utils-dLokC-eR.js +94 -0
- package/dist/src/{utils-JaY9veb5.js → utils-f2-Moju7.js} +4 -7
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +59 -53
- package/dist/src/app/assets/index-BOgkICuY.css +0 -1
- package/dist/src/app/assets/index-CSgqn_Vd.js +0 -428
- package/dist/src/app/tsconfig.app.tsbuildinfo +0 -1
- package/dist/src/base-BaXmtXYp.js +0 -107
- package/dist/src/base-Dtp8b4_N.js +0 -106
- package/dist/src/base-f71xxWai.cjs +0 -111
- package/dist/src/cache-BUPcq0Ad.js +0 -6
- package/dist/src/cache-CVfRb-HD.cjs +0 -6
- package/dist/src/cache-O4EuX2JV.js +0 -8
- package/dist/src/chunk-DHDDz29n.js +0 -22
- package/dist/src/chunk-FhC4c-0y.js +0 -21
- package/dist/src/cloud-CZ4hytdm.js +0 -5
- package/dist/src/eval-CKHWqG9f.js +0 -16
- package/dist/src/evalResult-CxTP-LMm.cjs +0 -11
- package/dist/src/evalResult-CzLURDcP.js +0 -13
- package/dist/src/evalResult-DyttNQ_G.js +0 -11
- package/dist/src/evaluator-0PvfeBYh.js +0 -38
- package/dist/src/fetch-Bi0o-fdp.js +0 -4
- package/dist/src/fetch-CMptBDVg.cjs +0 -4
- package/dist/src/fetch-DAZkv3gV.js +0 -6
- package/dist/src/graders-BCytzXrb.js +0 -34
- package/dist/src/graders-CGZQShfJ.cjs +0 -33
- package/dist/src/graders-spkuVC-E.js +0 -36
- package/dist/src/opencode-sdk-CImWVqy9.js +0 -382
- package/dist/src/opencode-sdk-CuCztr4P.js +0 -380
- package/dist/src/opencode-sdk-DhcfRbBH.js +0 -376
- package/dist/src/opencode-sdk-mqF-Oj3f.cjs +0 -383
- package/dist/src/providers-BMZZmPBJ.cjs +0 -32
- package/dist/src/providers-CQQrNaJk.js +0 -32
- package/dist/src/providers-Ck8HyrC-.js +0 -34
- package/dist/src/quiverai-BNfIwKCO.cjs +0 -54
- package/dist/src/quiverai-BQigKdIH.js +0 -57
- package/dist/src/quiverai-Bfy2WnE2.js +0 -55
- package/dist/src/quiverai-CedIP0PJ.js +0 -43
- package/dist/src/rubyUtils-D7--T12C.js +0 -6
- package/dist/src/rubyUtils-DRRiMFV2.js +0 -5
- package/dist/src/rubyUtils-vb8OYFC-.cjs +0 -5
- package/dist/src/server-BUbS0Qfh.js +0 -6
- package/dist/src/server-XpGXFHkS.cjs +0 -6
- package/dist/src/server-gfOx5Zrk.js +0 -8
- package/dist/src/store-5u2yriTV.js +0 -7
- package/dist/src/store-D_lq_8oQ.js +0 -6
- package/dist/src/store-m5KT6Ly7.cjs +0 -6
- package/dist/src/telemetry-5RHFoCJh.js +0 -6
- package/dist/src/telemetry-Do8wMnA-.js +0 -8
- package/dist/src/telemetry-LojxPoFq.cjs +0 -6
- package/dist/src/transform-8eGmaH-7.js +0 -7
- package/dist/src/transform-BRVvWaG4.cjs +0 -6
- package/dist/src/transform-GybT0X0u.js +0 -8
- package/dist/src/transformersAvailability-DkAWaK5B.js +0 -35
|
@@ -1,28 +1,28 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { C as getEnvString, E as isCI, O as
|
|
3
|
-
import {
|
|
4
|
-
import { t as invariant } from "./invariant-
|
|
5
|
-
import { r as
|
|
6
|
-
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-
|
|
7
|
-
import { c as promptYesNo } from "./server-
|
|
8
|
-
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-
|
|
9
|
-
import { o as getCache } from "./cache-
|
|
10
|
-
import { n as isNonTransientHttpStatus } from "./errors-
|
|
11
|
-
import { i as isJavascriptFile } from "./fileExtensions-
|
|
12
|
-
import { E as parseFileUrl,
|
|
13
|
-
import { r as runPython } from "./pythonUtils-
|
|
14
|
-
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-
|
|
15
|
-
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as DefaultSuggestionsProvider, dt as getFinalTest, et as matchesSelectBest, ft as loadFromJavaScriptFile, it as getDefaultProviders, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, mt as resolveContext, n as getGraderById, nt as selectMaxScore, pt as processFileReference, q as matchesContextRelevance, tt as matchesSimilarity, ut as coerceString, z as fail } from "./graders-
|
|
16
|
-
import { i as generateIdFromPrompt } from "./utils-
|
|
17
|
-
import { t as
|
|
18
|
-
import { t as
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import {
|
|
22
|
-
import { n as getTraceStore } from "./store-
|
|
23
|
-
import { t as providerRegistry } from "./providerRegistry-
|
|
24
|
-
import { n as runRuby } from "./rubyUtils-
|
|
25
|
-
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-
|
|
2
|
+
import { C as getEnvString, E as isCI, O as state, S as getEnvInt, T as getMaxEvalTimeMs, _ as summarizeEvaluateResultForLogging, b as getEnvBool, f as extractJsonObjects, g as safeJsonStringify, o as logger, p as getAjv, w as getEvalTimeoutMs } from "./logger-KkObSCzq.js";
|
|
3
|
+
import { N as VERSION, P as FILE_METADATA_KEY, g as isPromptfooSampleTarget, l as sleep, r as fetchWithRetries, y as parseChatPrompt } from "./fetch-BMv0O527.js";
|
|
4
|
+
import { t as invariant } from "./invariant-BtWWVVhl.js";
|
|
5
|
+
import { r as telemetry } from "./telemetry-C2YDkUQH.js";
|
|
6
|
+
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-CH3Ge2sE.js";
|
|
7
|
+
import { c as promptYesNo } from "./server-DbFphssR.js";
|
|
8
|
+
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-CFLy1_ji.js";
|
|
9
|
+
import { o as getCache } from "./cache-CG0SlR1d.js";
|
|
10
|
+
import { n as isNonTransientHttpStatus } from "./errors-P6ll7XSJ.js";
|
|
11
|
+
import { i as isJavascriptFile } from "./fileExtensions-Ds-foDzt.js";
|
|
12
|
+
import { E as parseFileUrl, I as isAnthropicProvider, L as isGoogleProvider, R as isOpenAiProvider, T as loadFunction, g as maybeLoadToolsFromExternalFile, w as getNunjucksEngine, z as isProviderAllowed } from "./util-YT5HPZaS.js";
|
|
13
|
+
import { r as runPython } from "./pythonUtils-C3py6GC1.js";
|
|
14
|
+
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-DilY9wbS.js";
|
|
15
|
+
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as DefaultSuggestionsProvider, dt as getFinalTest, et as matchesSelectBest, ft as loadFromJavaScriptFile, it as getDefaultProviders, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, mt as resolveContext, n as getGraderById, nt as selectMaxScore, pt as processFileReference, q as matchesContextRelevance, tt as matchesSimilarity, ut as coerceString, z as fail } from "./graders-CHO8EPM4.js";
|
|
16
|
+
import { i as generateIdFromPrompt } from "./utils-f2-Moju7.js";
|
|
17
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-CRWNNq73.js";
|
|
18
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-DflFMjS0.js";
|
|
19
|
+
import { m as validateFunctionCall } from "./transform-Cgi24fJ7.js";
|
|
20
|
+
import { l as validateFunctionCall$1 } from "./util-C-PPYSMq.js";
|
|
21
|
+
import { t as extractAndStoreBinaryData } from "./extractor-CFG6bcWJ.js";
|
|
22
|
+
import { n as getTraceStore } from "./store-D1tv90v3.js";
|
|
23
|
+
import { t as providerRegistry } from "./providerRegistry-B0RUOLI_.js";
|
|
24
|
+
import { n as runRuby } from "./rubyUtils-Boc4HZzX.js";
|
|
25
|
+
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-BOTbd53Z.js";
|
|
26
26
|
import chalk from "chalk";
|
|
27
27
|
import fs, { createWriteStream } from "fs";
|
|
28
28
|
import path from "path";
|
|
@@ -43,7 +43,6 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
|
43
43
|
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
44
44
|
import { BatchSpanProcessor, NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
45
45
|
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
|
|
46
|
-
|
|
47
46
|
//#region src/external/matchers/conversationRelevancyTemplate.ts
|
|
48
47
|
var ConversationRelevancyTemplate = class {
|
|
49
48
|
static generateVerdicts(slidingWindow) {
|
|
@@ -115,7 +114,6 @@ ${JSON.stringify(irrelevancies, null, 2)}
|
|
|
115
114
|
JSON:`;
|
|
116
115
|
}
|
|
117
116
|
};
|
|
118
|
-
|
|
119
117
|
//#endregion
|
|
120
118
|
//#region src/external/matchers/deepeval.ts
|
|
121
119
|
const nunjucks$1 = getNunjucksEngine(void 0, false, true);
|
|
@@ -165,7 +163,6 @@ async function matchesConversationRelevance(messages, threshold, vars, grading,
|
|
|
165
163
|
return fail(`Error parsing output: ${err.message}`, resp.tokenUsage);
|
|
166
164
|
}
|
|
167
165
|
}
|
|
168
|
-
|
|
169
166
|
//#endregion
|
|
170
167
|
//#region src/external/assertions/deepeval.ts
|
|
171
168
|
const DEFAULT_WINDOW_SIZE = 5;
|
|
@@ -220,7 +217,6 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
220
217
|
tokensUsed: tokensUsed.total > 0 ? tokensUsed : void 0
|
|
221
218
|
};
|
|
222
219
|
};
|
|
223
|
-
|
|
224
220
|
//#endregion
|
|
225
221
|
//#region src/tracing/evaluatorTracing.ts
|
|
226
222
|
let otlpReceiverStarted = false;
|
|
@@ -253,28 +249,28 @@ function isOtlpReceiverStarted() {
|
|
|
253
249
|
* Start the OTLP receiver if tracing is enabled and it hasn't been started yet
|
|
254
250
|
*/
|
|
255
251
|
async function startOtlpReceiverIfNeeded(testSuite) {
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
252
|
+
logger.debug(`[EvaluatorTracing] Checking tracing config: ${JSON.stringify(testSuite.tracing)}`);
|
|
253
|
+
logger.debug(`[EvaluatorTracing] testSuite keys: ${Object.keys(testSuite)}`);
|
|
254
|
+
logger.debug(`[EvaluatorTracing] Full testSuite.tracing: ${JSON.stringify(testSuite.tracing, null, 2)}`);
|
|
259
255
|
if (testSuite.tracing?.enabled && testSuite.tracing?.otlp?.http?.enabled && !otlpReceiverStarted) {
|
|
260
|
-
|
|
256
|
+
telemetry.record("feature_used", { feature: "tracing" });
|
|
261
257
|
try {
|
|
262
|
-
|
|
263
|
-
const { startOTLPReceiver } = await import("./otlpReceiver-
|
|
258
|
+
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
259
|
+
const { startOTLPReceiver } = await import("./otlpReceiver-Bn5wGB1v.js");
|
|
264
260
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
265
261
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
266
|
-
|
|
262
|
+
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
267
263
|
await startOTLPReceiver(port, host);
|
|
268
264
|
otlpReceiverStarted = true;
|
|
269
|
-
|
|
265
|
+
logger.info(`[EvaluatorTracing] OTLP receiver successfully started on port ${port} for tracing`);
|
|
270
266
|
} catch (error) {
|
|
271
|
-
|
|
267
|
+
logger.error(`[EvaluatorTracing] Failed to start OTLP receiver: ${error}`);
|
|
272
268
|
}
|
|
273
|
-
} else if (otlpReceiverStarted)
|
|
269
|
+
} else if (otlpReceiverStarted) logger.debug("[EvaluatorTracing] OTLP receiver already started, skipping initialization");
|
|
274
270
|
else {
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
271
|
+
logger.debug("[EvaluatorTracing] Tracing not enabled or OTLP HTTP receiver not configured");
|
|
272
|
+
logger.debug(`[EvaluatorTracing] tracing.enabled: ${testSuite.tracing?.enabled}`);
|
|
273
|
+
logger.debug(`[EvaluatorTracing] tracing.otlp.http.enabled: ${testSuite.tracing?.otlp?.http?.enabled}`);
|
|
278
274
|
}
|
|
279
275
|
}
|
|
280
276
|
/**
|
|
@@ -282,13 +278,13 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
282
278
|
*/
|
|
283
279
|
async function stopOtlpReceiverIfNeeded() {
|
|
284
280
|
if (otlpReceiverStarted) try {
|
|
285
|
-
|
|
286
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver-
|
|
281
|
+
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
282
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver-Bn5wGB1v.js");
|
|
287
283
|
await stopOTLPReceiver();
|
|
288
284
|
otlpReceiverStarted = false;
|
|
289
|
-
|
|
285
|
+
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
290
286
|
} catch (error) {
|
|
291
|
-
|
|
287
|
+
logger.error(`[EvaluatorTracing] Failed to stop OTLP receiver: ${error}`);
|
|
292
288
|
}
|
|
293
289
|
}
|
|
294
290
|
/**
|
|
@@ -304,7 +300,7 @@ function isTracingEnabled(test, testSuite) {
|
|
|
304
300
|
const yamlConfigEnabled = testSuite?.tracing?.enabled === true;
|
|
305
301
|
const envEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false);
|
|
306
302
|
const result = metadataEnabled || yamlConfigEnabled || envEnabled;
|
|
307
|
-
|
|
303
|
+
logger.debug(`[EvaluatorTracing] isTracingEnabled check: metadata=${metadataEnabled}, yamlConfig=${yamlConfigEnabled}, env=${envEnabled}, result=${result}`);
|
|
308
304
|
return result;
|
|
309
305
|
}
|
|
310
306
|
/**
|
|
@@ -313,25 +309,25 @@ function isTracingEnabled(test, testSuite) {
|
|
|
313
309
|
async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite) {
|
|
314
310
|
const tracingEnabled = isTracingEnabled(test, testSuite);
|
|
315
311
|
if (tracingEnabled) {
|
|
316
|
-
|
|
317
|
-
|
|
312
|
+
logger.debug("[EvaluatorTracing] Tracing enabled for test case");
|
|
313
|
+
logger.debug(`[EvaluatorTracing] Test metadata: ${JSON.stringify(test.metadata)}`);
|
|
318
314
|
}
|
|
319
315
|
if (!tracingEnabled) return null;
|
|
320
|
-
|
|
321
|
-
const { getTraceStore } = await import("./store-
|
|
316
|
+
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
317
|
+
const { getTraceStore } = await import("./store-BltJg2cd.js");
|
|
322
318
|
const traceStore = getTraceStore();
|
|
323
319
|
const traceId = generateTraceId();
|
|
324
320
|
const spanId = generateSpanId();
|
|
325
321
|
const traceparent = generateTraceparent(traceId, spanId);
|
|
326
|
-
|
|
322
|
+
logger.debug(`[EvaluatorTracing] Generated trace context: traceId=${traceId}, spanId=${spanId}`);
|
|
327
323
|
let evaluationId = test.metadata?.evaluationId || evaluateOptions?.eventSource;
|
|
328
324
|
if (!evaluationId) {
|
|
329
|
-
|
|
325
|
+
logger.warn("[EvaluatorTracing] No evaluation ID found in test metadata or evaluateOptions, trace will not be linked to evaluation");
|
|
330
326
|
evaluationId = `eval-${Date.now()}`;
|
|
331
327
|
}
|
|
332
328
|
const testCaseId = test.metadata?.testCaseId || test.id || `${testIdx}-${promptIdx}`;
|
|
333
329
|
try {
|
|
334
|
-
|
|
330
|
+
logger.debug(`[EvaluatorTracing] Creating trace record for traceId=${traceId}`);
|
|
335
331
|
await traceStore.createTrace({
|
|
336
332
|
traceId,
|
|
337
333
|
evaluationId: evaluationId || "",
|
|
@@ -342,18 +338,17 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
342
338
|
vars: test.vars
|
|
343
339
|
}
|
|
344
340
|
});
|
|
345
|
-
|
|
341
|
+
logger.debug("[EvaluatorTracing] Trace record created successfully");
|
|
346
342
|
} catch (error) {
|
|
347
|
-
|
|
343
|
+
logger.error(`[EvaluatorTracing] Failed to create trace: ${error}`);
|
|
348
344
|
}
|
|
349
|
-
|
|
345
|
+
logger.debug(`[EvaluatorTracing] Trace context ready: ${traceparent} for test case ${testCaseId}`);
|
|
350
346
|
return {
|
|
351
347
|
traceparent,
|
|
352
348
|
evaluationId,
|
|
353
349
|
testCaseId
|
|
354
350
|
};
|
|
355
351
|
}
|
|
356
|
-
|
|
357
352
|
//#endregion
|
|
358
353
|
//#region src/assertions/answerRelevance.ts
|
|
359
354
|
const handleAnswerRelevance = async ({ assertion, output, prompt, test, providerCallContext }) => {
|
|
@@ -364,7 +359,6 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
|
|
|
364
359
|
...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
|
|
365
360
|
};
|
|
366
361
|
};
|
|
367
|
-
|
|
368
362
|
//#endregion
|
|
369
363
|
//#region src/assertions/assertionsResult.ts
|
|
370
364
|
const GUARDRAIL_BLOCKED_REASON = "Content failed guardrail safety checks";
|
|
@@ -470,7 +464,6 @@ var AssertionsResult = class {
|
|
|
470
464
|
return this.result;
|
|
471
465
|
}
|
|
472
466
|
};
|
|
473
|
-
|
|
474
467
|
//#endregion
|
|
475
468
|
//#region src/assertions/ngrams.ts
|
|
476
469
|
/**
|
|
@@ -486,7 +479,6 @@ function getNGrams(words, n) {
|
|
|
486
479
|
for (let i = 0; i <= words.length - n; i++) ngrams.push(words.slice(i, i + n).join(" "));
|
|
487
480
|
return ngrams;
|
|
488
481
|
}
|
|
489
|
-
|
|
490
482
|
//#endregion
|
|
491
483
|
//#region src/assertions/bleu.ts
|
|
492
484
|
/**
|
|
@@ -582,7 +574,6 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
582
574
|
assertion
|
|
583
575
|
};
|
|
584
576
|
}
|
|
585
|
-
|
|
586
577
|
//#endregion
|
|
587
578
|
//#region src/assertions/classifier.ts
|
|
588
579
|
async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
|
|
@@ -597,9 +588,43 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
597
588
|
...classificationResult
|
|
598
589
|
};
|
|
599
590
|
}
|
|
600
|
-
|
|
601
591
|
//#endregion
|
|
602
592
|
//#region src/assertions/contains.ts
|
|
593
|
+
function parseCommaSeparatedValues(value) {
|
|
594
|
+
const results = [];
|
|
595
|
+
let i = 0;
|
|
596
|
+
while (i < value.length) {
|
|
597
|
+
while (i < value.length && /\s/.test(value[i])) i++;
|
|
598
|
+
if (i >= value.length) break;
|
|
599
|
+
if (value[i] === ",") {
|
|
600
|
+
i++;
|
|
601
|
+
continue;
|
|
602
|
+
}
|
|
603
|
+
if (value[i] === "\"") {
|
|
604
|
+
i++;
|
|
605
|
+
let field = "";
|
|
606
|
+
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
607
|
+
field += value[i + 1];
|
|
608
|
+
i += 2;
|
|
609
|
+
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
610
|
+
field += "\"";
|
|
611
|
+
i += 2;
|
|
612
|
+
} else if (value[i] === "\"") {
|
|
613
|
+
i++;
|
|
614
|
+
break;
|
|
615
|
+
} else {
|
|
616
|
+
field += value[i];
|
|
617
|
+
i++;
|
|
618
|
+
}
|
|
619
|
+
results.push(field);
|
|
620
|
+
} else {
|
|
621
|
+
const start = i;
|
|
622
|
+
while (i < value.length && value[i] !== ",") i++;
|
|
623
|
+
results.push(value.substring(start, i).trim());
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
return results;
|
|
627
|
+
}
|
|
603
628
|
const handleContains = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
604
629
|
const value = valueFromScript ?? renderedValue;
|
|
605
630
|
invariant(value, "\"contains\" assertion type must have a string or number value");
|
|
@@ -627,7 +652,7 @@ const handleIContains = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
627
652
|
const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
628
653
|
let value = valueFromScript ?? renderedValue;
|
|
629
654
|
invariant(value, "\"contains-any\" assertion type must have a value");
|
|
630
|
-
if (typeof value === "string") value = value
|
|
655
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
631
656
|
invariant(Array.isArray(value), "\"contains-any\" assertion type must have an array value");
|
|
632
657
|
const pass = value.some((v) => outputString.includes(String(v))) !== inverse;
|
|
633
658
|
return {
|
|
@@ -640,7 +665,7 @@ const handleContainsAny = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
640
665
|
const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
641
666
|
let value = valueFromScript ?? renderedValue;
|
|
642
667
|
invariant(value, "\"icontains-any\" assertion type must have a value");
|
|
643
|
-
if (typeof value === "string") value = value
|
|
668
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
644
669
|
invariant(Array.isArray(value), "\"icontains-any\" assertion type must have an array value");
|
|
645
670
|
const pass = value.some((v) => outputString.toLowerCase().includes(String(v).toLowerCase())) !== inverse;
|
|
646
671
|
return {
|
|
@@ -653,7 +678,7 @@ const handleIContainsAny = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
653
678
|
const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
654
679
|
let value = valueFromScript ?? renderedValue;
|
|
655
680
|
invariant(value, "\"contains-all\" assertion type must have a value");
|
|
656
|
-
if (typeof value === "string") value = value
|
|
681
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
657
682
|
invariant(Array.isArray(value), "\"contains-all\" assertion type must have an array value");
|
|
658
683
|
const missingStrings = value.filter((v) => !outputString.includes(String(v)));
|
|
659
684
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -667,7 +692,7 @@ const handleContainsAll = ({ assertion, renderedValue, valueFromScript, outputSt
|
|
|
667
692
|
const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputString, inverse }) => {
|
|
668
693
|
let value = valueFromScript ?? renderedValue;
|
|
669
694
|
invariant(value, "\"icontains-all\" assertion type must have a value");
|
|
670
|
-
if (typeof value === "string") value = value
|
|
695
|
+
if (typeof value === "string") value = parseCommaSeparatedValues(value);
|
|
671
696
|
invariant(Array.isArray(value), "\"icontains-all\" assertion type must have an array value");
|
|
672
697
|
const missingStrings = value.filter((v) => !outputString.toLowerCase().includes(String(v).toLowerCase()));
|
|
673
698
|
const pass = missingStrings.length === 0 !== inverse;
|
|
@@ -678,7 +703,6 @@ const handleIContainsAll = ({ assertion, renderedValue, valueFromScript, outputS
|
|
|
678
703
|
assertion
|
|
679
704
|
};
|
|
680
705
|
};
|
|
681
|
-
|
|
682
706
|
//#endregion
|
|
683
707
|
//#region src/assertions/contextFaithfulness.ts
|
|
684
708
|
/**
|
|
@@ -702,7 +726,6 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
|
|
|
702
726
|
metadata: { context }
|
|
703
727
|
};
|
|
704
728
|
}
|
|
705
|
-
|
|
706
729
|
//#endregion
|
|
707
730
|
//#region src/assertions/contextRecall.ts
|
|
708
731
|
/**
|
|
@@ -729,7 +752,6 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
|
|
|
729
752
|
}
|
|
730
753
|
};
|
|
731
754
|
};
|
|
732
|
-
|
|
733
755
|
//#endregion
|
|
734
756
|
//#region src/assertions/contextRelevance.ts
|
|
735
757
|
/**
|
|
@@ -756,7 +778,6 @@ const handleContextRelevance = async ({ assertion, test, output, prompt, provide
|
|
|
756
778
|
}
|
|
757
779
|
};
|
|
758
780
|
};
|
|
759
|
-
|
|
760
781
|
//#endregion
|
|
761
782
|
//#region src/assertions/cost.ts
|
|
762
783
|
const handleCost = ({ cost, assertion }) => {
|
|
@@ -770,7 +791,6 @@ const handleCost = ({ cost, assertion }) => {
|
|
|
770
791
|
assertion
|
|
771
792
|
};
|
|
772
793
|
};
|
|
773
|
-
|
|
774
794
|
//#endregion
|
|
775
795
|
//#region src/assertions/equals.ts
|
|
776
796
|
const handleEquals = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -790,7 +810,6 @@ const handleEquals = async ({ assertion, renderedValue, outputString, inverse })
|
|
|
790
810
|
assertion
|
|
791
811
|
};
|
|
792
812
|
};
|
|
793
|
-
|
|
794
813
|
//#endregion
|
|
795
814
|
//#region src/assertions/factuality.ts
|
|
796
815
|
const handleFactuality = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -801,7 +820,6 @@ const handleFactuality = async ({ assertion, renderedValue, outputString, test,
|
|
|
801
820
|
...await matchesFactuality(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
802
821
|
};
|
|
803
822
|
};
|
|
804
|
-
|
|
805
823
|
//#endregion
|
|
806
824
|
//#region src/assertions/finishReason.ts
|
|
807
825
|
function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
@@ -821,7 +839,6 @@ function handleFinishReason({ assertion, renderedValue, providerResponse }) {
|
|
|
821
839
|
assertion
|
|
822
840
|
};
|
|
823
841
|
}
|
|
824
|
-
|
|
825
842
|
//#endregion
|
|
826
843
|
//#region src/assertions/functionToolCall.ts
|
|
827
844
|
const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
@@ -844,7 +861,6 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
844
861
|
};
|
|
845
862
|
}
|
|
846
863
|
};
|
|
847
|
-
|
|
848
864
|
//#endregion
|
|
849
865
|
//#region src/assertions/geval.ts
|
|
850
866
|
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
@@ -870,7 +886,6 @@ const handleGEval = async ({ assertion, renderedValue, prompt, outputString, tes
|
|
|
870
886
|
...await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
|
|
871
887
|
};
|
|
872
888
|
};
|
|
873
|
-
|
|
874
889
|
//#endregion
|
|
875
890
|
//#region src/assertions/gleu.ts
|
|
876
891
|
/**
|
|
@@ -969,7 +984,6 @@ function handleGleuScore({ assertion, inverse, outputString, renderedValue }) {
|
|
|
969
984
|
assertion
|
|
970
985
|
};
|
|
971
986
|
}
|
|
972
|
-
|
|
973
987
|
//#endregion
|
|
974
988
|
//#region src/assertions/guardrails.ts
|
|
975
989
|
const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
@@ -978,7 +992,7 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
978
992
|
flaggedInput: false,
|
|
979
993
|
flaggedOutput: false
|
|
980
994
|
};
|
|
981
|
-
|
|
995
|
+
logger.debug(`Guardrails from target response: ${JSON.stringify(guardrails, null, 2)}`);
|
|
982
996
|
if (guardrails) if (guardrails.flagged) {
|
|
983
997
|
let reason = "Content failed safety checks";
|
|
984
998
|
if (guardrails?.flaggedInput) reason = "Prompt failed safety checks";
|
|
@@ -1007,7 +1021,6 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1007
1021
|
assertion
|
|
1008
1022
|
};
|
|
1009
1023
|
};
|
|
1010
|
-
|
|
1011
1024
|
//#endregion
|
|
1012
1025
|
//#region src/assertions/html.ts
|
|
1013
1026
|
const HTML_PATTERNS = {
|
|
@@ -1216,7 +1229,6 @@ const handleIsHtml = ({ assertion, outputString, inverse }) => {
|
|
|
1216
1229
|
assertion
|
|
1217
1230
|
};
|
|
1218
1231
|
};
|
|
1219
|
-
|
|
1220
1232
|
//#endregion
|
|
1221
1233
|
//#region src/assertions/javascript.ts
|
|
1222
1234
|
/**
|
|
@@ -1357,7 +1369,6 @@ ${renderedValue}`,
|
|
|
1357
1369
|
assertion
|
|
1358
1370
|
};
|
|
1359
1371
|
};
|
|
1360
|
-
|
|
1361
1372
|
//#endregion
|
|
1362
1373
|
//#region src/assertions/json.ts
|
|
1363
1374
|
function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, assertion }) {
|
|
@@ -1423,7 +1434,6 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1423
1434
|
assertion
|
|
1424
1435
|
};
|
|
1425
1436
|
}
|
|
1426
|
-
|
|
1427
1437
|
//#endregion
|
|
1428
1438
|
//#region src/assertions/latency.ts
|
|
1429
1439
|
const handleLatency = ({ assertion, latencyMs }) => {
|
|
@@ -1437,7 +1447,6 @@ const handleLatency = ({ assertion, latencyMs }) => {
|
|
|
1437
1447
|
assertion
|
|
1438
1448
|
};
|
|
1439
1449
|
};
|
|
1440
|
-
|
|
1441
1450
|
//#endregion
|
|
1442
1451
|
//#region src/assertions/levenshtein.ts
|
|
1443
1452
|
function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
@@ -1452,7 +1461,6 @@ function handleLevenshtein({ assertion, renderedValue, outputString }) {
|
|
|
1452
1461
|
assertion
|
|
1453
1462
|
};
|
|
1454
1463
|
}
|
|
1455
|
-
|
|
1456
1464
|
//#endregion
|
|
1457
1465
|
//#region src/assertions/llmRubric.ts
|
|
1458
1466
|
const handleLlmRubric = ({ assertion, renderedValue, outputString, test, providerCallContext }) => {
|
|
@@ -1461,7 +1469,6 @@ const handleLlmRubric = ({ assertion, renderedValue, outputString, test, provide
|
|
|
1461
1469
|
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
1462
1470
|
return matchesLlmRubric(renderedValue || "", outputString, test.options, test.vars, assertion, void 0, providerCallContext);
|
|
1463
1471
|
};
|
|
1464
|
-
|
|
1465
1472
|
//#endregion
|
|
1466
1473
|
//#region src/assertions/modelGradedClosedQa.ts
|
|
1467
1474
|
const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputString, test, prompt, providerCallContext }) => {
|
|
@@ -1472,7 +1479,6 @@ const handleModelGradedClosedQa = async ({ assertion, renderedValue, outputStrin
|
|
|
1472
1479
|
...await matchesClosedQa(prompt, renderedValue, outputString, test.options, test.vars, providerCallContext)
|
|
1473
1480
|
};
|
|
1474
1481
|
};
|
|
1475
|
-
|
|
1476
1482
|
//#endregion
|
|
1477
1483
|
//#region src/assertions/moderation.ts
|
|
1478
1484
|
const handleModeration = async ({ assertion, test, outputString, providerResponse, prompt }) => {
|
|
@@ -1495,7 +1501,6 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
|
|
|
1495
1501
|
assertion
|
|
1496
1502
|
};
|
|
1497
1503
|
};
|
|
1498
|
-
|
|
1499
1504
|
//#endregion
|
|
1500
1505
|
//#region src/assertions/openai.ts
|
|
1501
1506
|
const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test }) => {
|
|
@@ -1556,7 +1561,6 @@ const handleIsValidOpenAiToolsCall = async ({ assertion, output, provider, test
|
|
|
1556
1561
|
};
|
|
1557
1562
|
}
|
|
1558
1563
|
};
|
|
1559
|
-
|
|
1560
1564
|
//#endregion
|
|
1561
1565
|
//#region src/assertions/perplexity.ts
|
|
1562
1566
|
function handlePerplexity({ logProbs, assertion }) {
|
|
@@ -1583,7 +1587,6 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1583
1587
|
assertion
|
|
1584
1588
|
};
|
|
1585
1589
|
}
|
|
1586
|
-
|
|
1587
1590
|
//#endregion
|
|
1588
1591
|
//#region src/assertions/pi.ts
|
|
1589
1592
|
const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }) => {
|
|
@@ -1591,7 +1594,6 @@ const handlePiScorer = async ({ assertion, prompt, renderedValue, outputString }
|
|
|
1591
1594
|
invariant(typeof prompt === "string", "\"pi\" assertion must have a prompt that is a string");
|
|
1592
1595
|
return matchesPiScore(renderedValue, prompt, outputString, assertion);
|
|
1593
1596
|
};
|
|
1594
|
-
|
|
1595
1597
|
//#endregion
|
|
1596
1598
|
//#region src/python/wrapper.ts
|
|
1597
1599
|
/**
|
|
@@ -1607,17 +1609,16 @@ async function runPythonCode(code, method, args) {
|
|
|
1607
1609
|
fs.writeFileSync(tempFilePath, code);
|
|
1608
1610
|
return await runPython(tempFilePath, method, args);
|
|
1609
1611
|
} catch (error) {
|
|
1610
|
-
|
|
1612
|
+
logger.error(`Error executing Python code: ${error}`);
|
|
1611
1613
|
throw error;
|
|
1612
1614
|
} finally {
|
|
1613
1615
|
try {
|
|
1614
1616
|
fs.unlinkSync(tempFilePath);
|
|
1615
1617
|
} catch (error) {
|
|
1616
|
-
|
|
1618
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1617
1619
|
}
|
|
1618
1620
|
}
|
|
1619
1621
|
}
|
|
1620
|
-
|
|
1621
1622
|
//#endregion
|
|
1622
1623
|
//#region src/util/caseMapping.ts
|
|
1623
1624
|
/**
|
|
@@ -1641,7 +1642,6 @@ function mapSnakeCaseToCamelCase(obj) {
|
|
|
1641
1642
|
});
|
|
1642
1643
|
return result;
|
|
1643
1644
|
}
|
|
1644
|
-
|
|
1645
1645
|
//#endregion
|
|
1646
1646
|
//#region src/assertions/python.ts
|
|
1647
1647
|
const handlePython = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1711,7 +1711,6 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1711
1711
|
assertion
|
|
1712
1712
|
};
|
|
1713
1713
|
};
|
|
1714
|
-
|
|
1715
1714
|
//#endregion
|
|
1716
1715
|
//#region src/assertions/redteam.ts
|
|
1717
1716
|
/**
|
|
@@ -1792,7 +1791,7 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1792
1791
|
const { hasAnyErrors, allTurnsHaveErrors } = analyzeGraderErrors(redteamHistory);
|
|
1793
1792
|
if (test.metadata?.strategyId && hasAnyErrors && !allTurnsHaveErrors) {
|
|
1794
1793
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1795
|
-
|
|
1794
|
+
logger.warn("[Redteam] Grading failed for iterative test with some prior grader errors", {
|
|
1796
1795
|
error: errorMessage,
|
|
1797
1796
|
strategyId: test.metadata.strategyId,
|
|
1798
1797
|
pluginId: test.metadata.pluginId
|
|
@@ -1812,7 +1811,6 @@ const handleRedteam = async ({ assertion, baseType, test, prompt, outputString,
|
|
|
1812
1811
|
throw error;
|
|
1813
1812
|
}
|
|
1814
1813
|
};
|
|
1815
|
-
|
|
1816
1814
|
//#endregion
|
|
1817
1815
|
//#region src/assertions/refusal.ts
|
|
1818
1816
|
function handleIsRefusal(params) {
|
|
@@ -1840,7 +1838,6 @@ function handleIsRefusal(params) {
|
|
|
1840
1838
|
assertion
|
|
1841
1839
|
};
|
|
1842
1840
|
}
|
|
1843
|
-
|
|
1844
1841
|
//#endregion
|
|
1845
1842
|
//#region src/assertions/regex.ts
|
|
1846
1843
|
const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -1865,7 +1862,6 @@ const handleRegex = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
|
1865
1862
|
assertion
|
|
1866
1863
|
};
|
|
1867
1864
|
};
|
|
1868
|
-
|
|
1869
1865
|
//#endregion
|
|
1870
1866
|
//#region src/assertions/rouge.ts
|
|
1871
1867
|
function handleRougeScore({ baseType, assertion, renderedValue, outputString, inverse }) {
|
|
@@ -1881,7 +1877,6 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
1881
1877
|
assertion
|
|
1882
1878
|
};
|
|
1883
1879
|
}
|
|
1884
|
-
|
|
1885
1880
|
//#endregion
|
|
1886
1881
|
//#region src/ruby/wrapper.ts
|
|
1887
1882
|
/**
|
|
@@ -1897,17 +1892,16 @@ async function runRubyCode(code, method, args) {
|
|
|
1897
1892
|
fs.writeFileSync(tempFilePath, code);
|
|
1898
1893
|
return await runRuby(tempFilePath, method, args);
|
|
1899
1894
|
} catch (error) {
|
|
1900
|
-
|
|
1895
|
+
logger.error(`Error executing Ruby code: ${error}`);
|
|
1901
1896
|
throw error;
|
|
1902
1897
|
} finally {
|
|
1903
1898
|
try {
|
|
1904
1899
|
fs.unlinkSync(tempFilePath);
|
|
1905
1900
|
} catch (error) {
|
|
1906
|
-
|
|
1901
|
+
logger.error(`Error removing temporary file: ${error}`);
|
|
1907
1902
|
}
|
|
1908
1903
|
}
|
|
1909
1904
|
}
|
|
1910
|
-
|
|
1911
1905
|
//#endregion
|
|
1912
1906
|
//#region src/assertions/ruby.ts
|
|
1913
1907
|
const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertionValueContext, output }) => {
|
|
@@ -1978,7 +1972,6 @@ end
|
|
|
1978
1972
|
assertion
|
|
1979
1973
|
};
|
|
1980
1974
|
};
|
|
1981
|
-
|
|
1982
1975
|
//#endregion
|
|
1983
1976
|
//#region src/assertions/searchRubric.ts
|
|
1984
1977
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
@@ -1990,7 +1983,6 @@ async function handleSearchRubric({ assertion, baseType: _baseType, inverse, pro
|
|
|
1990
1983
|
}
|
|
1991
1984
|
return result;
|
|
1992
1985
|
}
|
|
1993
|
-
|
|
1994
1986
|
//#endregion
|
|
1995
1987
|
//#region src/assertions/similar.ts
|
|
1996
1988
|
const handleSimilar = async ({ assertion, renderedValue, outputString, inverse, test }) => {
|
|
@@ -2033,7 +2025,6 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2033
2025
|
...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
|
|
2034
2026
|
};
|
|
2035
2027
|
};
|
|
2036
|
-
|
|
2037
2028
|
//#endregion
|
|
2038
2029
|
//#region src/assertions/sql.ts
|
|
2039
2030
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2125,7 +2116,6 @@ const handleContainsSql = async (assertionParams) => {
|
|
|
2125
2116
|
}
|
|
2126
2117
|
return handleIsSql(assertionParams);
|
|
2127
2118
|
};
|
|
2128
|
-
|
|
2129
2119
|
//#endregion
|
|
2130
2120
|
//#region src/assertions/startsWith.ts
|
|
2131
2121
|
const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) => {
|
|
@@ -2139,7 +2129,6 @@ const handleStartsWith = ({ assertion, renderedValue, outputString, inverse }) =
|
|
|
2139
2129
|
assertion
|
|
2140
2130
|
};
|
|
2141
2131
|
};
|
|
2142
|
-
|
|
2143
2132
|
//#endregion
|
|
2144
2133
|
//#region src/assertions/toolCallF1.ts
|
|
2145
2134
|
/**
|
|
@@ -2268,7 +2257,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2268
2257
|
assertion
|
|
2269
2258
|
};
|
|
2270
2259
|
};
|
|
2271
|
-
|
|
2272
2260
|
//#endregion
|
|
2273
2261
|
//#region src/assertions/traceUtils.ts
|
|
2274
2262
|
/**
|
|
@@ -2286,7 +2274,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
2286
2274
|
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2287
2275
|
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2288
2276
|
}
|
|
2289
|
-
|
|
2290
2277
|
//#endregion
|
|
2291
2278
|
//#region src/assertions/traceErrorSpans.ts
|
|
2292
2279
|
function isErrorSpan(span) {
|
|
@@ -2364,7 +2351,6 @@ const handleTraceErrorSpans = ({ assertion, assertionValueContext }) => {
|
|
|
2364
2351
|
assertion
|
|
2365
2352
|
};
|
|
2366
2353
|
};
|
|
2367
|
-
|
|
2368
2354
|
//#endregion
|
|
2369
2355
|
//#region src/assertions/traceSpanCount.ts
|
|
2370
2356
|
const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
@@ -2399,7 +2385,6 @@ const handleTraceSpanCount = ({ assertion, assertionValueContext }) => {
|
|
|
2399
2385
|
assertion
|
|
2400
2386
|
};
|
|
2401
2387
|
};
|
|
2402
|
-
|
|
2403
2388
|
//#endregion
|
|
2404
2389
|
//#region src/assertions/traceSpanDuration.ts
|
|
2405
2390
|
function calculatePercentile(durations, percentile) {
|
|
@@ -2457,7 +2442,6 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2457
2442
|
assertion
|
|
2458
2443
|
};
|
|
2459
2444
|
};
|
|
2460
|
-
|
|
2461
2445
|
//#endregion
|
|
2462
2446
|
//#region src/assertions/webhook.ts
|
|
2463
2447
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
@@ -2494,7 +2478,6 @@ async function handleWebhook({ assertion, renderedValue, test, prompt, output, i
|
|
|
2494
2478
|
};
|
|
2495
2479
|
}
|
|
2496
2480
|
}
|
|
2497
|
-
|
|
2498
2481
|
//#endregion
|
|
2499
2482
|
//#region src/assertions/wordCount.ts
|
|
2500
2483
|
/**
|
|
@@ -2557,7 +2540,6 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2557
2540
|
assertion
|
|
2558
2541
|
};
|
|
2559
2542
|
};
|
|
2560
|
-
|
|
2561
2543
|
//#endregion
|
|
2562
2544
|
//#region src/assertions/xml.ts
|
|
2563
2545
|
function validateXml(xmlString, requiredElements) {
|
|
@@ -2632,7 +2614,6 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2632
2614
|
assertion
|
|
2633
2615
|
};
|
|
2634
2616
|
};
|
|
2635
|
-
|
|
2636
2617
|
//#endregion
|
|
2637
2618
|
//#region src/assertions/index.ts
|
|
2638
2619
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
@@ -2686,7 +2667,7 @@ const ASSERTION_HANDLERS = {
|
|
|
2686
2667
|
"llm-rubric": handleLlmRubric,
|
|
2687
2668
|
meteor: async (params) => {
|
|
2688
2669
|
try {
|
|
2689
|
-
const { handleMeteorAssertion } = await import("./meteor-
|
|
2670
|
+
const { handleMeteorAssertion } = await import("./meteor-44VjEACX.js");
|
|
2690
2671
|
return handleMeteorAssertion(params);
|
|
2691
2672
|
} catch (error) {
|
|
2692
2673
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -2732,10 +2713,10 @@ function renderMetricName(metric, vars) {
|
|
|
2732
2713
|
if (!metric) return metric;
|
|
2733
2714
|
try {
|
|
2734
2715
|
const rendered = nunjucks.renderString(metric, vars);
|
|
2735
|
-
if (rendered === "" && metric !== "")
|
|
2716
|
+
if (rendered === "" && metric !== "") logger.debug(`Metric template "${metric}" rendered to empty string`);
|
|
2736
2717
|
return rendered;
|
|
2737
2718
|
} catch (error) {
|
|
2738
|
-
|
|
2719
|
+
logger.warn(`Failed to render metric template "${metric}": ${error instanceof Error ? error.message : error}`);
|
|
2739
2720
|
return metric;
|
|
2740
2721
|
}
|
|
2741
2722
|
}
|
|
@@ -2786,12 +2767,12 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2786
2767
|
spans: traceData.spans || []
|
|
2787
2768
|
};
|
|
2788
2769
|
} catch (error) {
|
|
2789
|
-
|
|
2770
|
+
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
2790
2771
|
}
|
|
2791
2772
|
let renderedValue = assertion.value;
|
|
2792
2773
|
let valueFromScript;
|
|
2793
2774
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
2794
|
-
const basePath =
|
|
2775
|
+
const basePath = state.basePath || "";
|
|
2795
2776
|
const fileRef = renderedValue.slice(7);
|
|
2796
2777
|
let filePath = fileRef;
|
|
2797
2778
|
let functionName;
|
|
@@ -2803,10 +2784,10 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2803
2784
|
filePath = path.resolve(basePath, filePath);
|
|
2804
2785
|
if (isJavascriptFile(filePath)) {
|
|
2805
2786
|
valueFromScript = await loadFromJavaScriptFile(filePath, functionName, [output, context]);
|
|
2806
|
-
|
|
2787
|
+
logger.debug(`Javascript script ${filePath} output: ${valueFromScript}`);
|
|
2807
2788
|
} else if (filePath.endsWith(".py")) try {
|
|
2808
2789
|
valueFromScript = await runPython(filePath, functionName || "get_assert", [output, context]);
|
|
2809
|
-
|
|
2790
|
+
logger.debug(`Python script ${filePath} output: ${valueFromScript}`);
|
|
2810
2791
|
} catch (error) {
|
|
2811
2792
|
return {
|
|
2812
2793
|
pass: false,
|
|
@@ -2816,9 +2797,9 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2816
2797
|
};
|
|
2817
2798
|
}
|
|
2818
2799
|
else if (filePath.endsWith(".rb")) try {
|
|
2819
|
-
const { runRuby } = await import("./rubyUtils-
|
|
2800
|
+
const { runRuby } = await import("./rubyUtils-BUHu6PhO.js");
|
|
2820
2801
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2821
|
-
|
|
2802
|
+
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2822
2803
|
} catch (error) {
|
|
2823
2804
|
return {
|
|
2824
2805
|
pass: false,
|
|
@@ -2829,7 +2810,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2829
2810
|
}
|
|
2830
2811
|
else renderedValue = processFileReference(renderedValue);
|
|
2831
2812
|
} else if (isPackagePath(renderedValue)) {
|
|
2832
|
-
const basePath =
|
|
2813
|
+
const basePath = state.basePath || "";
|
|
2833
2814
|
const requiredModule = await loadFromPackage(renderedValue, basePath);
|
|
2834
2815
|
if (typeof requiredModule !== "function") throw new Error(`Assertion malformed: ${renderedValue} must be a function. Received: ${typeof requiredModule}`);
|
|
2835
2816
|
valueFromScript = await Promise.resolve(requiredModule(output, context));
|
|
@@ -2990,7 +2971,6 @@ var assertions_default = {
|
|
|
2990
2971
|
matchesModeration,
|
|
2991
2972
|
matchesConversationRelevance
|
|
2992
2973
|
};
|
|
2993
|
-
|
|
2994
2974
|
//#endregion
|
|
2995
2975
|
//#region src/util/promptMatching.ts
|
|
2996
2976
|
/**
|
|
@@ -3028,7 +3008,6 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3028
3008
|
if (allowedPrompts.length === 0) return false;
|
|
3029
3009
|
return allowedPrompts.some((ref) => doesPromptRefMatch(ref, prompt));
|
|
3030
3010
|
}
|
|
3031
|
-
|
|
3032
3011
|
//#endregion
|
|
3033
3012
|
//#region src/progress/ciProgressReporter.ts
|
|
3034
3013
|
var CIProgressReporter = class {
|
|
@@ -3050,7 +3029,7 @@ var CIProgressReporter = class {
|
|
|
3050
3029
|
}
|
|
3051
3030
|
start() {
|
|
3052
3031
|
if (this.intervalId) clearInterval(this.intervalId);
|
|
3053
|
-
|
|
3032
|
+
logger.info(`[Evaluation] Starting ${this.totalTests} test cases...`);
|
|
3054
3033
|
this.intervalId = setInterval(() => {
|
|
3055
3034
|
this.logPeriodicUpdate();
|
|
3056
3035
|
}, this.updateIntervalMs);
|
|
@@ -3081,14 +3060,14 @@ var CIProgressReporter = class {
|
|
|
3081
3060
|
this.intervalId = null;
|
|
3082
3061
|
}
|
|
3083
3062
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3084
|
-
|
|
3063
|
+
logger.info(`[Evaluation] ✓ Complete! ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3085
3064
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation completed: ${this.completedTests}/${this.totalTests} tests in ${elapsed}`);
|
|
3086
3065
|
}
|
|
3087
3066
|
error(message) {
|
|
3088
3067
|
const now = Date.now();
|
|
3089
3068
|
if (now - this.lastErrorTime < this.ERROR_THROTTLE_MS) return;
|
|
3090
3069
|
this.lastErrorTime = now;
|
|
3091
|
-
|
|
3070
|
+
logger.error(`[Evaluation Error] ${message}`);
|
|
3092
3071
|
if (process.env.GITHUB_ACTIONS) {
|
|
3093
3072
|
const escapedMessage = message.replace(/\r?\n/g, " ").replace(/::/g, " ");
|
|
3094
3073
|
console.log(`::error::${escapedMessage}`);
|
|
@@ -3107,12 +3086,12 @@ var CIProgressReporter = class {
|
|
|
3107
3086
|
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
|
|
3108
3087
|
}
|
|
3109
3088
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3110
|
-
|
|
3111
|
-
|
|
3089
|
+
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
3090
|
+
logger.info(`[CI Progress] Rate: ~${Math.round(rate)} tests/minute, ETA: ${etaDisplay}`);
|
|
3112
3091
|
}
|
|
3113
3092
|
logMilestone(percentage) {
|
|
3114
3093
|
const elapsed = this.formatElapsedTime(Date.now() - this.startTime);
|
|
3115
|
-
|
|
3094
|
+
logger.info(`[Evaluation] ✓ ${percentage}% complete (${this.completedTests}/${this.totalTests}) - ${elapsed} elapsed`);
|
|
3116
3095
|
if (process.env.GITHUB_ACTIONS) console.log(`::notice::Evaluation ${percentage}% complete`);
|
|
3117
3096
|
}
|
|
3118
3097
|
formatElapsedTime(ms) {
|
|
@@ -3123,7 +3102,6 @@ var CIProgressReporter = class {
|
|
|
3123
3102
|
return `${minutes}m ${remainingSeconds}s`;
|
|
3124
3103
|
}
|
|
3125
3104
|
};
|
|
3126
|
-
|
|
3127
3105
|
//#endregion
|
|
3128
3106
|
//#region src/providers/azure/warnings.ts
|
|
3129
3107
|
/**
|
|
@@ -3137,13 +3115,12 @@ function maybeEmitAzureOpenAiWarning(testSuite, tests) {
|
|
|
3137
3115
|
const modelGradedAsserts = tests.flatMap((t) => (t.assert || []).filter((a) => a.type !== "assert-set" && MODEL_GRADED_ASSERTION_TYPES.has(a.type) && !a.provider && !t.options?.provider));
|
|
3138
3116
|
if (modelGradedAsserts.length > 0) {
|
|
3139
3117
|
const assertTypes = Array.from(new Set(modelGradedAsserts.map((a) => a.type))).join(", ");
|
|
3140
|
-
|
|
3118
|
+
logger.warn(chalk.yellow(`You are using model-graded assertions of types ${chalk.bold(assertTypes)} while testing an Azure provider. You may need to override these to use your Azure deployment. To learn more, see ${chalk.bold(`https://promptfoo.dev/docs/providers/azure/#model-graded-tests`)}`));
|
|
3141
3119
|
return true;
|
|
3142
3120
|
}
|
|
3143
3121
|
}
|
|
3144
3122
|
return false;
|
|
3145
3123
|
}
|
|
3146
|
-
|
|
3147
3124
|
//#endregion
|
|
3148
3125
|
//#region src/suggestions.ts
|
|
3149
3126
|
async function generatePrompts(prompt, _num) {
|
|
@@ -3174,7 +3151,6 @@ async function generatePrompts(prompt, _num) {
|
|
|
3174
3151
|
};
|
|
3175
3152
|
}
|
|
3176
3153
|
}
|
|
3177
|
-
|
|
3178
3154
|
//#endregion
|
|
3179
3155
|
//#region src/tracing/otelConfig.ts
|
|
3180
3156
|
/**
|
|
@@ -3200,7 +3176,6 @@ function getDefaultOtelConfig() {
|
|
|
3200
3176
|
enabled: true
|
|
3201
3177
|
};
|
|
3202
3178
|
}
|
|
3203
|
-
|
|
3204
3179
|
//#endregion
|
|
3205
3180
|
//#region src/tracing/localSpanExporter.ts
|
|
3206
3181
|
/**
|
|
@@ -3220,7 +3195,7 @@ var LocalSpanExporter = class {
|
|
|
3220
3195
|
});
|
|
3221
3196
|
else resultCallback({ code: ExportResultCode.SUCCESS });
|
|
3222
3197
|
}).catch((error) => {
|
|
3223
|
-
|
|
3198
|
+
logger.error("[LocalSpanExporter] Failed to export spans", { error });
|
|
3224
3199
|
resultCallback({
|
|
3225
3200
|
code: ExportResultCode.FAILED,
|
|
3226
3201
|
error: error instanceof Error ? error : new Error(String(error))
|
|
@@ -3234,7 +3209,7 @@ var LocalSpanExporter = class {
|
|
|
3234
3209
|
async exportAsync(spans) {
|
|
3235
3210
|
if (spans.length === 0) return;
|
|
3236
3211
|
const traceStore = getTraceStore();
|
|
3237
|
-
|
|
3212
|
+
logger.debug(`[LocalSpanExporter] Exporting ${spans.length} spans`);
|
|
3238
3213
|
const spansByTrace = /* @__PURE__ */ new Map();
|
|
3239
3214
|
for (const span of spans) {
|
|
3240
3215
|
const traceId = span.spanContext().traceId;
|
|
@@ -3245,12 +3220,12 @@ var LocalSpanExporter = class {
|
|
|
3245
3220
|
let firstError;
|
|
3246
3221
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
3247
3222
|
const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
|
|
3248
|
-
if (result.stored)
|
|
3249
|
-
else
|
|
3223
|
+
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
3224
|
+
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
3250
3225
|
} catch (error) {
|
|
3251
|
-
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY"))
|
|
3226
|
+
if ((error instanceof Error ? error.message : String(error)).includes("FOREIGN KEY")) logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}`);
|
|
3252
3227
|
else {
|
|
3253
|
-
|
|
3228
|
+
logger.error(`[LocalSpanExporter] Failed to add spans to trace ${traceId}`, { error });
|
|
3254
3229
|
if (!firstError) firstError = error instanceof Error ? error : new Error(String(error));
|
|
3255
3230
|
}
|
|
3256
3231
|
}
|
|
@@ -3287,7 +3262,7 @@ var LocalSpanExporter = class {
|
|
|
3287
3262
|
* Shutdown the exporter. No-op for local storage.
|
|
3288
3263
|
*/
|
|
3289
3264
|
shutdown() {
|
|
3290
|
-
|
|
3265
|
+
logger.debug("[LocalSpanExporter] Shutting down");
|
|
3291
3266
|
return Promise.resolve();
|
|
3292
3267
|
}
|
|
3293
3268
|
/**
|
|
@@ -3297,7 +3272,6 @@ var LocalSpanExporter = class {
|
|
|
3297
3272
|
return Promise.resolve();
|
|
3298
3273
|
}
|
|
3299
3274
|
};
|
|
3300
|
-
|
|
3301
3275
|
//#endregion
|
|
3302
3276
|
//#region src/tracing/otelSdk.ts
|
|
3303
3277
|
let provider = null;
|
|
@@ -3325,21 +3299,21 @@ function getHandlers() {
|
|
|
3325
3299
|
*/
|
|
3326
3300
|
function initializeOtel(config) {
|
|
3327
3301
|
if (initialized) {
|
|
3328
|
-
|
|
3302
|
+
logger.debug("[OtelSdk] Already initialized, skipping");
|
|
3329
3303
|
return;
|
|
3330
3304
|
}
|
|
3331
3305
|
if (!config.enabled) {
|
|
3332
|
-
|
|
3306
|
+
logger.debug("[OtelSdk] OTEL tracing is disabled");
|
|
3333
3307
|
return;
|
|
3334
3308
|
}
|
|
3335
|
-
|
|
3309
|
+
logger.debug("[OtelSdk] Initializing OpenTelemetry SDK", {
|
|
3336
3310
|
serviceName: config.serviceName,
|
|
3337
3311
|
endpoint: config.endpoint,
|
|
3338
3312
|
localExport: config.localExport
|
|
3339
3313
|
});
|
|
3340
3314
|
if (config.debug) diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);
|
|
3341
3315
|
propagation.setGlobalPropagator(new W3CTraceContextPropagator());
|
|
3342
|
-
|
|
3316
|
+
logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
|
|
3343
3317
|
const resource = resourceFromAttributes({
|
|
3344
3318
|
[ATTR_SERVICE_NAME]: config.serviceName,
|
|
3345
3319
|
[ATTR_SERVICE_VERSION]: VERSION
|
|
@@ -3348,12 +3322,12 @@ function initializeOtel(config) {
|
|
|
3348
3322
|
if (config.localExport) {
|
|
3349
3323
|
const localExporter = new LocalSpanExporter();
|
|
3350
3324
|
spanProcessors.push(new BatchSpanProcessor(localExporter));
|
|
3351
|
-
|
|
3325
|
+
logger.debug("[OtelSdk] Added local span exporter");
|
|
3352
3326
|
}
|
|
3353
3327
|
if (config.endpoint) {
|
|
3354
3328
|
const otlpExporter = new OTLPTraceExporter({ url: config.endpoint });
|
|
3355
3329
|
spanProcessors.push(new BatchSpanProcessor(otlpExporter));
|
|
3356
|
-
|
|
3330
|
+
logger.debug(`[OtelSdk] Added OTLP exporter to ${config.endpoint}`);
|
|
3357
3331
|
}
|
|
3358
3332
|
provider = new NodeTracerProvider({
|
|
3359
3333
|
resource,
|
|
@@ -3361,7 +3335,7 @@ function initializeOtel(config) {
|
|
|
3361
3335
|
});
|
|
3362
3336
|
provider.register();
|
|
3363
3337
|
initialized = true;
|
|
3364
|
-
|
|
3338
|
+
logger.info("[OtelSdk] OpenTelemetry SDK initialized successfully");
|
|
3365
3339
|
setupShutdownHandlers();
|
|
3366
3340
|
}
|
|
3367
3341
|
/**
|
|
@@ -3370,12 +3344,12 @@ function initializeOtel(config) {
|
|
|
3370
3344
|
*/
|
|
3371
3345
|
async function shutdownOtel() {
|
|
3372
3346
|
if (!initialized || !provider) return;
|
|
3373
|
-
|
|
3347
|
+
logger.debug("[OtelSdk] Shutting down OpenTelemetry SDK");
|
|
3374
3348
|
try {
|
|
3375
3349
|
await provider.shutdown();
|
|
3376
|
-
|
|
3350
|
+
logger.info("[OtelSdk] OpenTelemetry SDK shut down successfully");
|
|
3377
3351
|
} catch (error) {
|
|
3378
|
-
|
|
3352
|
+
logger.error("[OtelSdk] Error shutting down OpenTelemetry SDK", { error });
|
|
3379
3353
|
} finally {
|
|
3380
3354
|
provider = null;
|
|
3381
3355
|
initialized = false;
|
|
@@ -3388,12 +3362,12 @@ async function shutdownOtel() {
|
|
|
3388
3362
|
*/
|
|
3389
3363
|
async function flushOtel() {
|
|
3390
3364
|
if (!initialized || !provider) return;
|
|
3391
|
-
|
|
3365
|
+
logger.debug("[OtelSdk] Flushing pending spans");
|
|
3392
3366
|
try {
|
|
3393
3367
|
await provider.forceFlush();
|
|
3394
|
-
|
|
3368
|
+
logger.debug("[OtelSdk] Spans flushed successfully");
|
|
3395
3369
|
} catch (error) {
|
|
3396
|
-
|
|
3370
|
+
logger.error("[OtelSdk] Error flushing spans", { error });
|
|
3397
3371
|
}
|
|
3398
3372
|
}
|
|
3399
3373
|
/**
|
|
@@ -3405,7 +3379,7 @@ function setupShutdownHandlers() {
|
|
|
3405
3379
|
const handlers = getHandlers();
|
|
3406
3380
|
if (handlers.registered) return;
|
|
3407
3381
|
const shutdown = async (signal) => {
|
|
3408
|
-
|
|
3382
|
+
logger.debug(`[OtelSdk] Received ${signal}, shutting down`);
|
|
3409
3383
|
await shutdownOtel();
|
|
3410
3384
|
};
|
|
3411
3385
|
handlers.sigTermHandler = () => {
|
|
@@ -3442,7 +3416,6 @@ function cleanupShutdownHandlers() {
|
|
|
3442
3416
|
}
|
|
3443
3417
|
handlers.registered = false;
|
|
3444
3418
|
}
|
|
3445
|
-
|
|
3446
3419
|
//#endregion
|
|
3447
3420
|
//#region src/util/exportToFile/writeToFile.ts
|
|
3448
3421
|
var JsonlFileWriter = class {
|
|
@@ -3466,7 +3439,6 @@ var JsonlFileWriter = class {
|
|
|
3466
3439
|
});
|
|
3467
3440
|
}
|
|
3468
3441
|
};
|
|
3469
|
-
|
|
3470
3442
|
//#endregion
|
|
3471
3443
|
//#region src/evaluator.ts
|
|
3472
3444
|
/**
|
|
@@ -3616,7 +3588,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3616
3588
|
const usesConversation = prompt.raw.includes("_conversation");
|
|
3617
3589
|
if (!getEnvBool("PROMPTFOO_DISABLE_CONVERSATION_VAR") && !test.options?.disableConversationVar && usesConversation) vars._conversation = conversations?.[conversationKey] || [];
|
|
3618
3590
|
Object.assign(vars, registers);
|
|
3619
|
-
const
|
|
3591
|
+
const promptForRender = { ...prompt };
|
|
3592
|
+
let mergedPromptConfig = {
|
|
3620
3593
|
...prompt.config ?? {},
|
|
3621
3594
|
...test.options ?? {}
|
|
3622
3595
|
};
|
|
@@ -3636,7 +3609,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3636
3609
|
let latencyMs = 0;
|
|
3637
3610
|
let traceContext = null;
|
|
3638
3611
|
try {
|
|
3639
|
-
const renderedPrompt = await renderPrompt(
|
|
3612
|
+
const renderedPrompt = await renderPrompt(promptForRender, vars, filters, provider, isRedteam ? [testSuite?.redteam?.injectVar ?? "prompt"] : void 0);
|
|
3613
|
+
mergedPromptConfig = {
|
|
3614
|
+
...promptForRender.config ?? {},
|
|
3615
|
+
...test.options ?? {}
|
|
3616
|
+
};
|
|
3617
|
+
setup.prompt.config = mergedPromptConfig;
|
|
3640
3618
|
let renderedJson = void 0;
|
|
3641
3619
|
try {
|
|
3642
3620
|
renderedJson = JSON.parse(renderedPrompt);
|
|
@@ -3652,18 +3630,18 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3652
3630
|
if (test.providerOutput) response.output = test.providerOutput;
|
|
3653
3631
|
else {
|
|
3654
3632
|
const activeProvider = isApiProvider(test.provider) ? test.provider : provider;
|
|
3655
|
-
|
|
3633
|
+
logger.debug(`Provider type: ${activeProvider.id()}`);
|
|
3656
3634
|
traceContext = await generateTraceContextIfNeeded(test, evaluateOptions, testIdx, promptIdx, testSuite);
|
|
3657
3635
|
const callApiContext = {
|
|
3658
3636
|
vars,
|
|
3659
3637
|
prompt: {
|
|
3660
|
-
...
|
|
3638
|
+
...promptForRender,
|
|
3661
3639
|
config: mergedPromptConfig
|
|
3662
3640
|
},
|
|
3663
3641
|
filters,
|
|
3664
3642
|
originalProvider: provider,
|
|
3665
3643
|
test,
|
|
3666
|
-
logger
|
|
3644
|
+
logger,
|
|
3667
3645
|
getCache,
|
|
3668
3646
|
repeatIndex
|
|
3669
3647
|
};
|
|
@@ -3680,8 +3658,8 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3680
3658
|
const sanitizedMetadata = safeJsonStringify(response.metadata);
|
|
3681
3659
|
response.metadata = sanitizedMetadata ? JSON.parse(sanitizedMetadata) : {};
|
|
3682
3660
|
}
|
|
3683
|
-
|
|
3684
|
-
|
|
3661
|
+
logger.debug(`Provider response properties: ${Object.keys(response).join(", ")}`);
|
|
3662
|
+
logger.debug(`Provider response cached property explicitly: ${response.cached}`);
|
|
3685
3663
|
}
|
|
3686
3664
|
latencyMs = Date.now() - startTime;
|
|
3687
3665
|
let conversationLastInput = void 0;
|
|
@@ -3698,12 +3676,12 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3698
3676
|
metadata: response.metadata
|
|
3699
3677
|
});
|
|
3700
3678
|
}
|
|
3701
|
-
|
|
3702
|
-
|
|
3679
|
+
logger.debug("Evaluator response", { responsePreview: (safeJsonStringify(response) ?? "").slice(0, 100) });
|
|
3680
|
+
logger.debug(`Evaluator checking cached flag: response.cached = ${Boolean(response.cached)}, provider.delay = ${provider.delay}`);
|
|
3703
3681
|
if (!response.cached && provider.delay > 0) {
|
|
3704
|
-
|
|
3682
|
+
logger.debug(`Sleeping for ${provider.delay}ms`);
|
|
3705
3683
|
await sleep(provider.delay);
|
|
3706
|
-
} else if (response.cached)
|
|
3684
|
+
} else if (response.cached) logger.debug(`Skipping delay because response is cached`);
|
|
3707
3685
|
const ret = {
|
|
3708
3686
|
...setup,
|
|
3709
3687
|
response,
|
|
@@ -3806,7 +3784,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3806
3784
|
promptIdx,
|
|
3807
3785
|
testIdx
|
|
3808
3786
|
});
|
|
3809
|
-
if (!(err instanceof Error && err.name === "AbortError"))
|
|
3787
|
+
if (!(err instanceof Error && err.name === "AbortError")) logger.error("Provider call failed during eval", logContext);
|
|
3810
3788
|
return [{
|
|
3811
3789
|
...setup,
|
|
3812
3790
|
error: errorWithStack,
|
|
@@ -3889,7 +3867,7 @@ function generateVarCombinations(vars) {
|
|
|
3889
3867
|
let values = [];
|
|
3890
3868
|
if (typeof vars[key] === "string" && vars[key].startsWith("file://")) {
|
|
3891
3869
|
const filePath = vars[key].slice(7);
|
|
3892
|
-
const basePath =
|
|
3870
|
+
const basePath = state.basePath || "";
|
|
3893
3871
|
values = (globSync(filePath, {
|
|
3894
3872
|
cwd: basePath || process.cwd(),
|
|
3895
3873
|
windowsPathsNoEscape: true
|
|
@@ -3929,28 +3907,28 @@ var Evaluator = class {
|
|
|
3929
3907
|
this.conversations = {};
|
|
3930
3908
|
this.registers = {};
|
|
3931
3909
|
this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
|
|
3932
|
-
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency ||
|
|
3910
|
+
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
3933
3911
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
3934
|
-
|
|
3912
|
+
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
3935
3913
|
retryAfterMs: data.retryAfterMs,
|
|
3936
3914
|
resetAt: data.resetAt,
|
|
3937
3915
|
concurrencyChange: data.concurrencyChange
|
|
3938
3916
|
});
|
|
3939
3917
|
});
|
|
3940
3918
|
this.rateLimitRegistry.on("ratelimit:learned", (data) => {
|
|
3941
|
-
|
|
3919
|
+
logger.debug(`[Scheduler] Learned rate limits for ${data.rateLimitKey}`, {
|
|
3942
3920
|
requestLimit: data.requestLimit,
|
|
3943
3921
|
tokenLimit: data.tokenLimit
|
|
3944
3922
|
});
|
|
3945
3923
|
});
|
|
3946
3924
|
this.rateLimitRegistry.on("concurrency:decreased", (data) => {
|
|
3947
|
-
|
|
3925
|
+
logger.debug(`[Scheduler] Concurrency decreased for ${data.rateLimitKey}`, {
|
|
3948
3926
|
previous: data.previous,
|
|
3949
3927
|
current: data.current
|
|
3950
3928
|
});
|
|
3951
3929
|
});
|
|
3952
3930
|
this.rateLimitRegistry.on("concurrency:increased", (data) => {
|
|
3953
|
-
|
|
3931
|
+
logger.debug(`[Scheduler] Concurrency increased for ${data.rateLimitKey}`, {
|
|
3954
3932
|
previous: data.previous,
|
|
3955
3933
|
current: data.current
|
|
3956
3934
|
});
|
|
@@ -4007,7 +3985,7 @@ var Evaluator = class {
|
|
|
4007
3985
|
const checkAbort = () => {
|
|
4008
3986
|
if (combinedAbortSignal.aborted) throw new Error("Operation cancelled");
|
|
4009
3987
|
};
|
|
4010
|
-
if (!options.silent)
|
|
3988
|
+
if (!options.silent) logger.info(`Starting evaluation ${this.evalRecord.id}`);
|
|
4011
3989
|
checkAbort();
|
|
4012
3990
|
const prompts = [];
|
|
4013
3991
|
const assertionTypes = /* @__PURE__ */ new Set();
|
|
@@ -4019,32 +3997,32 @@ var Evaluator = class {
|
|
|
4019
3997
|
}
|
|
4020
3998
|
testSuite = (await runExtensionHook(testSuite.extensions, "beforeAll", { suite: testSuite })).suite;
|
|
4021
3999
|
if (options.generateSuggestions) {
|
|
4022
|
-
|
|
4000
|
+
logger.info(`Generating prompt variations...`);
|
|
4023
4001
|
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
4024
4002
|
if (error || !newPrompts) throw new Error(`Failed to generate prompts: ${error}`);
|
|
4025
|
-
|
|
4003
|
+
logger.info(chalk.blue("Generated prompts:"));
|
|
4026
4004
|
let numAdded = 0;
|
|
4027
4005
|
for (const prompt of newPrompts) {
|
|
4028
|
-
|
|
4029
|
-
|
|
4030
|
-
|
|
4006
|
+
logger.info("--------------------------------------------------------");
|
|
4007
|
+
logger.info(`${prompt}`);
|
|
4008
|
+
logger.info("--------------------------------------------------------");
|
|
4031
4009
|
if (await promptYesNo("Do you want to test this prompt?", false)) {
|
|
4032
4010
|
testSuite.prompts.push({
|
|
4033
4011
|
raw: prompt,
|
|
4034
4012
|
label: prompt
|
|
4035
4013
|
});
|
|
4036
4014
|
numAdded++;
|
|
4037
|
-
} else
|
|
4015
|
+
} else logger.info("Skipping this prompt.");
|
|
4038
4016
|
}
|
|
4039
4017
|
if (numAdded < 1) {
|
|
4040
|
-
|
|
4018
|
+
logger.info(chalk.red("No prompts selected. Aborting."));
|
|
4041
4019
|
process.exitCode = 1;
|
|
4042
4020
|
return this.evalRecord;
|
|
4043
4021
|
}
|
|
4044
4022
|
}
|
|
4045
4023
|
const existingPromptsMap = /* @__PURE__ */ new Map();
|
|
4046
|
-
if (
|
|
4047
|
-
|
|
4024
|
+
if (state.resume && this.evalRecord.persisted && this.evalRecord.prompts.length > 0) {
|
|
4025
|
+
logger.debug("Resuming evaluation: preserving metrics from previous run");
|
|
4048
4026
|
for (const existingPrompt of this.evalRecord.prompts) {
|
|
4049
4027
|
const key = `${existingPrompt.provider}:${existingPrompt.id}`;
|
|
4050
4028
|
existingPromptsMap.set(key, existingPrompt);
|
|
@@ -4082,7 +4060,7 @@ var Evaluator = class {
|
|
|
4082
4060
|
await this.evalRecord.addPrompts(prompts);
|
|
4083
4061
|
let tests = testSuite.tests && testSuite.tests.length > 0 ? testSuite.tests : testSuite.scenarios ? [] : [{}];
|
|
4084
4062
|
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
4085
|
-
|
|
4063
|
+
telemetry.record("feature_used", { feature: "scenarios" });
|
|
4086
4064
|
let scenarioIndex = 0;
|
|
4087
4065
|
for (const scenario of testSuite.scenarios) for (const data of scenario.config) {
|
|
4088
4066
|
const scenarioTests = (scenario.tests || [{}]).map((test) => {
|
|
@@ -4146,7 +4124,7 @@ var Evaluator = class {
|
|
|
4146
4124
|
}
|
|
4147
4125
|
const runEvalOptions = [];
|
|
4148
4126
|
let testIdx = 0;
|
|
4149
|
-
let concurrency = options.maxConcurrency ||
|
|
4127
|
+
let concurrency = options.maxConcurrency || 4;
|
|
4150
4128
|
for (let index = 0; index < tests.length; index++) {
|
|
4151
4129
|
const testCase = tests[index];
|
|
4152
4130
|
invariant(typeof testSuite.defaultTest !== "object" || Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -4166,7 +4144,7 @@ var Evaluator = class {
|
|
|
4166
4144
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4167
4145
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4168
4146
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4169
|
-
const { loadApiProvider } = await import("./providers-
|
|
4147
|
+
const { loadApiProvider } = await import("./providers-C1rOSHiR.js");
|
|
4170
4148
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4171
4149
|
} else testCase.provider = defaultProvider;
|
|
4172
4150
|
}
|
|
@@ -4193,7 +4171,7 @@ var Evaluator = class {
|
|
|
4193
4171
|
const promptId = generateIdFromPrompt(prompt);
|
|
4194
4172
|
const promptIdx = promptIndexMap.get(`${providerKey}:${promptId}`);
|
|
4195
4173
|
if (promptIdx === void 0) {
|
|
4196
|
-
|
|
4174
|
+
logger.warn(`Could not find prompt index for ${providerKey}:${promptId}, skipping`);
|
|
4197
4175
|
continue;
|
|
4198
4176
|
}
|
|
4199
4177
|
runEvalOptions.push({
|
|
@@ -4216,7 +4194,7 @@ var Evaluator = class {
|
|
|
4216
4194
|
options: testOptions
|
|
4217
4195
|
};
|
|
4218
4196
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || testCase.metadata?.tracingEnabled === true || testSuite.tracing?.enabled === true;
|
|
4219
|
-
|
|
4197
|
+
logger.debug(`[Evaluator] Tracing check: env=${getEnvBool("PROMPTFOO_TRACING_ENABLED", false)}, testCase.metadata?.tracingEnabled=${testCase.metadata?.tracingEnabled}, testSuite.tracing?.enabled=${testSuite.tracing?.enabled}, tracingEnabled=${tracingEnabled}`);
|
|
4220
4198
|
if (tracingEnabled) return {
|
|
4221
4199
|
...baseTest,
|
|
4222
4200
|
metadata: {
|
|
@@ -4249,27 +4227,27 @@ var Evaluator = class {
|
|
|
4249
4227
|
if (evalOption.test.assert?.some((a) => a.type === "select-best")) rowsWithSelectBestAssertion.add(evalOption.testIdx);
|
|
4250
4228
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4251
4229
|
}
|
|
4252
|
-
if (
|
|
4253
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
4254
|
-
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors:
|
|
4230
|
+
if (state.resume && this.evalRecord.persisted) try {
|
|
4231
|
+
const { default: EvalResult } = await import("./evalResult-Cqj8pldJ.js");
|
|
4232
|
+
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4255
4233
|
const originalCount = runEvalOptions.length;
|
|
4256
4234
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
4257
4235
|
const step = runEvalOptions[i];
|
|
4258
4236
|
if (completedPairs.has(`${step.testIdx}:${step.promptIdx}`)) runEvalOptions.splice(i, 1);
|
|
4259
4237
|
}
|
|
4260
4238
|
const skipped = originalCount - runEvalOptions.length;
|
|
4261
|
-
if (skipped > 0)
|
|
4239
|
+
if (skipped > 0) logger.info(`Resuming: skipping ${skipped} previously completed cases`);
|
|
4262
4240
|
} catch (err) {
|
|
4263
|
-
|
|
4241
|
+
logger.warn(`Resume: failed to load completed results. Running full evaluation. ${String(err)}`);
|
|
4264
4242
|
}
|
|
4265
4243
|
if (concurrency > 1) {
|
|
4266
4244
|
const usesConversation = prompts.some((p) => p.raw.includes("_conversation"));
|
|
4267
4245
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
4268
4246
|
if (usesConversation) {
|
|
4269
|
-
|
|
4247
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan("_conversation")} variable is used.`);
|
|
4270
4248
|
concurrency = 1;
|
|
4271
4249
|
} else if (usesStoreOutputAs) {
|
|
4272
|
-
|
|
4250
|
+
logger.info(`Setting concurrency to 1 because storeOutputAs is used.`);
|
|
4273
4251
|
concurrency = 1;
|
|
4274
4252
|
}
|
|
4275
4253
|
}
|
|
@@ -4300,14 +4278,14 @@ var Evaluator = class {
|
|
|
4300
4278
|
await this.evalRecord.addResult(row);
|
|
4301
4279
|
} catch (error) {
|
|
4302
4280
|
const resultSummary = summarizeEvaluateResultForLogging(row);
|
|
4303
|
-
|
|
4281
|
+
logger.error(`Error saving result: ${error} ${safeJsonStringify(resultSummary)}`);
|
|
4304
4282
|
}
|
|
4305
4283
|
for (const writer of this.fileWriters) await writer.write(row);
|
|
4306
4284
|
const httpStatus = row.response?.metadata?.http?.status;
|
|
4307
4285
|
if (typeof httpStatus === "number" && isNonTransientHttpStatus(httpStatus)) {
|
|
4308
4286
|
targetUnavailable = true;
|
|
4309
4287
|
targetErrorStatus = httpStatus;
|
|
4310
|
-
|
|
4288
|
+
logger.error(`Target returned HTTP ${httpStatus}. Aborting scan - this error will not resolve on retry.`);
|
|
4311
4289
|
targetErrorAbortController.abort();
|
|
4312
4290
|
break;
|
|
4313
4291
|
}
|
|
@@ -4327,7 +4305,7 @@ var Evaluator = class {
|
|
|
4327
4305
|
if (testSuite.derivedMetrics) {
|
|
4328
4306
|
const math = await import("mathjs");
|
|
4329
4307
|
const promptEvalCount = metrics.testPassCount + metrics.testFailCount + metrics.testErrorCount + 1;
|
|
4330
|
-
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count"))
|
|
4308
|
+
if (Object.prototype.hasOwnProperty.call(metrics.namedScores, "__count")) logger.warn("Metric name '__count' is reserved for derived metrics and will be overridden.");
|
|
4331
4309
|
const evalContext = {
|
|
4332
4310
|
...metrics.namedScores,
|
|
4333
4311
|
__count: promptEvalCount
|
|
@@ -4342,7 +4320,7 @@ var Evaluator = class {
|
|
|
4342
4320
|
}
|
|
4343
4321
|
evalContext[metric.name] = metrics.namedScores[metric.name];
|
|
4344
4322
|
} catch (error) {
|
|
4345
|
-
|
|
4323
|
+
logger.debug(`Could not evaluate derived metric '${metric.name}': ${error.message}`);
|
|
4346
4324
|
}
|
|
4347
4325
|
}
|
|
4348
4326
|
}
|
|
@@ -4381,7 +4359,7 @@ var Evaluator = class {
|
|
|
4381
4359
|
if (typeof evalStep.provider.cleanup === "function") try {
|
|
4382
4360
|
evalStep.provider.cleanup();
|
|
4383
4361
|
} catch (cleanupErr) {
|
|
4384
|
-
|
|
4362
|
+
logger.warn(`Error during provider cleanup: ${cleanupErr}`);
|
|
4385
4363
|
}
|
|
4386
4364
|
reject(/* @__PURE__ */ new Error(`Evaluation timed out after ${timeoutMs}ms`));
|
|
4387
4365
|
}, timeoutMs);
|
|
@@ -4445,8 +4423,8 @@ var Evaluator = class {
|
|
|
4445
4423
|
}
|
|
4446
4424
|
};
|
|
4447
4425
|
const originalProgressCallback = this.options.progressCallback;
|
|
4448
|
-
const isWebUI = Boolean(
|
|
4449
|
-
|
|
4426
|
+
const isWebUI = Boolean(state.webUI);
|
|
4427
|
+
logger.debug(`Progress bar settings: showProgressBar=${this.options.showProgressBar}, isWebUI=${isWebUI}`);
|
|
4450
4428
|
if (isCI() && !isWebUI) {
|
|
4451
4429
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4452
4430
|
ciProgressReporter.start();
|
|
@@ -4456,20 +4434,20 @@ var Evaluator = class {
|
|
|
4456
4434
|
if (isWebUI) {
|
|
4457
4435
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4458
4436
|
const vars = formatVarsForDisplay(evalStep.test.vars, 50);
|
|
4459
|
-
|
|
4437
|
+
logger.info(`[${numComplete}/${total}] Running ${provider} with vars: ${vars}`);
|
|
4460
4438
|
} else if (progressBarManager) {
|
|
4461
4439
|
const phase = evalStep.test.options?.runSerially ? "serial" : "concurrent";
|
|
4462
4440
|
progressBarManager.updateProgress(index, evalStep, phase, metrics);
|
|
4463
4441
|
} else if (ciProgressReporter) ciProgressReporter.update(numComplete);
|
|
4464
|
-
else
|
|
4442
|
+
else logger.debug(`Eval #${index + 1} complete (${numComplete} of ${runEvalOptions.length})`);
|
|
4465
4443
|
};
|
|
4466
4444
|
const serialRunEvalOptions = [];
|
|
4467
4445
|
const concurrentRunEvalOptions = [];
|
|
4468
4446
|
for (const evalOption of runEvalOptions) if (evalOption.test.options?.runSerially) serialRunEvalOptions.push(evalOption);
|
|
4469
4447
|
else concurrentRunEvalOptions.push(evalOption);
|
|
4470
4448
|
if (!this.options.silent) {
|
|
4471
|
-
if (serialRunEvalOptions.length > 0)
|
|
4472
|
-
if (concurrentRunEvalOptions.length > 0)
|
|
4449
|
+
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4450
|
+
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4473
4451
|
}
|
|
4474
4452
|
if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
4475
4453
|
try {
|
|
@@ -4478,7 +4456,7 @@ var Evaluator = class {
|
|
|
4478
4456
|
if (isWebUI) {
|
|
4479
4457
|
const provider = evalStep.provider.label || evalStep.provider.id();
|
|
4480
4458
|
const vars = formatVarsForDisplay(evalStep.test.vars || {}, 50);
|
|
4481
|
-
|
|
4459
|
+
logger.info(`[${numComplete}/${runEvalOptions.length}] Running ${provider} with vars: ${vars}`);
|
|
4482
4460
|
}
|
|
4483
4461
|
const idx = runEvalOptions.indexOf(evalStep);
|
|
4484
4462
|
await processEvalStepWithTimeout(evalStep, idx);
|
|
@@ -4493,9 +4471,9 @@ var Evaluator = class {
|
|
|
4493
4471
|
});
|
|
4494
4472
|
} catch (err) {
|
|
4495
4473
|
if (combinedAbortSignal.aborted) {
|
|
4496
|
-
if (evalTimedOut)
|
|
4474
|
+
if (evalTimedOut) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
4497
4475
|
else if (!targetUnavailable) {
|
|
4498
|
-
|
|
4476
|
+
logger.info("Evaluation interrupted, saving progress...");
|
|
4499
4477
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4500
4478
|
if (progressBarManager) progressBarManager.stop();
|
|
4501
4479
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4525,10 +4503,10 @@ var Evaluator = class {
|
|
|
4525
4503
|
let compareCount = 0;
|
|
4526
4504
|
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
4527
4505
|
compareCount++;
|
|
4528
|
-
if (isWebUI)
|
|
4506
|
+
if (isWebUI) logger.info(`Running model-graded comparison ${compareCount} of ${compareRowsCount}...`);
|
|
4529
4507
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4530
4508
|
if (resultsToCompare.length === 0) {
|
|
4531
|
-
|
|
4509
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4532
4510
|
continue;
|
|
4533
4511
|
}
|
|
4534
4512
|
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "select-best");
|
|
@@ -4590,16 +4568,16 @@ var Evaluator = class {
|
|
|
4590
4568
|
}
|
|
4591
4569
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4592
4570
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4593
|
-
else if (!isWebUI)
|
|
4571
|
+
else if (!isWebUI) logger.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
4594
4572
|
}
|
|
4595
4573
|
}
|
|
4596
4574
|
const maxScoreRowsCount = rowsWithMaxScoreAssertion.size;
|
|
4597
4575
|
if (maxScoreRowsCount > 0) {
|
|
4598
|
-
|
|
4576
|
+
logger.info(`Processing ${maxScoreRowsCount} max-score assertions...`);
|
|
4599
4577
|
for (const testIdx of rowsWithMaxScoreAssertion) {
|
|
4600
4578
|
const resultsToCompare = this.evalRecord.persisted ? await this.evalRecord.fetchResultsByTestIdx(testIdx) : this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
4601
4579
|
if (resultsToCompare.length === 0) {
|
|
4602
|
-
|
|
4580
|
+
logger.warn(`Expected results to be found for test index ${testIdx}`);
|
|
4603
4581
|
continue;
|
|
4604
4582
|
}
|
|
4605
4583
|
const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
|
|
@@ -4607,7 +4585,7 @@ var Evaluator = class {
|
|
|
4607
4585
|
const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
|
|
4608
4586
|
if (progressBarManager) progressBarManager.updateComparisonProgress(resultsToCompare[0].prompt.raw);
|
|
4609
4587
|
else if (ciProgressReporter) ciProgressReporter.update(runEvalOptions.length + compareCount);
|
|
4610
|
-
else if (!isWebUI)
|
|
4588
|
+
else if (!isWebUI) logger.debug(`Max-score assertion for test #${testIdx} complete`);
|
|
4611
4589
|
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
4612
4590
|
const result = resultsToCompare[index];
|
|
4613
4591
|
const maxScoreGradingResult = {
|
|
@@ -4651,7 +4629,7 @@ var Evaluator = class {
|
|
|
4651
4629
|
progressBarManager.stop();
|
|
4652
4630
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
4653
4631
|
} catch (cleanupErr) {
|
|
4654
|
-
|
|
4632
|
+
logger.warn(`Error during progress reporter cleanup: ${cleanupErr}`);
|
|
4655
4633
|
}
|
|
4656
4634
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4657
4635
|
if (evalTimedOut) {
|
|
@@ -4724,7 +4702,7 @@ var Evaluator = class {
|
|
|
4724
4702
|
return idParts.length > 1 ? idParts[0] : "unknown";
|
|
4725
4703
|
})));
|
|
4726
4704
|
const timeoutOccurred = evalTimedOut || this.evalRecord.results.some((r) => r.failureReason === ResultFailureReason.ERROR && r.error?.includes("timed out"));
|
|
4727
|
-
|
|
4705
|
+
telemetry.record("eval_ran", {
|
|
4728
4706
|
numPrompts: prompts.length,
|
|
4729
4707
|
numTests: this.stats.successes + this.stats.failures + this.stats.errors,
|
|
4730
4708
|
numRequests: this.stats.tokenUsage.numRequests || 0,
|
|
@@ -4772,26 +4750,26 @@ var Evaluator = class {
|
|
|
4772
4750
|
await startOtlpReceiverIfNeeded(this.testSuite);
|
|
4773
4751
|
const tracingEnabled = getEnvBool("PROMPTFOO_TRACING_ENABLED", false) || this.testSuite.tracing?.enabled === true || typeof this.testSuite.defaultTest === "object" && this.testSuite.defaultTest?.metadata?.tracingEnabled === true || this.testSuite.tests?.some((t) => t.metadata?.tracingEnabled === true);
|
|
4774
4752
|
if (tracingEnabled) {
|
|
4775
|
-
|
|
4753
|
+
logger.debug("[Evaluator] Initializing OTEL SDK for tracing");
|
|
4776
4754
|
initializeOtel(getDefaultOtelConfig());
|
|
4777
4755
|
}
|
|
4778
4756
|
try {
|
|
4779
4757
|
return await this._runEvaluation();
|
|
4780
4758
|
} finally {
|
|
4781
4759
|
if (tracingEnabled) {
|
|
4782
|
-
|
|
4760
|
+
logger.debug("[Evaluator] Flushing OTEL spans...");
|
|
4783
4761
|
await flushOtel();
|
|
4784
4762
|
await shutdownOtel();
|
|
4785
4763
|
}
|
|
4786
4764
|
if (isOtlpReceiverStarted()) {
|
|
4787
|
-
|
|
4765
|
+
logger.debug("[Evaluator] Waiting for span exports to complete...");
|
|
4788
4766
|
await sleep(3e3);
|
|
4789
4767
|
}
|
|
4790
4768
|
await stopOtlpReceiverIfNeeded();
|
|
4791
4769
|
await providerRegistry.shutdownAll();
|
|
4792
4770
|
if (this.rateLimitRegistry) {
|
|
4793
4771
|
const metrics = this.rateLimitRegistry.getMetrics();
|
|
4794
|
-
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0)
|
|
4772
|
+
for (const [key, m] of Object.entries(metrics)) if (m.totalRequests > 0) logger.debug(`[Scheduler] Final metrics for ${key}`, {
|
|
4795
4773
|
totalRequests: m.totalRequests,
|
|
4796
4774
|
completedRequests: m.completedRequests,
|
|
4797
4775
|
failedRequests: m.failedRequests,
|
|
@@ -4804,14 +4782,14 @@ var Evaluator = class {
|
|
|
4804
4782
|
}
|
|
4805
4783
|
this.rateLimitRegistry?.dispose();
|
|
4806
4784
|
redteamProviderManager.setRateLimitRegistry(void 0);
|
|
4807
|
-
|
|
4785
|
+
state.maxConcurrency = void 0;
|
|
4808
4786
|
}
|
|
4809
4787
|
}
|
|
4810
4788
|
};
|
|
4811
4789
|
function evaluate(testSuite, evalRecord, options) {
|
|
4812
4790
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4813
4791
|
}
|
|
4814
|
-
|
|
4815
4792
|
//#endregion
|
|
4816
4793
|
export { runEval as a, readAssertions as c, isAllowedPrompt as i, renderMetricName as l, formatVarsForDisplay as n, doesPromptRefMatch as o, generateVarCombinations as r, assertions_default as s, evaluate as t, runAssertions as u };
|
|
4817
|
-
|
|
4794
|
+
|
|
4795
|
+
//# sourceMappingURL=evaluator-DPFRbFIL.js.map
|