azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (72) hide show
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
@@ -1,103 +1,106 @@
1
1
  azure/ai/evaluation/__init__.py,sha256=MFxJRoKfSsP_Qlfq0FwynxNf4csNAfTYPQX7jdXc9RU,2757
2
- azure/ai/evaluation/_constants.py,sha256=KGjzbFKCk0O6xCH57VdKK6CKC0JwS25ouYOQOYCB_6M,1942
3
- azure/ai/evaluation/_exceptions.py,sha256=91Ovrj9t4nbpJM7GRK3rzwxXk-xLq6WLLzm44GUgt3s,5057
4
- azure/ai/evaluation/_http_utils.py,sha256=oVbRaxUm41tVFGkYpZdHjT9ss_9va1NzXYuV3DUVr8k,17125
5
- azure/ai/evaluation/_model_configurations.py,sha256=TklC7ke0jXtLitTQaQAGT5SJgV098XGUHY7On2_IFY4,2249
2
+ azure/ai/evaluation/_constants.py,sha256=d41rQb-w2GmCMHOwiyDD1ieJB1U6JyPPl6APZSJbKzg,2036
3
+ azure/ai/evaluation/_exceptions.py,sha256=MsTbgsPGYPzIxs7MyLKzSeiVKEoCxYkVjONzNfv2tXA,5162
4
+ azure/ai/evaluation/_http_utils.py,sha256=1bGce6pKAL-vmaUGRPxVX7DVO05XVQ8YPIwIQ3q7mfA,17221
5
+ azure/ai/evaluation/_model_configurations.py,sha256=MNN6cQlz7P9vNfHmfEKsUcly3j1FEOEFsA8WV7GPuKQ,4043
6
6
  azure/ai/evaluation/_user_agent.py,sha256=O2y-QPBAcw7w7qQ6M2aRPC3Vy3TKd789u5lcs2yuFaI,290
7
- azure/ai/evaluation/_version.py,sha256=mCv_uIychD87cYcoY1AwWAtaTQtk7P0sZUlJF8HsIcY,201
7
+ azure/ai/evaluation/_version.py,sha256=LzMvSuUB6pmU-LfCPzoYuCoTF0BAqE7ljPjk6r8YaMw,199
8
8
  azure/ai/evaluation/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ azure/ai/evaluation/_azure/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
10
+ azure/ai/evaluation/_azure/_clients.py,sha256=1mFRSxt-Ld5UBn-m3DJkKc-VPP9CbXQHrqLNdLs9RF0,8201
11
+ azure/ai/evaluation/_azure/_models.py,sha256=tKpxjb5Ou476UasjXPCiuvsxTjLTrnoVnSXy5Bfa51M,12483
12
+ azure/ai/evaluation/_azure/_token_manager.py,sha256=1NZHwgEc9BMXWPz5Ear_J5-oYjouD77crLHHqNLldEw,5193
9
13
  azure/ai/evaluation/_common/__init__.py,sha256=LHTkf6dMLLxikrGNgbUuREBVQcs4ORHR6Eryo4bm9M8,586
10
- azure/ai/evaluation/_common/_experimental.py,sha256=hmr9l9hHFNj6iEmBuMawdnnl54YzJrylbB7Dk6cs7cM,5565
14
+ azure/ai/evaluation/_common/_experimental.py,sha256=GVtSn9r1CeR_yEa578dJVNDJ3P24eqe8WYdH7llbiQY,5694
11
15
  azure/ai/evaluation/_common/constants.py,sha256=OsExttFGLnTAyZa26jnY5_PCDTb7uJNFqtE2qsRZ1mg,1957
12
- azure/ai/evaluation/_common/math.py,sha256=Y47ljvImn47xuW32enI2O6V7-7SBkraWeyXdJiYw41Q,927
13
- azure/ai/evaluation/_common/rai_service.py,sha256=zi2iha6y9HphzZlia9ig3riZ_2SGMHF0dfY4l866JXw,23402
14
- azure/ai/evaluation/_common/utils.py,sha256=7F5C_mZgR4MIIihCTFa5yUDZka0-g7G4KLsITQPq0gE,16080
16
+ azure/ai/evaluation/_common/math.py,sha256=d4bwWe35_RWDIZNcbV1BTBbHNx2QHQ4-I3EofDyyNE0,2863
17
+ azure/ai/evaluation/_common/rai_service.py,sha256=DcakzdOour9qNdMXU-8UFfvLb12oexAoiJXG8XFTRBs,26462
18
+ azure/ai/evaluation/_common/utils.py,sha256=MQIZs95gH5je1L-S3twa_WQi071zRu0Dv54lzCI7ZgU,17642
15
19
  azure/ai/evaluation/_evaluate/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
16
- azure/ai/evaluation/_evaluate/_eval_run.py,sha256=XppywHqCZeFguH5_WSIReKA6MAAe2j9hdso6jM_67Po,22283
17
- azure/ai/evaluation/_evaluate/_evaluate.py,sha256=77gJyIg7m9XJTm3qz6Q4yKSv1aZ19WoVpmmXyQlSqPk,38178
18
- azure/ai/evaluation/_evaluate/_utils.py,sha256=SAlVwU_5P2ls-394kN97QwmrAApzck8T3i-7LbVyZtg,12320
20
+ azure/ai/evaluation/_evaluate/_eval_run.py,sha256=het3cxjK4J-_hT19dT5a0mC2Cdnk93gM3ONQMJb9bxQ,21923
21
+ azure/ai/evaluation/_evaluate/_evaluate.py,sha256=P5aL70eUBKZT9CVRM9RVSfD0DkuljQyc5ECte37Ycmo,36225
22
+ azure/ai/evaluation/_evaluate/_utils.py,sha256=S4LUUDUBo9JNA41ojSezMC-PZzkWcihhhNdyZwZrpr0,13428
19
23
  azure/ai/evaluation/_evaluate/_batch_run/__init__.py,sha256=G8McpeLxAS_gFhNShX52_YWvE-arhJn-bVpAfzjWG3Q,427
20
24
  azure/ai/evaluation/_evaluate/_batch_run/code_client.py,sha256=XQLaXfswF6ReHLpQthHLuLLa65Pts8uawGp7kRqmMDs,8260
21
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py,sha256=1nnaUej4cOiPD9lH58Mt-RhHYd7gDe8G5kZg7w6Gkrs,3196
22
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py,sha256=88zkK6ATyMaUAmk8WAHccO2x9XO-6Ibr4Ggbs4wPmg0,3339
23
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py,sha256=IoueIPzyzK4Kt7ZoC3m9_0BpSY1pSB2H2qFi_6EBApg,1249
24
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py,sha256=cN6Y6Zq7kOv_EGwtKOO97PYYNiTlQmFUuHAROxq_Au8,6957
25
+ azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py,sha256=p3Bsg_shGs5RXvysOlvo0CQb4Te5herSvX1OP6ylFUQ,3543
26
+ azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py,sha256=T_QRHScDMBM4O6ejkkKdBmHPjH2NOF6owW48aVUYF6k,3775
27
+ azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py,sha256=SMos3bVmD73pK6gpIaL4iZZS3-Zda3V4N89Jg0J9sss,1636
28
+ azure/ai/evaluation/_evaluate/_telemetry/__init__.py,sha256=fhLqE41qxdjfBOGi23cpk6QgUe-s1Fw2xhAAUjNESF0,7045
25
29
  azure/ai/evaluation/_evaluators/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
26
30
  azure/ai/evaluation/_evaluators/_bleu/__init__.py,sha256=quKKO0kvOSkky5hcoNBvgBuMeeVRFCE9GSv70mAdGP4,260
27
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py,sha256=G5oZbR_3fPcuBlhQgIow61Tw7W3cL1ugOFcwWCgvT8U,2425
31
+ azure/ai/evaluation/_evaluators/_bleu/_bleu.py,sha256=iT20SMmEtOnh7RWs55dFfAlKXNkNceXkCUbVyqv6aQ0,2776
28
32
  azure/ai/evaluation/_evaluators/_coherence/__init__.py,sha256=GRqcSCQse02Spyki0UsRNWMIXiea2lLtPPXNGvkJzQ0,258
29
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=TMyTHXu0t0S0j3MRLCcFFDnn78d2-SF92uZzlNG7azI,2956
33
+ azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=uG9hX2XWkMREKfMAWRoosjicoI4Lg3ptR3UcLEgKd0c,4643
30
34
  azure/ai/evaluation/_evaluators/_coherence/coherence.prompty,sha256=ANvh9mDFW7KMejrgdWqBLjj4SIqEO5WW9gg5pE0RLJk,6798
31
35
  azure/ai/evaluation/_evaluators/_common/__init__.py,sha256=_hPqTkAla_O6s4ebVtTaBrVLEW3KSdDz66WwxjK50cI,423
32
- azure/ai/evaluation/_evaluators/_common/_base_eval.py,sha256=32R2APcWEjvHFhtVU-Vkga9QP9Kr4df_ZZkz5xGD4GE,15419
33
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py,sha256=mwD6DxcAjNryWW98PgB6-L1BRSwRg9ONjJfjaMirpn8,3853
34
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py,sha256=SiIpGPotBKj-GohJVL_dnIWQimImnNuZyCI9m-HZssA,5916
36
+ azure/ai/evaluation/_evaluators/_common/_base_eval.py,sha256=n6qldJr8d8H0DnS7IwkQPH9Ep9PdZnVeVtSxQiunADc,19424
37
+ azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py,sha256=hvJD7jR2ESePkRPN17ytoFhFiS0iTotOfeqmTwG2IMs,4531
38
+ azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py,sha256=czyn1MfaxOmrvvFgdeblf6FaauKgKolgPFsP5f7K29w,7331
35
39
  azure/ai/evaluation/_evaluators/_content_safety/__init__.py,sha256=PEYMIybfP64f7byhuTaiq4RiqsYbjqejpW1JsJIG1jA,556
36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py,sha256=-O2frtWs2XMCnvBo5HFPnxW-MF9_L9QGcxVo360ZBMY,5801
37
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py,sha256=ojhzAbIUgKpJxgEGE2MKpgD091Q8HfvEpgoajus_dI0,12889
38
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=p53WfUr_tyoYqPiHkoikPrwERsxNTE7QUw3i4VBgA58,2949
39
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=lFYTtQUE0ub1zr6cqQyUQP9igHIljqFGHQFNx6EemH8,2905
40
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=DgtY7eQyQu_I85-2zQGP_h3w1oj97RHnoUw30lY9Y0w,2880
41
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=5K5UENljzfFU5m2gXUI0vvzFCEch_xZTzEsG7MYJYQw,2897
40
+ azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py,sha256=CIGfBLNOTVXrlF5HIc2UpuDDG5BfzjD7ubJ23CbvobQ,6341
41
+ azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=sjw8FfwxC1f0K1J4TkeA8wkfq88aebiNbaKzS-8DWzk,5919
42
+ azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=0zaB-JKm8FU6yoxD1nqoYvxp3gvjuZfcQjb-xhSHoQ0,5156
43
+ azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=q9bEMu6Dp1wxDlH3h2iTayrWv4ux-izLB0kGkxrgEhM,5396
44
+ azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=W2QwPuWOc3nkLvvWOAhCrpLRDAAo-xG1SvlDhrshzUc,5467
42
45
  azure/ai/evaluation/_evaluators/_eci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
- azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=gr7gfQnzrf3qXSJ7uf0iwwDg63SgaJjlhapKAa7WH5U,2435
46
+ azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=a36sLZPHKi3YAdl0JvpL6vboZMqgGjnmz0qZ-o8vcWY,2934
44
47
  azure/ai/evaluation/_evaluators/_f1_score/__init__.py,sha256=aEVbO7iMoF20obdpLQKcKm69Yyu3mYnblKELLqu8OGI,260
45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py,sha256=KeYL4Z7cO0Yb_pOAq-3WePUgSqNnci0uA3AH2r41VB4,4786
48
+ azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py,sha256=YtPEG1ZT0jAPvEnOpD2Eaojm-8zS61bxOr3US6vvgqc,5779
46
49
  azure/ai/evaluation/_evaluators/_fluency/__init__.py,sha256=EEJw39xRa0bOAA1rELTTKXQu2s60n_7CZQRD0Gu2QVw,259
47
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=QIe6EqPmYxNtaD6_KvOEwKQpEGZfHmxTkywDLcPak-k,2739
50
+ azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=mHQCismdL4cCeANcqWrDHCiVgr4UAWj0yIYJXt2pFDA,4399
48
51
  azure/ai/evaluation/_evaluators/_fluency/fluency.prompty,sha256=n9v0W9eYwgIO-JSsLTSKEM_ApJuxxuKWQpNblrTEkFY,4861
49
52
  azure/ai/evaluation/_evaluators/_gleu/__init__.py,sha256=Ae2EvQ7gqiYAoNO3LwGIhdAAjJPJDfT85rQGKrRrmbA,260
50
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py,sha256=tDY9F70NfSq60HmNprrJ4OGC8mk-1_mvLQ2SXShxVig,2338
53
+ azure/ai/evaluation/_evaluators/_gleu/_gleu.py,sha256=RaY_RZ5A3sMx4yE6uCyjvchB8rRoMvIv0JYYyMBXFM8,2696
51
54
  azure/ai/evaluation/_evaluators/_groundedness/__init__.py,sha256=UYNJUeRvBwcSVFyZpdsf29un5eyaDzYoo3QvC1gvlLg,274
52
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=W-56hA2KaBIfgfl41cJaYgdaf3Fs5Jku96xouAShWpI,4629
55
+ azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=Zil5S7BXaVvW2wBUlsF3oGzZLOYrvSzGAY4TqKfFUX8,6876
53
56
  azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty,sha256=v7TOm75DyW_1gOU6gSiZoPcRnHcJ65DrzR2cL_ucWDY,5814
54
57
  azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty,sha256=8kNShdfxQvkII7GnqjmdqQ5TNelA2B6cjnqWZk8FFe4,5296
55
58
  azure/ai/evaluation/_evaluators/_meteor/__init__.py,sha256=209na3pPsdmcuYpYHUYtqQybCpc3yZkc93HnRdicSlI,266
56
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py,sha256=c1SMbv70Z1fH7QHO2oiYmRidNBHGeUVN_2Xs_nVlHZE,3260
59
+ azure/ai/evaluation/_evaluators/_meteor/_meteor.py,sha256=UPNvWpNkMlx8NmOPuSkcXF1DA_daDdrRArhJAbbTQkc,3767
57
60
  azure/ai/evaluation/_evaluators/_multimodal/__init__.py,sha256=tPvsY0nv8T3VtiiAwJM6wT5A9FhKP2XXwUlCH994xl4,906
58
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py,sha256=lowKPujN4Q5OUnVpnn9XUua2sq9XLVU5CYA4g-eyKU4,5182
59
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py,sha256=nKqY1RSieSQ1Qsy4QTeBupzUPW3fhNSqlynd7642NTo,2522
60
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py,sha256=Jk5u4YZH62G2uxDd2bPyfKobVvuN9N5LQmLL7lMRLL4,3605
61
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py,sha256=7SSmGbTckd9FPHSqGwMQxFlmMxTnxXSzrB4G6Kgpfww,4672
62
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py,sha256=pd-QjaXlJ3k9DMxOcrqxIWfB6gut0Kd3o7mHxGM6QRU,3535
63
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py,sha256=u2Id-HFAcUj7EG-zVMqwOUlqOh6MN_lnYZ2OYuBMUj0,3503
64
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py,sha256=Z9_MXkRnf8pbv07bXD6d5WLIXwcxkaB_zz64cof83Kw,3527
61
+ azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py,sha256=x0l6eLQhxVP85jEyGfFCl27C2okMgD0S3aJ_qrgB3Q8,5219
62
+ azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py,sha256=X2IVw0YvymDD3e4Vx-TfjqgqtYiAKVhUumjBowCpOmA,2441
63
+ azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py,sha256=ral1AAbP5pfsygDe30MtuwajuydiXoXzzCeuLBzIkWc,3779
64
+ azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py,sha256=gMrfyn3KHcV6SoowuEjR7Fon9vVLN7GOPM4rkJRK6xU,4906
65
+ azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py,sha256=QwOCBb618ZXSs-OoVXyNM65N4ZEL7IZt-S1Nqd8xNbY,3703
66
+ azure/ai/evaluation/_evaluators/_multimodal/_sexual.py,sha256=6zz89yzr_SdldqBVv-3wOErz3H5sBO6wYgNh39aHXmY,3668
67
+ azure/ai/evaluation/_evaluators/_multimodal/_violence.py,sha256=t1h3bY6N7SwlSgP_1P-90KGTsq1oWvTYDJpy_uMvzjA,3694
65
68
  azure/ai/evaluation/_evaluators/_protected_material/__init__.py,sha256=eRAQIU9diVXfO5bp6aLWxZoYUvOsrDIfy1gnDOeNTiI,109
66
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=h3pLEkf4gvzvimvmsxr5haA0_wq02EI6kn4tIataZMI,3325
69
+ azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=IABs1YMBZdIi1u57dPi-aQpSiPWIGxEZ4hyt97jvdNA,4604
67
70
  azure/ai/evaluation/_evaluators/_qa/__init__.py,sha256=bcXfT--C0hjym2haqd1B2-u9bDciyM0ThOFtU1Q69sk,244
68
- azure/ai/evaluation/_evaluators/_qa/_qa.py,sha256=k0a5RJO5UrCNzJIzsGI6nyQ2aBXHALGYB2aMz880wDY,3742
71
+ azure/ai/evaluation/_evaluators/_qa/_qa.py,sha256=kLkXwkmrXqgfBu7MJwEYAobeqGh4b4zE7cjIkD_1iwA,3854
69
72
  azure/ai/evaluation/_evaluators/_relevance/__init__.py,sha256=JlxytW32Nl8pbE-fI3GRpfgVuY9EG6zxIAn5VZGSwyc,265
70
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=-lCbVq84rX1JUmlWoUYNdcCWNFXtH_0JhvL4pnxJyHQ,3307
73
+ azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=S1J5BR1-ZyCLQOTbdAHLDzzY1ccVnPyy9uVUlivmCx0,5287
71
74
  azure/ai/evaluation/_evaluators/_relevance/relevance.prompty,sha256=VHKzVlC2Cv1xuholgIGmerPspspAI0t6IgJ2cxOuYDE,4811
72
75
  azure/ai/evaluation/_evaluators/_retrieval/__init__.py,sha256=kMu47ZyTZ7f-4Yh6H3KHxswmxitmPJ8FPSk90qgR0XI,265
73
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py,sha256=NNSsg5Zd8w_OJ5QKY9DnCPb5d_P3trXE_Kqe8uEWe0o,8088
76
+ azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py,sha256=fmd8zNOVSGQGT5icSAI6PwgnS7kKz_ZMKMnxKIchYl8,5085
74
77
  azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty,sha256=_YVoO4Gt_WD42bUcj5n6BDW0dMUqNf0yF3Nj5XMOX2c,16490
75
78
  azure/ai/evaluation/_evaluators/_rouge/__init__.py,sha256=kusCDaYcXogDugGefRP8MQSn9xv107oDbrMCqZ6K4GA,291
76
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py,sha256=ZSPRc-6WnpAHxlEwzq-_-5h_7GbtZhrOfEWSEiY4vYk,3566
79
+ azure/ai/evaluation/_evaluators/_rouge/_rouge.py,sha256=SV5rESLVARQqh1n0Pf6EMvJoJH3A0nNKM_U33q1LQoE,4026
77
80
  azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py,sha256=0DODUGTOgaYyFbO9_zxuwifixDL3SIm3EkwP1sdwn6M,288
78
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py,sha256=e6mFUioiyCIWnS01Ec2yikvtkg1zTel1NfdhAgcmvKc,5909
81
+ azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py,sha256=GPvufAgTnoQ2HYs6Xnnpmh23n5E3XxnUV0NGuwjDyU0,6648
79
82
  azure/ai/evaluation/_evaluators/_similarity/__init__.py,sha256=V2Mspog99_WBltxTkRHG5NpN5s9XoiTSN4I8POWEkLA,268
80
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=p2BIdulB7ALYurBiltlV6wkHRm7Cu5J3UvWdp2JGyy0,4735
83
+ azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=UVBIa1xIlOIJtPctCu-UCOWvXzE4ysaK_XFdokajCuA,5669
81
84
  azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=eoludASychZoGL625bFCaZai-OY7DIAg90ZLax_o4XE,4594
82
85
  azure/ai/evaluation/_evaluators/_xpia/__init__.py,sha256=VMEL8WrpJQeh4sQiOLzP7hRFPnjzsvwfvTzaGCVJPCM,88
83
- azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=zpUpt92SBvUFIiEqbkukNvmPgRWermpHfE4L_D_VWqU,3546
86
+ azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=Nv14lU7jN0yXKbHgHRXMHEy6pn1rXmesBOYI2Ge9ewk,5849
84
87
  azure/ai/evaluation/_vendor/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
85
88
  azure/ai/evaluation/_vendor/rouge_score/__init__.py,sha256=03OkyfS_UmzRnHv6-z9juTaJ6OXJoEJM989hgifIZbc,607
86
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py,sha256=xDdNtzwtivcdki5RyErEI9BaQ7nksgj4bXYrGz7tLLs,11409
87
- azure/ai/evaluation/_vendor/rouge_score/scoring.py,sha256=ruwkMrJFJNvs3GWqVLAXudIwDa4EsX_d30pfUPUTf8E,1988
88
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py,sha256=tdSsUibKxtOMY8fdqGK_3-4sMbeOxZEG6D6L7suDTxQ,1936
89
+ azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py,sha256=DtNSeshHipzc6vFnvx7kbs5viXe4LNq-ZrgllFvfR4U,11299
90
+ azure/ai/evaluation/_vendor/rouge_score/scoring.py,sha256=0sqdiNE-4R_EmTTqyWL9_DAOgl54250H5004tZDGxEE,1878
91
+ azure/ai/evaluation/_vendor/rouge_score/tokenize.py,sha256=IyHVsWY6IFFZdB23cLiJs8iBZ0DXk1mQlWE1xtdjuuk,1826
89
92
  azure/ai/evaluation/_vendor/rouge_score/tokenizers.py,sha256=3_-y1TyvyluHuERhSJ5CdXSwnpcMA7aAKU6PCz9wH_Q,1745
90
93
  azure/ai/evaluation/simulator/__init__.py,sha256=JbrPZ8pvTBalyX94SvZ9btHNoovX8rbZV03KmzxxWys,552
91
- azure/ai/evaluation/simulator/_adversarial_scenario.py,sha256=yBZshqnpsqqfZWq2_vAVttgGBNb108kAXR70yURJTyg,1131
92
- azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=ad7tOA09m-VRmQyrdIPHkPOppPU5B_DYVlS4eD6AJ8c,21125
93
- azure/ai/evaluation/simulator/_constants.py,sha256=xM-Or2x7RytfoeBM3N7Vt4JQDJX66UdL3CPz0YN5rvE,485
94
- azure/ai/evaluation/simulator/_direct_attack_simulator.py,sha256=cjfJ_Fq2FKtOnhDsUM6piTNqd_2efb0Lz-agS5DEK28,9765
95
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py,sha256=xEAsejGnMRZLkM-_W30nDVGE50VRlUrb0b5UQwFQjDI,9685
96
- azure/ai/evaluation/simulator/_simulator.py,sha256=KzixUmdW9emTmtzwghVBivr860p7J5If7-q0CfTJP58,35870
94
+ azure/ai/evaluation/simulator/_adversarial_scenario.py,sha256=9rpAPz594tYjxzM3XMeDq6CZSc2yvf5YaNaGC7nzYhM,1710
95
+ azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=FPZ3OdpGuwCHDVoOZW-f_j7pyK71PfDN3JPh205tW0c,21706
96
+ azure/ai/evaluation/simulator/_constants.py,sha256=nCL7_1BnYh6k0XvxudxsDVMbiG9MMEvYw5wO9FZHHZ8,857
97
+ azure/ai/evaluation/simulator/_direct_attack_simulator.py,sha256=FTtWf655dHJF5FLJi0xGSBgIlGWNiVWyqaLDJSud9XA,10199
98
+ azure/ai/evaluation/simulator/_indirect_attack_simulator.py,sha256=nweIU_AkUIR50qLQpjmljf_OkpsCPth2Ebf4vusygCA,10226
99
+ azure/ai/evaluation/simulator/_simulator.py,sha256=pWxVfy9ll6gmOyGEk6Ie7Y48X21wJ5DebqY8Re0SIOk,36213
97
100
  azure/ai/evaluation/simulator/_tracing.py,sha256=frZ4-usrzINast9F4-ONRzEGGox71y8bYw0UHNufL1Y,3069
98
- azure/ai/evaluation/simulator/_utils.py,sha256=KVwts0jSoVk7jv5NX1vT_sKD7WqNpHT06ALow1I5dTA,4313
99
- azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=ulkkJkvRBRROLp_wpAKy1J-HLMJi3Yq6g7Q6VGRuD88,12914
100
- azure/ai/evaluation/simulator/_conversation/_conversation.py,sha256=vzKdpItmUjZrM5OUSkS2UkTnLnKvIzhak5hZ8xvFwnU,7403
101
+ azure/ai/evaluation/simulator/_utils.py,sha256=16NltlywpbMtoFtULwTKqeURguIS1kSKSo3g8uKV8TA,5181
102
+ azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=s8djzJ58_-CiIA8xHB-SbgeZaq1F7ftrc3qJbpUpUdg,17853
103
+ azure/ai/evaluation/simulator/_conversation/_conversation.py,sha256=qdzGMtCPYMxeGpR91NZTEmmz2RtADTvQGj6C-3EUTw4,7402
101
104
  azure/ai/evaluation/simulator/_conversation/constants.py,sha256=3v7zkjPwJAPbSpJYIK6VOZZy70bJXMo_QTVqSFGlq9A,984
102
105
  azure/ai/evaluation/simulator/_data_sources/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
103
106
  azure/ai/evaluation/simulator/_data_sources/grounding.json,sha256=jqdqHrCgS7hN7K2kXSEcPCmzFjV4cv_qcCSR-Hutwx4,1257075
@@ -105,16 +108,16 @@ azure/ai/evaluation/simulator/_helpers/__init__.py,sha256=FQwgrJvzq_nv3wF9DBr2py
105
108
  azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py,sha256=7BBLH78b7YDelHDLbAIwf-IO9s9cAEtn-RRXmNReHdc,1017
106
109
  azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py,sha256=BOttMTec3muMiA4OzwD_iW08GTrhja7PL9XVjRCN3jM,3029
107
110
  azure/ai/evaluation/simulator/_model_tools/__init__.py,sha256=aMv5apb7uVjuhMF9ohhA5kQmo652hrGIJlhdl3y2R1I,835
108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py,sha256=bkVRfc9q3FV72CKtK1utQUSjVvLnGB18qPzRjKbjGxQ,6303
111
+ azure/ai/evaluation/simulator/_model_tools/_identity_manager.py,sha256=-hptp2vpJIcfjvtd0E2c7ry00LVh23LxuYGevsNFfgs,6385
109
112
  azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py,sha256=Zg_SzqjCGJ3Wt8hktxz6Y1JEJCcV0V5jBC9N06jQP3k,8984
110
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py,sha256=Bi0tLNlJmz295mdoVaE9_6a_UJVRmCH5uAmxjslS_eQ,7037
111
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py,sha256=FGKLsWL0FZry47ZxFi53FSem8PZmh0iIy3JN4PBg5Tg,7036
113
+ azure/ai/evaluation/simulator/_model_tools/_rai_client.py,sha256=40MGzIXGv7oVshWH7AbOPLCigI4HlMrqbF2Rq5jFMGo,8755
114
+ azure/ai/evaluation/simulator/_model_tools/_template_handler.py,sha256=NQWqjE7csSzkhb2XdW82AoCA-DxixpTrfBxAnOt2Wlc,7075
112
115
  azure/ai/evaluation/simulator/_model_tools/models.py,sha256=bfVm0PV3vfH_8DkdmTMZqYVN-G51hZ6Y0TOO-NiysJY,21811
113
116
  azure/ai/evaluation/simulator/_prompty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
117
  azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=2BzSqDDYilDushvR56vMRDmqFIaIYAewdUlUZg_elMg,2182
115
118
  azure/ai/evaluation/simulator/_prompty/task_simulate.prompty,sha256=NE6lH4bfmibgMn4NgJtm9_l3PMoHSFrfjjosDJEKM0g,939
116
- azure_ai_evaluation-1.0.0b5.dist-info/METADATA,sha256=WDO8Eb37IZEaXzmpFoSmFvRHYxM6M_vnH5TC7t5m29I,25730
117
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt,sha256=4tzi_Yq4-eBGhBvveobWHCgUIVF-ZeouGN0m7hVq5Mk,3592
118
- azure_ai_evaluation-1.0.0b5.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
119
- azure_ai_evaluation-1.0.0b5.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
120
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD,,
119
+ azure_ai_evaluation-1.1.0.dist-info/METADATA,sha256=zusuZTIcO7487bWQK5V7XZ-Pbqugm7HFzvcCV5yfTmk,28751
120
+ azure_ai_evaluation-1.1.0.dist-info/NOTICE.txt,sha256=4tzi_Yq4-eBGhBvveobWHCgUIVF-ZeouGN0m7hVq5Mk,3592
121
+ azure_ai_evaluation-1.1.0.dist-info/WHEEL,sha256=pL8R0wFFS65tNSRnaOVrsw9EOkOqxLrlUPenUYnJKNo,91
122
+ azure_ai_evaluation-1.1.0.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
123
+ azure_ai_evaluation-1.1.0.dist-info/RECORD,,
@@ -1,322 +0,0 @@
1
- # ---------------------------------------------------------
2
- # Copyright (c) Microsoft Corporation. All rights reserved.
3
- # ---------------------------------------------------------
4
- import logging
5
- import math
6
- from concurrent.futures import as_completed
7
- from typing import Callable, Dict, List, TypedDict, Union, cast
8
-
9
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
10
-
11
- from azure.ai.evaluation._common.constants import HarmSeverityLevel
12
- from azure.ai.evaluation._common.math import list_mean_nan_safe
13
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
14
-
15
- from ._hate_unfairness import HateUnfairnessEvaluator
16
- from ._self_harm import SelfHarmEvaluator
17
- from ._sexual import SexualEvaluator
18
- from ._violence import ViolenceEvaluator
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- class _EvaluationPerTurn(TypedDict):
24
- severity: List[str]
25
- score: List[float]
26
- reason: List[str]
27
-
28
-
29
- class ContentSafetyChatEvaluator:
30
- """
31
- Initialize a content safety chat evaluator configured to evaluate content safetry metrics for chat scenario.
32
-
33
- :param credential: The credential for connecting to Azure AI project. Required
34
- :type credential: ~azure.core.credentials.TokenCredential
35
- :param azure_ai_project: The scope of the Azure AI project.
36
- It contains subscription id, resource group, and project name.
37
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
38
- :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
39
- focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
40
- :type eval_last_turn: bool
41
- :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
42
- Default is True.
43
- :type parallel: bool
44
- :return: A function that evaluates and generates metrics for "chat" scenario.
45
- :rtype: Callable
46
-
47
- **Usage**
48
-
49
- .. code-block:: python
50
-
51
- azure_ai_project = {
52
- "subscription_id": "<subscription_id>",
53
- "resource_group_name": "<resource_group_name>",
54
- "project_name": "<project_name>",
55
- }
56
- eval_fn = ContentSafetyChatEvaluator(azure_ai_project)
57
- result = eval_fn(conversation=[
58
- {"role": "user", "content": "What is the value of 2 + 2?"},
59
- {"role": "assistant", "content": "2 + 2 = 4"}
60
- ])
61
-
62
- **Output format**
63
-
64
- .. code-block:: python
65
-
66
- {
67
- "evaluation_per_turn": {
68
- "violence": ["High", "Low"],
69
- "violence_score": [7.0, 3.0],
70
- "violence_reason": "Some reason",
71
- "sexual": ["High", "Low"],
72
- "sexual_score": [7.0, 3.0],
73
- "sexual_reason": "Some reason",
74
- "self_harm": ["High", "Low"],
75
- "self_harm_score": [7.0, 3.0],
76
- "self_harm_reason": "Some reason",
77
- "hate_unfairness": ["High", "Low"],
78
- "hate_unfairness_score": [7.0, 3.0],
79
- "hate_unfairness_reason": "Some reason"
80
- },
81
- "violence": "Medium",
82
- "violence_score": 5.0,
83
- "sexual": "Medium",
84
- "sexual_score": 5.0,
85
- "self_harm": "Medium",
86
- "self_harm_score": 5.0,
87
- "hate_unfairness": "Medium",
88
- "hate_unfairness_score": 5.0,
89
- }
90
- """
91
-
92
- def __init__(
93
- self,
94
- credential,
95
- azure_ai_project,
96
- eval_last_turn: bool = False,
97
- parallel: bool = True,
98
- ):
99
- self._eval_last_turn = eval_last_turn
100
- self._parallel = parallel
101
- self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
102
- ViolenceEvaluator(credential, azure_ai_project),
103
- SexualEvaluator(credential, azure_ai_project),
104
- SelfHarmEvaluator(credential, azure_ai_project),
105
- HateUnfairnessEvaluator(credential, azure_ai_project),
106
- ]
107
-
108
- def __call__(self, *, conversation: list, **kwargs):
109
- """
110
- Evaluates content-safety metrics for "chat" scenario.
111
-
112
- :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
113
- :paramtype conversation: List[Dict]
114
- :return: The scores for Chat scenario.
115
- :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]
116
- """
117
- self._validate_conversation(conversation)
118
-
119
- # Extract queries, responses from conversation
120
- queries = []
121
- responses = []
122
-
123
- if self._eval_last_turn:
124
- # Process only the last two turns if _eval_last_turn is True
125
- conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
126
- else:
127
- conversation_slice = conversation
128
-
129
- for each_turn in conversation_slice:
130
- role = each_turn["role"]
131
- if role == "user":
132
- queries.append(each_turn["content"])
133
- elif role == "assistant":
134
- responses.append(each_turn["content"])
135
-
136
- # Evaluate each turn
137
- per_turn_results = []
138
- for turn_num in range(len(queries)):
139
- current_turn_result = {}
140
-
141
- if self._parallel:
142
- # Parallel execution
143
- # Use a thread pool for parallel execution in the composite evaluator,
144
- # as it's ~20% faster than asyncio tasks based on tests.
145
- with ThreadPoolExecutor() as executor:
146
- future_to_evaluator = {
147
- executor.submit(self._evaluate_turn, turn_num, queries, responses, evaluator): evaluator
148
- for evaluator in self._evaluators
149
- }
150
-
151
- for future in as_completed(future_to_evaluator):
152
- result: Dict[str, Union[str, float]] = future.result()
153
- current_turn_result.update(result)
154
- else:
155
- # Sequential execution
156
- for evaluator in self._evaluators:
157
- result = self._evaluate_turn(turn_num, queries, responses, evaluator)
158
- current_turn_result.update(result)
159
-
160
- per_turn_results.append(current_turn_result)
161
-
162
- aggregated = self._aggregate_results(per_turn_results)
163
- return aggregated
164
-
165
- def _evaluate_turn(
166
- self,
167
- turn_num: int,
168
- queries: List[str],
169
- responses: List[str],
170
- evaluator: Callable[..., Dict[str, Union[str, float]]],
171
- ) -> Dict[str, Union[str, float]]:
172
- try:
173
- query = queries[turn_num] if turn_num < len(queries) else ""
174
- response = responses[turn_num] if turn_num < len(responses) else ""
175
-
176
- score = evaluator(query=query, response=response)
177
-
178
- return score
179
- except Exception as e: # pylint: disable=broad-exception-caught
180
- logger.warning(
181
- "Evaluator %s failed for turn %s with exception: %s",
182
- evaluator.__class__.__name__,
183
- turn_num + 1,
184
- e,
185
- )
186
- return {}
187
-
188
- def _aggregate_results(
189
- self, per_turn_results: List[Dict[str, Union[str, float]]]
190
- ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]:
191
- scores: Dict[str, List[float]] = {}
192
- reasons: Dict[str, List[str]] = {}
193
- levels: Dict[str, List[str]] = {}
194
-
195
- for turn in per_turn_results:
196
- for metric, value in turn.items():
197
- if "_score" in metric:
198
- if metric not in scores:
199
- scores[metric] = []
200
- scores[metric].append(cast(float, value))
201
- elif "_reason" in metric:
202
- if metric not in reasons:
203
- reasons[metric] = []
204
- reasons[metric].append(cast(str, value))
205
- else:
206
- if metric not in levels:
207
- levels[metric] = []
208
- levels[metric].append(cast(str, value))
209
-
210
- aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {}
211
- evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {}
212
-
213
- for metric, values in levels.items():
214
- score_key = f"{metric}_score"
215
- reason_key = f"{metric}_reason"
216
-
217
- aggregated_score = list_mean_nan_safe(scores[score_key])
218
- harm_severity_level = self._get_harm_severity_level(aggregated_score)
219
- aggregated[metric] = (
220
- harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level
221
- )
222
- aggregated[score_key] = aggregated_score
223
-
224
- # Prepare per-turn evaluations
225
- evaluation_per_turn[metric] = {
226
- "severity": values,
227
- "score": scores[score_key],
228
- "reason": reasons[reason_key],
229
- }
230
-
231
- aggregated["evaluation_per_turn"] = evaluation_per_turn
232
-
233
- return aggregated
234
-
235
- def _validate_conversation(self, conversation: List[Dict]):
236
- if conversation is None or not isinstance(conversation, list):
237
- msg = "conversation parameter must be a list of dictionaries."
238
- raise EvaluationException(
239
- message=msg,
240
- internal_message=msg,
241
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
242
- category=ErrorCategory.INVALID_VALUE,
243
- blame=ErrorBlame.USER_ERROR,
244
- )
245
-
246
- expected_role = "user"
247
- for turn_num, turn in enumerate(conversation):
248
- one_based_turn_num = turn_num + 1
249
-
250
- if not isinstance(turn, dict):
251
- msg = f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}"
252
- raise EvaluationException(
253
- message=msg,
254
- internal_message=msg,
255
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
256
- category=ErrorCategory.INVALID_VALUE,
257
- blame=ErrorBlame.USER_ERROR,
258
- )
259
-
260
- if "role" not in turn or "content" not in turn:
261
- msg = (
262
- "Each turn in 'conversation' must have 'role' and 'content' keys. "
263
- + f"Turn number: {one_based_turn_num}"
264
- )
265
- raise EvaluationException(
266
- message=msg,
267
- internal_message=msg,
268
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
269
- category=ErrorCategory.INVALID_VALUE,
270
- blame=ErrorBlame.USER_ERROR,
271
- )
272
-
273
- if turn["role"] != expected_role:
274
- msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
275
- raise EvaluationException(
276
- message=msg,
277
- internal_message=msg,
278
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
279
- category=ErrorCategory.INVALID_VALUE,
280
- blame=ErrorBlame.USER_ERROR,
281
- )
282
-
283
- if not isinstance(turn["content"], str):
284
- msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}"
285
- raise EvaluationException(
286
- message=msg,
287
- internal_message=msg,
288
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
289
- category=ErrorCategory.INVALID_VALUE,
290
- blame=ErrorBlame.USER_ERROR,
291
- )
292
-
293
- # Toggle expected role for the next turn
294
- expected_role = "user" if expected_role == "assistant" else "assistant"
295
-
296
- # Ensure the conversation ends with an assistant's turn
297
- if expected_role != "user":
298
- msg = "The conversation must end with an assistant's turn."
299
- raise EvaluationException(
300
- message=msg,
301
- internal_message=msg,
302
- target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
303
- category=ErrorCategory.INVALID_VALUE,
304
- blame=ErrorBlame.USER_ERROR,
305
- )
306
-
307
- def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]:
308
- HARM_SEVERITY_LEVEL_MAPPING = {
309
- HarmSeverityLevel.VeryLow: (0, 1),
310
- HarmSeverityLevel.Low: (2, 3),
311
- HarmSeverityLevel.Medium: (4, 5),
312
- HarmSeverityLevel.High: (6, 7),
313
- }
314
-
315
- if math.isnan(harm_score) or harm_score is None:
316
- return math.nan
317
-
318
- for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
319
- if harm_score_range[0] <= harm_score <= harm_score_range[1]:
320
- return harm_level
321
-
322
- return math.nan