azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (43)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/utils.py +24 -9
  3. azure/ai/evaluation/_constants.py +4 -0
  4. azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
  5. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
  6. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  7. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  8. azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  9. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  10. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  11. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  12. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  13. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  14. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  15. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  16. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  17. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
  18. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  19. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
  20. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  21. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  22. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
  23. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  24. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  25. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
  26. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  27. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
  28. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  29. azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
  30. azure/ai/evaluation/_exceptions.py +0 -1
  31. azure/ai/evaluation/_model_configurations.py +36 -8
  32. azure/ai/evaluation/_version.py +1 -1
  33. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  34. azure/ai/evaluation/simulator/_simulator.py +19 -8
  35. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
  36. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
  37. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  38. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  39. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  40. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  41. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  42. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
  43. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
{azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b2
+Version: 1.0.0b3
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -426,6 +426,64 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+`column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+Before:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "query": "${data.question}",
+            "response": "${data.answer}",
+        }
+    },
+    ...
+)
+```
+
+After:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "column_mapping": {
+                "query": "${data.question}",
+                "response": "${data.answer}",
+            }
+        }
+    },
+    ...
+)
+```
+
+### Bugs Fixed
+
+- Fixed an issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
 ## 1.0.0b2 (2024-09-24)
 
 ### Breaking Changes
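
To make the two feature entries above concrete, here is a minimal sketch. The `"azure_openai"` literal for the new `type` field, the keyword-argument signatures, and the shape accepted by `conversation` are assumptions inferred from this diff, not verified against the released API:

```python
from azure.ai.evaluation import CoherenceEvaluator

# 1.0.0b3 adds a `type` discriminator to the model-config TypedDicts
# (literal value assumed; check AzureOpenAIModelConfiguration for the exact one).
model_config = {
    "type": "azure_openai",
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    # Per the bug fix above, omitting api_key should now fall back to Entra ID.
}

coherence = CoherenceEvaluator(model_config)

# The usual single-turn input still works:
result = coherence(query="What is 2 + 2?", response="2 + 2 = 4")

# New in b3: a whole conversation as an alternative input
# (turn shape assumed from the removed ChatEvaluator shown later in this diff).
result = coherence(conversation=[
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4"},
])
```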
{azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD

@@ -1,18 +1,18 @@
-azure/ai/evaluation/__init__.py,sha256=7ff0POpQe1C1L9eD4yz9P9y9BtFWNr5wHEtSOR2xVA0,1967
-azure/ai/evaluation/_constants.py,sha256=cWnrEjJuPok2CPbibJ40XyX2VSlw4-x_dob3DjFsd5U,1683
-azure/ai/evaluation/_exceptions.py,sha256=HUMfvguDc7ygcbs3MTK14R3PK7UxGNWQQHH3hYXIV3U,4168
+azure/ai/evaluation/__init__.py,sha256=AW8HyrHG5L1NT-0-vzu14o9Em8-ZiKXokPlIAvgF9lI,1977
+azure/ai/evaluation/_constants.py,sha256=RWerL5-uO8xmgZmdyon5TYH2_xPM_31cZqXs7qk28Ms,1743
+azure/ai/evaluation/_exceptions.py,sha256=CH4Waotlr519uHirau38NFExv5cG5JgrjPxjTAHWPGU,4131
 azure/ai/evaluation/_http_utils.py,sha256=kpAuxuoC6ZK-ZHw6qScOC8ePJQXLwf7SJWP5S4OWwUs,13983
-azure/ai/evaluation/_model_configurations.py,sha256=D02AzOdyO6LQCia0k232Msd7ro35-EcwmlQ0tOD_5H0,652
+azure/ai/evaluation/_model_configurations.py,sha256=9yiXXCFw8DiCHB0Rjg6NEHZNSK8AYCfpRdRp_rwd5DU,1869
 azure/ai/evaluation/_user_agent.py,sha256=O2y-QPBAcw7w7qQ6M2aRPC3Vy3TKd789u5lcs2yuFaI,290
-azure/ai/evaluation/_version.py,sha256=Fx-4qD0RW2DqdKCStZHT4KOjqxKVEeT05DbGkqnTSF4,201
+azure/ai/evaluation/_version.py,sha256=O5t2mfbIREHTFuOmpAqCrJ00-kGQ4SRcFh42DDRu2Ac,201
 azure/ai/evaluation/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 azure/ai/evaluation/_common/__init__.py,sha256=LHTkf6dMLLxikrGNgbUuREBVQcs4ORHR6Eryo4bm9M8,586
 azure/ai/evaluation/_common/constants.py,sha256=ZKHGVgGA1Fc6Pvo22X-CeOUX6-m0q_UwpOKOWATTSuI,1639
 azure/ai/evaluation/_common/rai_service.py,sha256=R-1jtWi4Fu4IT4v0j0hKsWyN-D5nwH5HQr1x0AtCp8Q,17539
-azure/ai/evaluation/_common/utils.py,sha256=orrvIGY3L2o8uHEJkTdfNa4jV_8OsscWyL3CzX96U9o,2991
+azure/ai/evaluation/_common/utils.py,sha256=IKCAxHbGcmi5rH6qFZOB76vInXS8A7Oa7dYC56K0BWY,3494
 azure/ai/evaluation/_evaluate/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_evaluate/_eval_run.py,sha256=EVCSRjIwOkXfYlIBTv7hosyUqcMTmVqGQ44yvHmm2Eg,20943
-azure/ai/evaluation/_evaluate/_evaluate.py,sha256=50djPOmW-K3f7lbTttPWcgGx96_b-zTeG5iBHuPArRY,29141
+azure/ai/evaluation/_evaluate/_evaluate.py,sha256=XoGXpzO8Z_hYwpPw9amjYarrKqtvNtLB4xLdtE8mmuI,30324
 azure/ai/evaluation/_evaluate/_utils.py,sha256=9I29KAFsYJCp6frlLXb5vwZJzmiNzfzku9CD1eslaHU,9880
 azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py,sha256=BkxhojWca3e2QM3hFwO2xrLiiQ0i-3f8wsMfOx1zchs,361
 azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py,sha256=sn7k9nM7vVZDt5CgNwwQrvQwV--SwfsfLjfjahk23DM,2984
@@ -22,59 +22,58 @@ azure/ai/evaluation/_evaluate/_telemetry/__init__.py,sha256=6kkbiTCsz7BNV5WxOdwC
 azure/ai/evaluation/_evaluators/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_evaluators/_bleu/__init__.py,sha256=quKKO0kvOSkky5hcoNBvgBuMeeVRFCE9GSv70mAdGP4,260
 azure/ai/evaluation/_evaluators/_bleu/_bleu.py,sha256=6EJCG9DnL2Y4pU_vhY4o3UOrumvI-6HI92tzEuCoyXk,2413
-azure/ai/evaluation/_evaluators/_chat/__init__.py,sha256=xOsSHYNGJJiZvBMPbmLd_-ZZs8_15Sblvk-OF7iVoIo,250
-azure/ai/evaluation/_evaluators/_chat/_chat.py,sha256=ZCm7L50bRaAvj2-Gw6T_9u4RTHRF6neCYnjDCunRjKw,14787
-azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py,sha256=DmBjBkwDDlCsSGpBeXfpfMM9ekxIJs62dij4rBXND7k,273
-azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py,sha256=HKkQdvYVD3Mr5kPUzKLBYsct94k1hAcLu5v9MCxZ6lA,5579
-azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty,sha256=NoHNDf_UE8BGAixqZPhRQ4ecxOUi9w9gO8HfHhJuxaY,1761
 azure/ai/evaluation/_evaluators/_coherence/__init__.py,sha256=GRqcSCQse02Spyki0UsRNWMIXiea2lLtPPXNGvkJzQ0,258
-azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=XMymhai4cw9Gqy9Lfa9IvIcw9s_cnCM1pbyVVnpF53M,3958
-azure/ai/evaluation/_evaluators/_coherence/coherence.prompty,sha256=WVEXxKmh_Gbb11_00N2WCIIJSMgPssFxJ5h2--rMG-w,2725
-azure/ai/evaluation/_evaluators/_content_safety/__init__.py,sha256=mR5CbcMyxV9GQoY71Saoi0bQTpEB74HrYmM8gcVhnAg,746
+azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=ydYo03-XlJDaX6YD432xMfutU8r41ovf1MW7vVuWPw4,2512
+azure/ai/evaluation/_evaluators/_coherence/coherence.prompty,sha256=_GXYhAH04tsl2qntZH5ACx7gFNfUeQ0hZQpOmDoLPNc,2549
+azure/ai/evaluation/_evaluators/_common/__init__.py,sha256=_hPqTkAla_O6s4ebVtTaBrVLEW3KSdDz66WwxjK50cI,423
+azure/ai/evaluation/_evaluators/_common/_base_eval.py,sha256=Ai3jN-HRzNsG2V48pMR9TmK3owT-6YJxKi9gzMloZNE,14072
+azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py,sha256=bPePVS7-5gvzEtpKxlDBamxNwetBJTqf5nCMt6Wu7ao,3050
+azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py,sha256=O_klXEBsmYJyyqoIGN5gRg0udnvTKtaouQmEeMGpOgM,4331
+azure/ai/evaluation/_evaluators/_content_safety/__init__.py,sha256=PEYMIybfP64f7byhuTaiq4RiqsYbjqejpW1JsJIG1jA,556
 azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py,sha256=uM6BL9jwtv5nJpSchezTYc-E514_VCTN2pACy7oxHuU,3928
-azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py,sha256=VlBDKrB2uNXX8ccV7aKsHM74_mK6JLUZ9SAGcvR3Lk8,2733
 azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py,sha256=n5fdL0TPelJY_AuiamkLO7Jiv7P-7gIZqipo5ShyoR8,11958
-azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=uA_FecBD9M1C0_hfnbqR9shZNMFbhmBcCAhKgTxv8rI,2807
-azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=6_FK2zDAyS3wQTbYe3FmNXxw7qWwYOGqLUKopWUWKvQ,2608
-azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=d8LZD368nqppZi-Kdimo4Lt8QOTFtc6csPjY66G7SMQ,2576
-azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=e0ZeUcD1YKWPzqfaLBxFdNHmod69K55zIOgLOJAMe_4,2600
+azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=YIQLFj6P7WXAyRRHVOflikUePN5sMCanJQmnIpSDeY0,1856
+azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=BFRNVDehDe7Qgnt3k1zRzscFQmn_miuHaSpjOytFhds,1810
+azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=Ap4EvHDPF8YjJ_esKEK83yusSR_xYhJWn6HIn1mkwW0,1788
+azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=HbLsZOqLopr0beDeHW85EmqSMFcTCZIYGUYvxUq_-gM,1804
 azure/ai/evaluation/_evaluators/_eci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=Amrch3q5BG2BFAB7GdShnTX19f5IoGY6VF66_M6hbZs,3990
+azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=EPy_A4BtqHm_10kHApi9xZ2eHYU5CjVGtkfG4zUMRhs,2411
 azure/ai/evaluation/_evaluators/_f1_score/__init__.py,sha256=aEVbO7iMoF20obdpLQKcKm69Yyu3mYnblKELLqu8OGI,260
 azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py,sha256=Cgp-hANXtrNfe1q6SVm-CUG3UEP8Gj1pvH293ay57OI,4554
 azure/ai/evaluation/_evaluators/_fluency/__init__.py,sha256=EEJw39xRa0bOAA1rELTTKXQu2s60n_7CZQRD0Gu2QVw,259
-azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=MhDOoP9oD-rw7uT90pU6O1XP6vqfOP6e608tp1qRy7Y,3919
-azure/ai/evaluation/_evaluators/_fluency/fluency.prompty,sha256=RparSdDZs-xiGbq7lRifz9z7jaD10ldXDU3E7sO0v2s,2579
+azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=k7lq7qRoiI2SaHPqLhW-Frm_STRK-hFHsbAFOejAU7s,2459
+azure/ai/evaluation/_evaluators/_fluency/fluency.prompty,sha256=xdznyssZDQiLELv4ecC-8uUJ4ssM-iij7A6S1aDsxOQ,2403
 azure/ai/evaluation/_evaluators/_gleu/__init__.py,sha256=Ae2EvQ7gqiYAoNO3LwGIhdAAjJPJDfT85rQGKrRrmbA,260
 azure/ai/evaluation/_evaluators/_gleu/_gleu.py,sha256=m02wmIGjdoXjp9dwjnFQAKA8hGOUOTvpppDf2CD4QQo,2326
 azure/ai/evaluation/_evaluators/_groundedness/__init__.py,sha256=UYNJUeRvBwcSVFyZpdsf29un5eyaDzYoo3QvC1gvlLg,274
-azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=3HcNMOBxbslIJmmhJWU3itPOL3TOiLqQYiYr_n8wTqU,4138
-azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty,sha256=dDclp_YowBjeiGhwmCxCnS4A3K9r4v2tzsUm-ccLt-I,3199
+azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=FREk-1w_K6oF74eiNii5EdRS4uK_NUxW0dLd5Kzgj6c,2682
+azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty,sha256=ylgxKa_xipb7wN_QwxSnjrD9AhKcJQCv8pPpWPwFfGg,3023
 azure/ai/evaluation/_evaluators/_meteor/__init__.py,sha256=209na3pPsdmcuYpYHUYtqQybCpc3yZkc93HnRdicSlI,266
 azure/ai/evaluation/_evaluators/_meteor/_meteor.py,sha256=K3EdRuRcuEZYVIlI2jMEp0O9KJYXQB2o6h08q43oKWY,3316
 azure/ai/evaluation/_evaluators/_protected_material/__init__.py,sha256=eRAQIU9diVXfO5bp6aLWxZoYUvOsrDIfy1gnDOeNTiI,109
-azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=LnbyS3xVu_7Q1nDuoOxnQnzXhZVrDMSnEzNdu66g6DY,4165
-azure/ai/evaluation/_evaluators/_protected_materials/__init__.py,sha256=A12UsRVIebGvy9FtZLBPsOIAWUskBt8iuhRdILyRcSo,112
-azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py,sha256=xF5jHhM29OXh2sHrnXkYtiRYltuVU-fqC7xToiI3WOM,4136
+azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=KT8AMLL9nGvI_KVwpUAcZJcPqPN_QNyQMkkmIhmexNE,2117
 azure/ai/evaluation/_evaluators/_qa/__init__.py,sha256=bcXfT--C0hjym2haqd1B2-u9bDciyM0ThOFtU1Q69sk,244
 azure/ai/evaluation/_evaluators/_qa/_qa.py,sha256=w9XJOfDof78mfOpc7tbPF5wec9dGPFmXXAdR5yx2buI,3502
 azure/ai/evaluation/_evaluators/_relevance/__init__.py,sha256=JlxytW32Nl8pbE-fI3GRpfgVuY9EG6zxIAn5VZGSwyc,265
-azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=iMky4Gf1kJB1kvBLaNWwCOBKLSWfhtSSLU0tPTU6ruE,4344
-azure/ai/evaluation/_evaluators/_relevance/relevance.prompty,sha256=AO70ho2nMhBtKcl_q4nKFW4kA1LjYsmSfymNa-Cbcrw,3735
+azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=tHBHzp2wz3szgfA24HQgphP4mF5iJfg-lw6bVqgqkpY,2934
+azure/ai/evaluation/_evaluators/_relevance/relevance.prompty,sha256=QNWlrWxObUPlXFF1hdCDVpfXuw0QDOxHUtWLj1MwrxA,3559
+azure/ai/evaluation/_evaluators/_retrieval/__init__.py,sha256=kMu47ZyTZ7f-4Yh6H3KHxswmxitmPJ8FPSk90qgR0XI,265
+azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py,sha256=u6OqyZ62JpHmYatepRW5aRbtwu1sZByVSCDj_CRZSj8,5160
+azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty,sha256=HbQu5Gy9Ghw9r8vGCF-4ui441JBD8w45NOU_9ehamd0,1585
 azure/ai/evaluation/_evaluators/_rouge/__init__.py,sha256=kusCDaYcXogDugGefRP8MQSn9xv107oDbrMCqZ6K4GA,291
 azure/ai/evaluation/_evaluators/_rouge/_rouge.py,sha256=28vqjjleeJR5VRsQP5VCCMX_PVUUVxkgh4c3xIvwmXE,3526
 azure/ai/evaluation/_evaluators/_similarity/__init__.py,sha256=V2Mspog99_WBltxTkRHG5NpN5s9XoiTSN4I8POWEkLA,268
-azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=T-aTYausFvIfxCe0u8Lbd3ehKwxRtRLk2gHZ2pJ9YkE,4332
-azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=p2Tb4IW6QnP2BaGRQsAicW4V0B23Oezhf5l3Hau0nxU,4770
+azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=m4Ub7EcGMHotcOll_WIEYrvUWV1hMjF6K1VGthkEoqk,3883
+azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=eoludASychZoGL625bFCaZai-OY7DIAg90ZLax_o4XE,4594
 azure/ai/evaluation/_evaluators/_xpia/__init__.py,sha256=VMEL8WrpJQeh4sQiOLzP7hRFPnjzsvwfvTzaGCVJPCM,88
-azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=iYXjGt0F_gfc7SF6Q_d3Z_0Bkniqad91wljtBwR6BDM,5750
+azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=Mg0nhT00VPgfBqp0Pu-7C4Unf6MEu8yNMFW-Wu7RTXw,2556
 azure/ai/evaluation/simulator/__init__.py,sha256=UtlcXo3SteIQEW_hW2WMhtqLNiDiIGLeW_lIkEUNoMc,486
 azure/ai/evaluation/simulator/_adversarial_scenario.py,sha256=SxpyMw5wmM5-fiUjl1_oJH0GQEnsa7ASso10MAr2Hjw,1030
 azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=kOL31FcD7vXTpkeFUooASXNaFTe9Vme5st_i0Qa_9sA,20542
 azure/ai/evaluation/simulator/_constants.py,sha256=xM-Or2x7RytfoeBM3N7Vt4JQDJX66UdL3CPz0YN5rvE,485
 azure/ai/evaluation/simulator/_direct_attack_simulator.py,sha256=zFYYdk8Sdg4-_HSd_rumM0LizPevcR57HjqvEdowv8c,11691
 azure/ai/evaluation/simulator/_indirect_attack_simulator.py,sha256=qALFN3LG5o1kSjMjdlLeJInax8GcjD1iPUZCayJp0Kc,9628
-azure/ai/evaluation/simulator/_simulator.py,sha256=MpB8fcCmuge7AvHmoV-K3L9tdEwWHcfpkiU22MJfZ-A,31642
+azure/ai/evaluation/simulator/_simulator.py,sha256=V9xNOwDRTlK9Xf1SyRK4yv8j3pTFd_4D79BYanePoDw,32187
 azure/ai/evaluation/simulator/_tracing.py,sha256=LRPjsVLe9VohmXowFr9aCK_VwD0MHd1CBe8rl9jGQhU,3032
 azure/ai/evaluation/simulator/_utils.py,sha256=aXH5GdzQrwluKvYofWtdT0s_nzgVHS2hP6x4rc5zt-E,4287
 azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=MNfFW4UDsVrk1p2ysIvmYlLzHqjKfxExktQXfSRiBPk,12774
@@ -91,9 +90,9 @@ azure/ai/evaluation/simulator/_model_tools/_rai_client.py,sha256=Bi0tLNlJmz295md
 azure/ai/evaluation/simulator/_model_tools/_template_handler.py,sha256=gGSMvveKWn0LKSQ4FS5AxIwcsxj6iqCbUP53yjvndPw,5471
 azure/ai/evaluation/simulator/_model_tools/models.py,sha256=11O6jcj3Zwo4FZvmF-X0walNp22ux1k3ghi3KFtbdy0,21762
 azure/ai/evaluation/simulator/_prompty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=lNSioz2XiGQJb-AXRNYm2JCLMivZKa3JlHfol2Jd7fY,2244
+azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=wUiDKFL_vnAk3eEW66z33UgNML8Wqd_ReCzqfEBMId8,2350
 azure/ai/evaluation/simulator/_prompty/task_simulate.prompty,sha256=00zLVfNgHZdlbC2XvBedSrwDJOaAhl3B1ohE3LKsGg4,928
-azure_ai_evaluation-1.0.0b2.dist-info/METADATA,sha256=t8N3Aanv8JY6JHI4bElRml_Iy0Bm1wSESBG0K78Ou4Y,15520
-azure_ai_evaluation-1.0.0b2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
-azure_ai_evaluation-1.0.0b2.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
-azure_ai_evaluation-1.0.0b2.dist-info/RECORD,,
+azure_ai_evaluation-1.0.0b3.dist-info/METADATA,sha256=2jKCZxNNYmFeTFJo87_qO32_nAE9ur1YwSSwnn2Mi9I,17077
+azure_ai_evaluation-1.0.0b3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+azure_ai_evaluation-1.0.0b3.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
+azure_ai_evaluation-1.0.0b3.dist-info/RECORD,,
azure/ai/evaluation/_evaluators/_chat/_chat.py (deleted)

@@ -1,357 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-import json
-import logging
-from concurrent.futures import as_completed
-from typing import Dict, List
-
-import numpy as np
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from .._coherence import CoherenceEvaluator
-from .._fluency import FluencyEvaluator
-from .._groundedness import GroundednessEvaluator
-from .._relevance import RelevanceEvaluator
-from .retrieval import RetrievalChatEvaluator
-
-logger = logging.getLogger(__name__)
-
-
-class ChatEvaluator:
-    """
-    Initialize a chat evaluator configured for a specific Azure OpenAI model.
-
-    :param model_config: Configuration for the Azure OpenAI model.
-    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
-        focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
-    :type eval_last_turn: bool
-    :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
-        Default is True.
-    :type parallel: bool
-    :return: A function that evaluates and generates metrics for "chat" scenario.
-    :rtype: Callable
-
-    **Usage**
-
-    .. code-block:: python
-
-        chat_eval = ChatEvaluator(model_config)
-        conversation = [
-            {"role": "user", "content": "What is the value of 2 + 2?"},
-            {"role": "assistant", "content": "2 + 2 = 4", "context": {
-                "citations": [
-                    {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
-                ]
-            }}
-        ]
-        result = chat_eval(conversation=conversation)
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "evaluation_per_turn": {
-                "gpt_retrieval": [1.0, 2.0],
-                "gpt_groundedness": [5.0, 2.0],
-                "gpt_relevance": [3.0, 5.0],
-                "gpt_coherence": [1.0, 2.0],
-                "gpt_fluency": [3.0, 5.0]
-            }
-            "gpt_retrieval": 1.5,
-            "gpt_groundedness": 3.5,
-            "gpt_relevance": 4.0,
-            "gpt_coherence": 1.5,
-            "gpt_fluency": 4.0
-        }
-    """
-
-    def __init__(
-        self,
-        model_config: dict,
-        eval_last_turn: bool = False,
-        parallel: bool = True,
-    ):
-        self._eval_last_turn = eval_last_turn
-        self._parallel = parallel
-
-        # TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
-        self._rag_evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-        ]
-        self._non_rag_evaluators = [
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-        ]
-        # TODO: Temporary workaround to close the gap of missing retrieval score
-        # https://msdata.visualstudio.com/Vienna/_workitems/edit/3186644
-        # For long term, we need to add a built-in evaluator for retrieval after prompt is generalized for QA and Chat
-        self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)
-
-    def __call__(self, *, conversation, **kwargs):
-        """
-        Evaluates chat scenario.
-
-        :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
-            "context" key is optional for assistant's turn and should have "citations" key with list of citations.
-        :paramtype conversation: List[Dict]
-        :return: The scores for Chat scenario.
-        :rtype: dict
-        """
-        self._validate_conversation(conversation)
-
-        # Extract queries, responses and contexts from conversation
-        queries = []
-        responses = []
-        contexts = []
-
-        if self._eval_last_turn:
-            # Process only the last two turns if _eval_last_turn is True
-            conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
-        else:
-            conversation_slice = conversation
-
-        for each_turn in conversation_slice:
-            role = each_turn["role"]
-            if role == "user":
-                queries.append(each_turn["content"])
-            elif role == "assistant":
-                responses.append(each_turn["content"])
-                if "context" in each_turn and "citations" in each_turn["context"]:
-                    citations = json.dumps(each_turn["context"]["citations"])
-                    contexts.append(citations)
-
-        # Select evaluators to be used for evaluation
-        compute_rag_based_metrics = True
-        if len(responses) != len(contexts):
-            safe_message = (
-                "Skipping rag based metrics as we need citations or "
-                "retrieved_documents in context key of every assistant's turn"
-            )
-            logger.warning(safe_message)
-            compute_rag_based_metrics = False
-
-        selected_evaluators = []
-        selected_evaluators.extend(self._non_rag_evaluators)
-        if compute_rag_based_metrics:
-            selected_evaluators.extend(self._rag_evaluators)
-
-        # Evaluate each turn
-        per_turn_results = []
-        for turn_num in range(len(queries)):
-            current_turn_result = {}
-
-            if self._parallel:
-                # Parallel execution
-                with ThreadPoolExecutor() as executor:
-                    future_to_evaluator = {
-                        executor.submit(
-                            self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
-                        ): evaluator
-                        for evaluator in selected_evaluators
-                    }
-
-                    for future in as_completed(future_to_evaluator):
-                        result = future.result()
-                        current_turn_result.update(result)
-            else:
-                # Sequential execution
-                for evaluator in selected_evaluators:
-                    async_evaluator = evaluator._to_async()
-                    result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
-                    current_turn_result.update(result)
-
-            per_turn_results.append(current_turn_result)
-
-        # Aggregate results
-        # Final aggregated results for a conversation will look like:
-        # "gpt_groundedness": 2.0,  # Mean of all groundedness scores
-        # "evaluation_per_turn": {
-        #     "gpt_groundedness": {
-        #         "score": [1.0, ...],
-        #         "reason": ["reason1", ...],
-        #     },
-        # },
-        # }
-        aggregated = self._aggregate_results(per_turn_results)
-
-        # Run RetrievalChatEvaluator and merge the results
-        if compute_rag_based_metrics:
-            retrieval_score = self._retrieval_chat_evaluator(conversation=conversation_slice)
-            aggregated["gpt_retrieval"] = retrieval_score["gpt_retrieval"]
-            aggregated["evaluation_per_turn"]["gpt_retrieval"] = retrieval_score["evaluation_per_turn"]["gpt_retrieval"]
-            aggregated = dict(sorted(aggregated.items()))
-
-        return aggregated
-
-    def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
-        try:
-            query = queries[turn_num] if turn_num < len(queries) else ""
-            response = responses[turn_num] if turn_num < len(responses) else ""
-            context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-            score = evaluator(query=query, response=response, context=context)
-
-            return score
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logger.warning(
-                "Evaluator %s failed for turn %s with exception: %s", evaluator.__class__.__name__, turn_num + 1, e
-            )
-            return {}
-
-    def _aggregate_results(self, per_turn_results: List[Dict]):
-        scores = {}
-        reasons = {}
-
-        for turn in per_turn_results:
-            for metric, value in turn.items():
-                if "reason" in metric:
-                    if metric not in reasons:
-                        reasons[metric] = []
-                    reasons[metric].append(value)
-                else:
-                    if metric not in scores:
-                        scores[metric] = []
-                    scores[metric].append(value)
-
-        aggregated = {}
-        evaluation_per_turn = {}
-
-        for metric, values in scores.items():
-            aggregated[metric] = np.nanmean(values)
-
-            # Prepare per-turn evaluations
-            evaluation_per_turn[metric] = {"score": values}
-            reason_key = f"{metric}_reason"
-            if reason_key in reasons:
-                evaluation_per_turn[metric]["reason"] = reasons[reason_key]
-
-        aggregated["evaluation_per_turn"] = evaluation_per_turn
-
-        return aggregated
-
-    def _validate_conversation(self, conversation: List[Dict]):
-        if conversation is None or not isinstance(conversation, list):
-            msg = "conversation must be a list of dictionaries"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.CHAT_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        expected_role = "user"
-        for turn_num, turn in enumerate(conversation):
-            one_based_turn_num = turn_num + 1
-
-            if not isinstance(turn, dict):
-                msg = f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if "role" not in turn or "content" not in turn:
-                msg = (
-                    "Each turn in 'conversation' must have 'role' and 'content' keys. "
-                    + f"Turn number: {one_based_turn_num}"
-                )
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if turn["role"] != expected_role:
-                msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if not isinstance(turn["content"], str):
-                msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if turn["role"] == "assistant" and "context" in turn:
-                if not isinstance(turn["context"], dict):
-                    msg = f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}"
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                if "citations" not in turn["context"]:
-                    msg = (
-                        f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
-                    )
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.MISSING_FIELD,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                if not isinstance(turn["context"]["citations"], list):
-                    msg = f"'citations' in context must be a list. Turn number: {one_based_turn_num}"
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                for citation_num, citation in enumerate(turn["context"]["citations"]):
-                    if not isinstance(citation, dict):
-                        msg = (
-                            "Each citation in 'citations' must be a dictionary. "
-                            + f"Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
-                        )
-                        raise EvaluationException(
-                            message=msg,
-                            internal_message=msg,
-                            target=ErrorTarget.CHAT_EVALUATOR,
-                            category=ErrorCategory.INVALID_VALUE,
-                            blame=ErrorBlame.USER_ERROR,
-                        )
-
-            # Toggle expected role for the next turn
-            expected_role = "user" if expected_role == "assistant" else "assistant"
-
-        # Ensure the conversation ends with an assistant's turn
-        if expected_role != "user":
-            msg = "The conversation must end with an assistant's turn."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.CHAT_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
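
With `ChatEvaluator` removed in this release, a rough replacement for its per-turn loop can be assembled from the standalone evaluators it wrapped. A minimal sketch, assuming the single-turn `query=`/`response=` signatures the deleted code above relied on, strict user/assistant alternation (which its validator enforced), and the `type` literal from the earlier sketch; aggregation mirrors its `_aggregate_results` mean-over-turns:

```python
from statistics import mean

from azure.ai.evaluation import CoherenceEvaluator, FluencyEvaluator

model_config = {
    "type": "azure_openai",  # assumed literal, as in the earlier sketch
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
}

evaluators = [CoherenceEvaluator(model_config), FluencyEvaluator(model_config)]

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4"},
]

# Pair each user turn with the assistant turn that follows it,
# like the deleted queries/responses extraction above.
turns = list(zip(conversation[::2], conversation[1::2]))

per_turn_results = []
for user_turn, assistant_turn in turns:
    turn_result = {}
    for evaluator in evaluators:
        turn_result.update(
            evaluator(query=user_turn["content"], response=assistant_turn["content"])
        )
    per_turn_results.append(turn_result)

# Mean over turns for numeric metrics, as _aggregate_results did with np.nanmean.
aggregated = {
    metric: mean(turn[metric] for turn in per_turn_results)
    for metric in per_turn_results[0]
    if all(isinstance(turn[metric], (int, float)) for turn in per_turn_results)
}
```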
@@ -1,9 +0,0 @@
1
- # ---------------------------------------------------------
2
- # Copyright (c) Microsoft Corporation. All rights reserved.
3
- # ---------------------------------------------------------
4
-
5
- from ._retrieval import RetrievalChatEvaluator
6
-
7
- __all__ = [
8
- "RetrievalChatEvaluator",
9
- ]
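
The retrieval logic itself survives this deletion: per the rename entries in the files-changed list (items 24–26), the module moved to `_retrieval`, and the changelog surfaces it as `RetrievalScoreEvaluator`. A hypothetical import migration, with the b3 path assumed from the rename in this diff:

```python
# 1.0.0b2 (removed):
from azure.ai.evaluation._evaluators._chat.retrieval import RetrievalChatEvaluator

# 1.0.0b3 (module path per the rename in this diff; public name per the changelog):
from azure.ai.evaluation._evaluators._retrieval import RetrievalScoreEvaluator
```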
@@ -1,65 +0,0 @@
1
- # ---------------------------------------------------------
2
- # Copyright (c) Microsoft Corporation. All rights reserved.
3
- # ---------------------------------------------------------
4
-
5
- from abc import ABC
6
-
7
- from azure.ai.evaluation._common.constants import EvaluationMetrics
8
- from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
9
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
10
-
11
-
12
- class ContentSafetyEvaluatorBase(ABC):
13
- """
14
- Initialize a evaluator for a specified Evaluation Metric. Base class that is not
15
- meant to be instantiated by users.
16
-
17
-
18
- :param metric: The metric to be evaluated.
19
- :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics
20
- :param azure_ai_project: The scope of the Azure AI project.
21
- It contains subscription id, resource group, and project name.
22
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
23
- :param credential: The credential for connecting to Azure AI project.
24
- :type credential: ~azure.core.credentials.TokenCredential
25
- """
26
-
27
- def __init__(self, metric: EvaluationMetrics, azure_ai_project: dict, credential=None):
28
- self._metric = metric
29
- self._azure_ai_project = azure_ai_project
30
- self._credential = credential
31
-
32
- async def __call__(self, *, query: str, response: str, **kwargs):
33
- """
34
- Evaluates content according to this evaluator's metric.
35
-
36
- :keyword query: The query to be evaluated.
37
- :paramtype query: str
38
- :keyword response: The response to be evaluated.
39
- :paramtype response: str
40
- :return: The evaluation score computation based on the Content Safety metric (self.metric).
41
- :rtype: Any
42
- """
43
- # Validate inputs
44
- # Raises value error if failed, so execution alone signifies success.
45
- if not (query and query.strip() and query != "None") or not (
46
- response and response.strip() and response != "None"
47
- ):
48
- msg = "Both 'query' and 'response' must be non-empty strings."
49
- raise EvaluationException(
50
- message=msg,
51
- internal_message=msg,
52
- error_category=ErrorCategory.MISSING_FIELD,
53
- error_blame=ErrorBlame.USER_ERROR,
54
- error_target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
55
- )
56
-
57
- # Run score computation based on supplied metric.
58
- result = await evaluate_with_rai_service(
59
- metric_name=self._metric,
60
- query=query,
61
- response=response,
62
- project_scope=self._azure_ai_project,
63
- credential=self._credential,
64
- )
65
- return result
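
This base class's validate-then-call-RAI-service flow now lives in the new common base (`_base_rai_svc_eval.py` in the files-changed list), so the per-metric content-safety evaluators no longer need it. Calling one of them directly is unchanged in spirit; a sketch, with the project-scope keys taken from the docstring above and the constructor keyword assumed:

```python
from azure.ai.evaluation import ViolenceEvaluator

# Project scope fields per the AzureAIProject docstring above.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

violence = ViolenceEvaluator(azure_ai_project=azure_ai_project)

# The same validation rules apply: query and response must be non-empty strings.
result = violence(query="What is the capital of France?", response="Paris.")
```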
@@ -1,5 +0,0 @@
1
- from ._protected_materials import ProtectedMaterialsEvaluator
2
-
3
- __all__ = [
4
- "ProtectedMaterialsEvaluator",
5
- ]
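
Finally, the plural `_protected_materials` package above is gone, replaced by the singular `_protected_material` module that remains in the files-changed list. A one-line migration, with both paths taken from this diff:

```python
# 1.0.0b2 (removed):
from azure.ai.evaluation._evaluators._protected_materials import ProtectedMaterialsEvaluator

# 1.0.0b3 (singular module and class, per this diff):
from azure.ai.evaluation._evaluators._protected_material import ProtectedMaterialEvaluator
```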