azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/utils.py +24 -9
- azure/ai/evaluation/_constants.py +4 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_model_configurations.py +36 -8
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure/ai/evaluation/simulator/_simulator.py +19 -8
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
{azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.0.0b2
+Version: 1.0.0b3
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -426,6 +426,64 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
+## 1.0.0b3 (2024-10-01)
+
+### Features Added
+
+- Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+- The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+  - `ViolenceEvaluator`
+  - `SexualEvaluator`
+  - `SelfHarmEvaluator`
+  - `HateUnfairnessEvaluator`
+  - `ProtectedMaterialEvaluator`
+  - `IndirectAttackEvaluator`
+  - `CoherenceEvaluator`
+  - `RelevanceEvaluator`
+  - `FluencyEvaluator`
+  - `GroundednessEvaluator`
+- Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+### Breaking Changes
+
+- Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+- The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+  `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+  Before:
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "query": "${data.question}",
+              "response": "${data.answer}",
+          }
+      },
+      ...
+  )
+  ```
+
+  After:
+  ```python
+  evaluate(
+      ...,
+      evaluator_config={
+          "hate_unfairness": {
+              "column_mapping": {
+                  "query": "${data.question}",
+                  "response": "${data.answer}",
+              }
+          }
+      },
+      ...
+  )
+  ```
+
+### Bugs Fixed
+
+- Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
 ## 1.0.0b2 (2024-09-24)
 
 ### Breaking Changes
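For orientation, here is a minimal sketch of how the two feature additions above look from user code. It is illustrative only: the `"messages"` wrapper around the conversation and the `"azure_openai"` literal for the new `type` field are assumptions based on the changelog, not something shown in this diff, so check the package documentation for the exact shapes.

```python
# Illustrative sketch only; field names marked below are assumptions, not taken from this diff.
from azure.ai.evaluation import CoherenceEvaluator

# AzureOpenAIModelConfiguration now carries an explicit "type" discriminator.
model_config = {
    "type": "azure_openai",  # assumed literal for AzureOpenAIModelConfiguration
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<api-key>",
}

# Multi-turn input: the evaluators listed in the changelog accept `conversation`
# as an alternative to the usual single-turn query/response pair.
conversation = {
    "messages": [  # assumed message schema (role/content per turn)
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}

coherence = CoherenceEvaluator(model_config)
result = coherence(conversation=conversation)  # aggregate plus per-turn results
print(result)
```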
{azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD

@@ -1,18 +1,18 @@
-azure/ai/evaluation/__init__.py,sha256=
-azure/ai/evaluation/_constants.py,sha256=
-azure/ai/evaluation/_exceptions.py,sha256=
+azure/ai/evaluation/__init__.py,sha256=AW8HyrHG5L1NT-0-vzu14o9Em8-ZiKXokPlIAvgF9lI,1977
+azure/ai/evaluation/_constants.py,sha256=RWerL5-uO8xmgZmdyon5TYH2_xPM_31cZqXs7qk28Ms,1743
+azure/ai/evaluation/_exceptions.py,sha256=CH4Waotlr519uHirau38NFExv5cG5JgrjPxjTAHWPGU,4131
 azure/ai/evaluation/_http_utils.py,sha256=kpAuxuoC6ZK-ZHw6qScOC8ePJQXLwf7SJWP5S4OWwUs,13983
-azure/ai/evaluation/_model_configurations.py,sha256=
+azure/ai/evaluation/_model_configurations.py,sha256=9yiXXCFw8DiCHB0Rjg6NEHZNSK8AYCfpRdRp_rwd5DU,1869
 azure/ai/evaluation/_user_agent.py,sha256=O2y-QPBAcw7w7qQ6M2aRPC3Vy3TKd789u5lcs2yuFaI,290
-azure/ai/evaluation/_version.py,sha256=
+azure/ai/evaluation/_version.py,sha256=O5t2mfbIREHTFuOmpAqCrJ00-kGQ4SRcFh42DDRu2Ac,201
 azure/ai/evaluation/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 azure/ai/evaluation/_common/__init__.py,sha256=LHTkf6dMLLxikrGNgbUuREBVQcs4ORHR6Eryo4bm9M8,586
 azure/ai/evaluation/_common/constants.py,sha256=ZKHGVgGA1Fc6Pvo22X-CeOUX6-m0q_UwpOKOWATTSuI,1639
 azure/ai/evaluation/_common/rai_service.py,sha256=R-1jtWi4Fu4IT4v0j0hKsWyN-D5nwH5HQr1x0AtCp8Q,17539
-azure/ai/evaluation/_common/utils.py,sha256=
+azure/ai/evaluation/_common/utils.py,sha256=IKCAxHbGcmi5rH6qFZOB76vInXS8A7Oa7dYC56K0BWY,3494
 azure/ai/evaluation/_evaluate/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_evaluate/_eval_run.py,sha256=EVCSRjIwOkXfYlIBTv7hosyUqcMTmVqGQ44yvHmm2Eg,20943
-azure/ai/evaluation/_evaluate/_evaluate.py,sha256=
+azure/ai/evaluation/_evaluate/_evaluate.py,sha256=XoGXpzO8Z_hYwpPw9amjYarrKqtvNtLB4xLdtE8mmuI,30324
 azure/ai/evaluation/_evaluate/_utils.py,sha256=9I29KAFsYJCp6frlLXb5vwZJzmiNzfzku9CD1eslaHU,9880
 azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py,sha256=BkxhojWca3e2QM3hFwO2xrLiiQ0i-3f8wsMfOx1zchs,361
 azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py,sha256=sn7k9nM7vVZDt5CgNwwQrvQwV--SwfsfLjfjahk23DM,2984
@@ -22,59 +22,58 @@ azure/ai/evaluation/_evaluate/_telemetry/__init__.py,sha256=6kkbiTCsz7BNV5WxOdwC
 azure/ai/evaluation/_evaluators/__init__.py,sha256=Yx1Iq2GNKQ5lYxTotvPwkPL4u0cm6YVxUe-iVbu1clI,180
 azure/ai/evaluation/_evaluators/_bleu/__init__.py,sha256=quKKO0kvOSkky5hcoNBvgBuMeeVRFCE9GSv70mAdGP4,260
 azure/ai/evaluation/_evaluators/_bleu/_bleu.py,sha256=6EJCG9DnL2Y4pU_vhY4o3UOrumvI-6HI92tzEuCoyXk,2413
-azure/ai/evaluation/_evaluators/_chat/__init__.py,sha256=xOsSHYNGJJiZvBMPbmLd_-ZZs8_15Sblvk-OF7iVoIo,250
-azure/ai/evaluation/_evaluators/_chat/_chat.py,sha256=ZCm7L50bRaAvj2-Gw6T_9u4RTHRF6neCYnjDCunRjKw,14787
-azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py,sha256=DmBjBkwDDlCsSGpBeXfpfMM9ekxIJs62dij4rBXND7k,273
-azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py,sha256=HKkQdvYVD3Mr5kPUzKLBYsct94k1hAcLu5v9MCxZ6lA,5579
-azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty,sha256=NoHNDf_UE8BGAixqZPhRQ4ecxOUi9w9gO8HfHhJuxaY,1761
 azure/ai/evaluation/_evaluators/_coherence/__init__.py,sha256=GRqcSCQse02Spyki0UsRNWMIXiea2lLtPPXNGvkJzQ0,258
-azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=
-azure/ai/evaluation/_evaluators/_coherence/coherence.prompty,sha256=
-azure/ai/evaluation/_evaluators/
+azure/ai/evaluation/_evaluators/_coherence/_coherence.py,sha256=ydYo03-XlJDaX6YD432xMfutU8r41ovf1MW7vVuWPw4,2512
+azure/ai/evaluation/_evaluators/_coherence/coherence.prompty,sha256=_GXYhAH04tsl2qntZH5ACx7gFNfUeQ0hZQpOmDoLPNc,2549
+azure/ai/evaluation/_evaluators/_common/__init__.py,sha256=_hPqTkAla_O6s4ebVtTaBrVLEW3KSdDz66WwxjK50cI,423
+azure/ai/evaluation/_evaluators/_common/_base_eval.py,sha256=Ai3jN-HRzNsG2V48pMR9TmK3owT-6YJxKi9gzMloZNE,14072
+azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py,sha256=bPePVS7-5gvzEtpKxlDBamxNwetBJTqf5nCMt6Wu7ao,3050
+azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py,sha256=O_klXEBsmYJyyqoIGN5gRg0udnvTKtaouQmEeMGpOgM,4331
+azure/ai/evaluation/_evaluators/_content_safety/__init__.py,sha256=PEYMIybfP64f7byhuTaiq4RiqsYbjqejpW1JsJIG1jA,556
 azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py,sha256=uM6BL9jwtv5nJpSchezTYc-E514_VCTN2pACy7oxHuU,3928
-azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py,sha256=VlBDKrB2uNXX8ccV7aKsHM74_mK6JLUZ9SAGcvR3Lk8,2733
 azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py,sha256=n5fdL0TPelJY_AuiamkLO7Jiv7P-7gIZqipo5ShyoR8,11958
-azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=
-azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=
-azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=
-azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=
+azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py,sha256=YIQLFj6P7WXAyRRHVOflikUePN5sMCanJQmnIpSDeY0,1856
+azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py,sha256=BFRNVDehDe7Qgnt3k1zRzscFQmn_miuHaSpjOytFhds,1810
+azure/ai/evaluation/_evaluators/_content_safety/_sexual.py,sha256=Ap4EvHDPF8YjJ_esKEK83yusSR_xYhJWn6HIn1mkwW0,1788
+azure/ai/evaluation/_evaluators/_content_safety/_violence.py,sha256=HbLsZOqLopr0beDeHW85EmqSMFcTCZIYGUYvxUq_-gM,1804
 azure/ai/evaluation/_evaluators/_eci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=
+azure/ai/evaluation/_evaluators/_eci/_eci.py,sha256=EPy_A4BtqHm_10kHApi9xZ2eHYU5CjVGtkfG4zUMRhs,2411
 azure/ai/evaluation/_evaluators/_f1_score/__init__.py,sha256=aEVbO7iMoF20obdpLQKcKm69Yyu3mYnblKELLqu8OGI,260
 azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py,sha256=Cgp-hANXtrNfe1q6SVm-CUG3UEP8Gj1pvH293ay57OI,4554
 azure/ai/evaluation/_evaluators/_fluency/__init__.py,sha256=EEJw39xRa0bOAA1rELTTKXQu2s60n_7CZQRD0Gu2QVw,259
-azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=
-azure/ai/evaluation/_evaluators/_fluency/fluency.prompty,sha256=
+azure/ai/evaluation/_evaluators/_fluency/_fluency.py,sha256=k7lq7qRoiI2SaHPqLhW-Frm_STRK-hFHsbAFOejAU7s,2459
+azure/ai/evaluation/_evaluators/_fluency/fluency.prompty,sha256=xdznyssZDQiLELv4ecC-8uUJ4ssM-iij7A6S1aDsxOQ,2403
 azure/ai/evaluation/_evaluators/_gleu/__init__.py,sha256=Ae2EvQ7gqiYAoNO3LwGIhdAAjJPJDfT85rQGKrRrmbA,260
 azure/ai/evaluation/_evaluators/_gleu/_gleu.py,sha256=m02wmIGjdoXjp9dwjnFQAKA8hGOUOTvpppDf2CD4QQo,2326
 azure/ai/evaluation/_evaluators/_groundedness/__init__.py,sha256=UYNJUeRvBwcSVFyZpdsf29un5eyaDzYoo3QvC1gvlLg,274
-azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=
-azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty,sha256=
+azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py,sha256=FREk-1w_K6oF74eiNii5EdRS4uK_NUxW0dLd5Kzgj6c,2682
+azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty,sha256=ylgxKa_xipb7wN_QwxSnjrD9AhKcJQCv8pPpWPwFfGg,3023
 azure/ai/evaluation/_evaluators/_meteor/__init__.py,sha256=209na3pPsdmcuYpYHUYtqQybCpc3yZkc93HnRdicSlI,266
 azure/ai/evaluation/_evaluators/_meteor/_meteor.py,sha256=K3EdRuRcuEZYVIlI2jMEp0O9KJYXQB2o6h08q43oKWY,3316
 azure/ai/evaluation/_evaluators/_protected_material/__init__.py,sha256=eRAQIU9diVXfO5bp6aLWxZoYUvOsrDIfy1gnDOeNTiI,109
-azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=
-azure/ai/evaluation/_evaluators/_protected_materials/__init__.py,sha256=A12UsRVIebGvy9FtZLBPsOIAWUskBt8iuhRdILyRcSo,112
-azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py,sha256=xF5jHhM29OXh2sHrnXkYtiRYltuVU-fqC7xToiI3WOM,4136
+azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py,sha256=KT8AMLL9nGvI_KVwpUAcZJcPqPN_QNyQMkkmIhmexNE,2117
 azure/ai/evaluation/_evaluators/_qa/__init__.py,sha256=bcXfT--C0hjym2haqd1B2-u9bDciyM0ThOFtU1Q69sk,244
 azure/ai/evaluation/_evaluators/_qa/_qa.py,sha256=w9XJOfDof78mfOpc7tbPF5wec9dGPFmXXAdR5yx2buI,3502
 azure/ai/evaluation/_evaluators/_relevance/__init__.py,sha256=JlxytW32Nl8pbE-fI3GRpfgVuY9EG6zxIAn5VZGSwyc,265
-azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=
-azure/ai/evaluation/_evaluators/_relevance/relevance.prompty,sha256=
+azure/ai/evaluation/_evaluators/_relevance/_relevance.py,sha256=tHBHzp2wz3szgfA24HQgphP4mF5iJfg-lw6bVqgqkpY,2934
+azure/ai/evaluation/_evaluators/_relevance/relevance.prompty,sha256=QNWlrWxObUPlXFF1hdCDVpfXuw0QDOxHUtWLj1MwrxA,3559
+azure/ai/evaluation/_evaluators/_retrieval/__init__.py,sha256=kMu47ZyTZ7f-4Yh6H3KHxswmxitmPJ8FPSk90qgR0XI,265
+azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py,sha256=u6OqyZ62JpHmYatepRW5aRbtwu1sZByVSCDj_CRZSj8,5160
+azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty,sha256=HbQu5Gy9Ghw9r8vGCF-4ui441JBD8w45NOU_9ehamd0,1585
 azure/ai/evaluation/_evaluators/_rouge/__init__.py,sha256=kusCDaYcXogDugGefRP8MQSn9xv107oDbrMCqZ6K4GA,291
 azure/ai/evaluation/_evaluators/_rouge/_rouge.py,sha256=28vqjjleeJR5VRsQP5VCCMX_PVUUVxkgh4c3xIvwmXE,3526
 azure/ai/evaluation/_evaluators/_similarity/__init__.py,sha256=V2Mspog99_WBltxTkRHG5NpN5s9XoiTSN4I8POWEkLA,268
-azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=
-azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=
+azure/ai/evaluation/_evaluators/_similarity/_similarity.py,sha256=m4Ub7EcGMHotcOll_WIEYrvUWV1hMjF6K1VGthkEoqk,3883
+azure/ai/evaluation/_evaluators/_similarity/similarity.prompty,sha256=eoludASychZoGL625bFCaZai-OY7DIAg90ZLax_o4XE,4594
 azure/ai/evaluation/_evaluators/_xpia/__init__.py,sha256=VMEL8WrpJQeh4sQiOLzP7hRFPnjzsvwfvTzaGCVJPCM,88
-azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=
+azure/ai/evaluation/_evaluators/_xpia/xpia.py,sha256=Mg0nhT00VPgfBqp0Pu-7C4Unf6MEu8yNMFW-Wu7RTXw,2556
 azure/ai/evaluation/simulator/__init__.py,sha256=UtlcXo3SteIQEW_hW2WMhtqLNiDiIGLeW_lIkEUNoMc,486
 azure/ai/evaluation/simulator/_adversarial_scenario.py,sha256=SxpyMw5wmM5-fiUjl1_oJH0GQEnsa7ASso10MAr2Hjw,1030
 azure/ai/evaluation/simulator/_adversarial_simulator.py,sha256=kOL31FcD7vXTpkeFUooASXNaFTe9Vme5st_i0Qa_9sA,20542
 azure/ai/evaluation/simulator/_constants.py,sha256=xM-Or2x7RytfoeBM3N7Vt4JQDJX66UdL3CPz0YN5rvE,485
 azure/ai/evaluation/simulator/_direct_attack_simulator.py,sha256=zFYYdk8Sdg4-_HSd_rumM0LizPevcR57HjqvEdowv8c,11691
 azure/ai/evaluation/simulator/_indirect_attack_simulator.py,sha256=qALFN3LG5o1kSjMjdlLeJInax8GcjD1iPUZCayJp0Kc,9628
-azure/ai/evaluation/simulator/_simulator.py,sha256=
+azure/ai/evaluation/simulator/_simulator.py,sha256=V9xNOwDRTlK9Xf1SyRK4yv8j3pTFd_4D79BYanePoDw,32187
 azure/ai/evaluation/simulator/_tracing.py,sha256=LRPjsVLe9VohmXowFr9aCK_VwD0MHd1CBe8rl9jGQhU,3032
 azure/ai/evaluation/simulator/_utils.py,sha256=aXH5GdzQrwluKvYofWtdT0s_nzgVHS2hP6x4rc5zt-E,4287
 azure/ai/evaluation/simulator/_conversation/__init__.py,sha256=MNfFW4UDsVrk1p2ysIvmYlLzHqjKfxExktQXfSRiBPk,12774
@@ -91,9 +90,9 @@ azure/ai/evaluation/simulator/_model_tools/_rai_client.py,sha256=Bi0tLNlJmz295md
 azure/ai/evaluation/simulator/_model_tools/_template_handler.py,sha256=gGSMvveKWn0LKSQ4FS5AxIwcsxj6iqCbUP53yjvndPw,5471
 azure/ai/evaluation/simulator/_model_tools/models.py,sha256=11O6jcj3Zwo4FZvmF-X0walNp22ux1k3ghi3KFtbdy0,21762
 azure/ai/evaluation/simulator/_prompty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=
+azure/ai/evaluation/simulator/_prompty/task_query_response.prompty,sha256=wUiDKFL_vnAk3eEW66z33UgNML8Wqd_ReCzqfEBMId8,2350
 azure/ai/evaluation/simulator/_prompty/task_simulate.prompty,sha256=00zLVfNgHZdlbC2XvBedSrwDJOaAhl3B1ohE3LKsGg4,928
-azure_ai_evaluation-1.0.
-azure_ai_evaluation-1.0.
-azure_ai_evaluation-1.0.
-azure_ai_evaluation-1.0.
+azure_ai_evaluation-1.0.0b3.dist-info/METADATA,sha256=2jKCZxNNYmFeTFJo87_qO32_nAE9ur1YwSSwnn2Mi9I,17077
+azure_ai_evaluation-1.0.0b3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+azure_ai_evaluation-1.0.0b3.dist-info/top_level.txt,sha256=S7DhWV9m80TBzAhOFjxDUiNbKszzoThbnrSz5MpbHSQ,6
+azure_ai_evaluation-1.0.0b3.dist-info/RECORD,,
azure/ai/evaluation/_evaluators/_chat/_chat.py (removed)

@@ -1,357 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-import json
-import logging
-from concurrent.futures import as_completed
-from typing import Dict, List
-
-import numpy as np
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from .._coherence import CoherenceEvaluator
-from .._fluency import FluencyEvaluator
-from .._groundedness import GroundednessEvaluator
-from .._relevance import RelevanceEvaluator
-from .retrieval import RetrievalChatEvaluator
-
-logger = logging.getLogger(__name__)
-
-
-class ChatEvaluator:
-    """
-    Initialize a chat evaluator configured for a specific Azure OpenAI model.
-
-    :param model_config: Configuration for the Azure OpenAI model.
-    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
-        focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
-    :type eval_last_turn: bool
-    :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
-        Default is True.
-    :type parallel: bool
-    :return: A function that evaluates and generates metrics for "chat" scenario.
-    :rtype: Callable
-
-    **Usage**
-
-    .. code-block:: python
-
-        chat_eval = ChatEvaluator(model_config)
-        conversation = [
-            {"role": "user", "content": "What is the value of 2 + 2?"},
-            {"role": "assistant", "content": "2 + 2 = 4", "context": {
-                "citations": [
-                    {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
-                ]
-            }
-            }
-        ]
-        result = chat_eval(conversation=conversation)
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "evaluation_per_turn": {
-                "gpt_retrieval": [1.0, 2.0],
-                "gpt_groundedness": [5.0, 2.0],
-                "gpt_relevance": [3.0, 5.0],
-                "gpt_coherence": [1.0, 2.0],
-                "gpt_fluency": [3.0, 5.0]
-            }
-            "gpt_retrieval": 1.5,
-            "gpt_groundedness": 3.5,
-            "gpt_relevance": 4.0,
-            "gpt_coherence": 1.5,
-            "gpt_fluency": 4.0
-        }
-    """
-
-    def __init__(
-        self,
-        model_config: dict,
-        eval_last_turn: bool = False,
-        parallel: bool = True,
-    ):
-        self._eval_last_turn = eval_last_turn
-        self._parallel = parallel
-
-        # TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
-        self._rag_evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-        ]
-        self._non_rag_evaluators = [
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-        ]
-        # TODO: Temporary workaround to close the gap of missing retrieval score
-        # https://msdata.visualstudio.com/Vienna/_workitems/edit/3186644
-        # For long term, we need to add a built-in evaluator for retrieval after prompt is generalized for QA and Chat
-        self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)
-
-    def __call__(self, *, conversation, **kwargs):
-        """
-        Evaluates chat scenario.
-
-        :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
-            "context" key is optional for assistant's turn and should have "citations" key with list of citations.
-        :paramtype conversation: List[Dict]
-        :return: The scores for Chat scenario.
-        :rtype: dict
-        """
-        self._validate_conversation(conversation)
-
-        # Extract queries, responses and contexts from conversation
-        queries = []
-        responses = []
-        contexts = []
-
-        if self._eval_last_turn:
-            # Process only the last two turns if _eval_last_turn is True
-            conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
-        else:
-            conversation_slice = conversation
-
-        for each_turn in conversation_slice:
-            role = each_turn["role"]
-            if role == "user":
-                queries.append(each_turn["content"])
-            elif role == "assistant":
-                responses.append(each_turn["content"])
-                if "context" in each_turn and "citations" in each_turn["context"]:
-                    citations = json.dumps(each_turn["context"]["citations"])
-                    contexts.append(citations)
-
-        # Select evaluators to be used for evaluation
-        compute_rag_based_metrics = True
-        if len(responses) != len(contexts):
-            safe_message = (
-                "Skipping rag based metrics as we need citations or "
-                "retrieved_documents in context key of every assistant's turn"
-            )
-            logger.warning(safe_message)
-            compute_rag_based_metrics = False
-
-        selected_evaluators = []
-        selected_evaluators.extend(self._non_rag_evaluators)
-        if compute_rag_based_metrics:
-            selected_evaluators.extend(self._rag_evaluators)
-
-        # Evaluate each turn
-        per_turn_results = []
-        for turn_num in range(len(queries)):
-            current_turn_result = {}
-
-            if self._parallel:
-                # Parallel execution
-                with ThreadPoolExecutor() as executor:
-                    future_to_evaluator = {
-                        executor.submit(
-                            self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
-                        ): evaluator
-                        for evaluator in selected_evaluators
-                    }
-
-                    for future in as_completed(future_to_evaluator):
-                        result = future.result()
-                        current_turn_result.update(result)
-            else:
-                # Sequential execution
-                for evaluator in selected_evaluators:
-                    async_evaluator = evaluator._to_async()
-                    result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
-                    current_turn_result.update(result)
-
-            per_turn_results.append(current_turn_result)
-
-        # Aggregate results
-        # Final aggregated results for a conversation will look like:
-        # "gpt_groundedness": 2.0, # Mean of all groundedness scores
-        # "evaluation_per_turn": {
-        #     "gpt_groundedness": {
-        #         "score": [1.0, ...],
-        #         "reason": ["reason1", ...],
-        #     },
-        # },
-        # }
-        aggregated = self._aggregate_results(per_turn_results)
-
-        # Run RetrievalChatEvaluator and merge the results
-        if compute_rag_based_metrics:
-            retrieval_score = self._retrieval_chat_evaluator(conversation=conversation_slice)
-            aggregated["gpt_retrieval"] = retrieval_score["gpt_retrieval"]
-            aggregated["evaluation_per_turn"]["gpt_retrieval"] = retrieval_score["evaluation_per_turn"]["gpt_retrieval"]
-            aggregated = dict(sorted(aggregated.items()))
-
-        return aggregated
-
-    def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
-        try:
-            query = queries[turn_num] if turn_num < len(queries) else ""
-            response = responses[turn_num] if turn_num < len(responses) else ""
-            context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-            score = evaluator(query=query, response=response, context=context)
-
-            return score
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logger.warning(
-                "Evaluator %s failed for turn %s with exception: %s", evaluator.__class__.__name__, turn_num + 1, e
-            )
-            return {}
-
-    def _aggregate_results(self, per_turn_results: List[Dict]):
-        scores = {}
-        reasons = {}
-
-        for turn in per_turn_results:
-            for metric, value in turn.items():
-                if "reason" in metric:
-                    if metric not in reasons:
-                        reasons[metric] = []
-                    reasons[metric].append(value)
-                else:
-                    if metric not in scores:
-                        scores[metric] = []
-                    scores[metric].append(value)
-
-        aggregated = {}
-        evaluation_per_turn = {}
-
-        for metric, values in scores.items():
-            aggregated[metric] = np.nanmean(values)
-
-            # Prepare per-turn evaluations
-            evaluation_per_turn[metric] = {"score": values}
-            reason_key = f"{metric}_reason"
-            if reason_key in reasons:
-                evaluation_per_turn[metric]["reason"] = reasons[reason_key]
-
-        aggregated["evaluation_per_turn"] = evaluation_per_turn
-
-        return aggregated
-
-    def _validate_conversation(self, conversation: List[Dict]):
-        if conversation is None or not isinstance(conversation, list):
-            msg = "conversation must be a list of dictionaries"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.CHAT_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        expected_role = "user"
-        for turn_num, turn in enumerate(conversation):
-            one_based_turn_num = turn_num + 1
-
-            if not isinstance(turn, dict):
-                msg = f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if "role" not in turn or "content" not in turn:
-                msg = (
-                    "Each turn in 'conversation' must have 'role' and 'content' keys. "
-                    + f"Turn number: {one_based_turn_num}"
-                )
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if turn["role"] != expected_role:
-                msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if not isinstance(turn["content"], str):
-                msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}"
-                raise EvaluationException(
-                    message=msg,
-                    internal_message=msg,
-                    target=ErrorTarget.CHAT_EVALUATOR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    blame=ErrorBlame.USER_ERROR,
-                )
-
-            if turn["role"] == "assistant" and "context" in turn:
-                if not isinstance(turn["context"], dict):
-                    msg = f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}"
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                if "citations" not in turn["context"]:
-                    msg = (
-                        f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
-                    )
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.MISSING_FIELD,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                if not isinstance(turn["context"]["citations"], list):
-                    msg = f"'citations' in context must be a list. Turn number: {one_based_turn_num}"
-                    raise EvaluationException(
-                        message=msg,
-                        internal_message=msg,
-                        target=ErrorTarget.CHAT_EVALUATOR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        blame=ErrorBlame.USER_ERROR,
-                    )
-
-                for citation_num, citation in enumerate(turn["context"]["citations"]):
-                    if not isinstance(citation, dict):
-                        msg = (
-                            "Each citation in 'citations' must be a dictionary. "
-                            + f"Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
-                        )
-                        raise EvaluationException(
-                            message=msg,
-                            internal_message=msg,
-                            target=ErrorTarget.CHAT_EVALUATOR,
-                            category=ErrorCategory.INVALID_VALUE,
-                            blame=ErrorBlame.USER_ERROR,
-                        )
-
-            # Toggle expected role for the next turn
-            expected_role = "user" if expected_role == "assistant" else "assistant"
-
-        # Ensure the conversation ends with an assistant's turn
-        if expected_role != "user":
-            msg = "The conversation must end with an assistant's turn."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.CHAT_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
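A note on the removed `_aggregate_results` helper above: it split each turn's output into score entries and `*_reason` entries, averaged the scores with `numpy.nanmean`, and kept the raw per-turn lists under `evaluation_per_turn`. A minimal standalone sketch of that aggregation, using hypothetical sample scores rather than data from this package:

```python
import numpy as np

# Hypothetical per-turn outputs, shaped like the removed ChatEvaluator produced.
per_turn_results = [
    {"gpt_groundedness": 5.0, "gpt_groundedness_reason": "Fully grounded."},
    {"gpt_groundedness": 2.0, "gpt_groundedness_reason": "Partially grounded."},
]

scores, reasons = {}, {}
for turn in per_turn_results:
    for metric, value in turn.items():
        bucket = reasons if "reason" in metric else scores
        bucket.setdefault(metric, []).append(value)

# Mean per metric, plus the per-turn score/reason lists, mirroring the removed logic.
aggregated = {metric: float(np.nanmean(values)) for metric, values in scores.items()}
aggregated["evaluation_per_turn"] = {
    metric: {"score": values, "reason": reasons.get(f"{metric}_reason", [])}
    for metric, values in scores.items()
}
print(aggregated)  # {'gpt_groundedness': 3.5, 'evaluation_per_turn': {...}}
```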
azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py (removed)

@@ -1,9 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-from ._retrieval import RetrievalChatEvaluator
-
-__all__ = [
-    "RetrievalChatEvaluator",
-]
azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py (removed)

@@ -1,65 +0,0 @@
-# ---------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
-
-from abc import ABC
-
-from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-
-class ContentSafetyEvaluatorBase(ABC):
-    """
-    Initialize a evaluator for a specified Evaluation Metric. Base class that is not
-    meant to be instantiated by users.
-
-
-    :param metric: The metric to be evaluated.
-    :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
-    """
-
-    def __init__(self, metric: EvaluationMetrics, azure_ai_project: dict, credential=None):
-        self._metric = metric
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=self._metric,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result