azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/rai_service.py +30 -21
- azure/ai/evaluation/_constants.py +19 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +16 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +76 -44
- azure/ai/evaluation/_evaluate/_utils.py +93 -34
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +140 -5
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +40 -2
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +6 -43
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +42 -82
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +24 -13
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +84 -15
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +47 -41
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType


 @experimental
@@ -71,6 +72,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )

     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType


 @experimental
@@ -65,6 +66,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.SELF_HARM,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )

     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType


 @experimental
@@ -67,6 +68,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.SEXUAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )

     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType


 @experimental
@@ -67,6 +68,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.VIOLENCE,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
        )

     @overload
```
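The four content-safety diffs above follow one pattern: each evaluator now passes `conversation_aggregation_type=_AggregationType.MAX` to its base class, so that when a multi-turn conversation is scored, the per-turn results are aggregated by taking the maximum rather than an average. A minimal sketch of what that aggregation amounts to; the turn scores and the helper function below are invented for illustration, the real logic lives in the new `_conversation_aggregators.py` and `EvaluatorBase`:

```python
from typing import List

# Hypothetical per-turn severity scores produced for one conversation.
turn_scores: List[float] = [0.0, 3.0, 1.0]


def aggregate_max(scores: List[float]) -> float:
    """Illustrative stand-in for _AggregationType.MAX: the conversation-level
    score is the worst (highest) per-turn score, not the mean."""
    return max(scores)


print(aggregate_max(turn_scores))  # 3.0 -- a single harmful turn dominates the result
```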
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

```diff
@@ -3,45 +3,44 @@
 # ---------------------------------------------------------

 from collections import Counter
-from typing import List
+from typing import List, Dict
+from typing_extensions import overload, override

-from
+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+class F1ScoreEvaluator(EvaluatorBase):
+    """
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.

-
-    def __init__(self):
-        pass
+    F1 Scores range from 0 to 1, with 1 being the best possible score.

-
-
-
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.

-
-
-
-        :paramtype ground_truth: str
-        :return: The F1 score.
-        :rtype: Dict[str, float]
-        """
-        # Validate inputs
-        if not (response and response.strip() and response != "None") or not (
-            ground_truth and ground_truth.strip() and ground_truth != "None"
-        ):
-            msg = "Both 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.

-        # Run f1 score computation.
-        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
+    """
+
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self):
+        super().__init__()

     @classmethod
     def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ class _AsyncF1ScoreEvaluator:

         return f1

+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce an f1 score evaluation result.

-
-
-
-
-
-
-
-
-
-        precision is the ratio of the number of shared words to the total number of words in the generation, and recall
-        is the ratio of the number of shared words to the total number of words in the ground truth.
-
-        Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
-        model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
-        information in the response.
-
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START f1_score_evaluator]
-            :end-before: [END f1_score_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call an F1ScoreEvaluator.
-    """
-
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        # Run f1 score computation.
+        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-
-        self._async_evaluator = _AsyncF1ScoreEvaluator()
+        return {"f1_score": f1_result}

-
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate F1 score.

@@ -149,9 +131,20 @@ class F1ScoreEvaluator:
         :rtype: Dict[str, float]
         """

-
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate F1 score.

-
-
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
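The class docstring above spells out the shared-word precision/recall definition, and the new `@overload` documents the supported call shape. A short sketch of both; the `shared_word_f1` helper is an illustrative re-derivation of the docstring's formula, not the SDK's `_compute_f1_score`, and the printed values are examples only:

```python
from collections import Counter

from azure.ai.evaluation import F1ScoreEvaluator


def shared_word_f1(response: str, ground_truth: str) -> float:
    """Illustrative version of the docstring's definition of F1 over shared words."""
    gen, ref = response.lower().split(), ground_truth.lower().split()
    shared = sum((Counter(gen) & Counter(ref)).values())  # words common to both
    if shared == 0:
        return 0.0
    precision = shared / len(gen)  # shared words / words in the generation
    recall = shared / len(ref)     # shared words / words in the ground truth
    return 2 * precision * recall / (precision + recall)


# Public API: keyword-only arguments, as in the @overload signature above.
result = F1ScoreEvaluator()(
    response="Tokyo is the capital of Japan",
    ground_truth="The capital of Japan is Tokyo",
)
print(result)  # {"f1_score": <float between 0 and 1>}
print(shared_word_f1("the capital of japan is tokyo",
                     "tokyo is the capital of japan"))  # 1.0 -- every word is shared
```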
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

```diff
@@ -1,28 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }

-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
     Calculates the GLEU (Google-BLEU) score between a response and the ground truth.

@@ -47,10 +35,32 @@ class GleuScoreEvaluator:
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self):
-
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+
+        return {
+            "gleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.

@@ -61,9 +71,21 @@ class GleuScoreEvaluator:
         :return: The GLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the GLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
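The calling convention is unchanged for users even though the async helper class is gone; a minimal usage sketch (inputs and the printed score are illustrative):

```python
from azure.ai.evaluation import GleuScoreEvaluator

gleu_eval = GleuScoreEvaluator()
result = gleu_eval(
    response="The cat sat on the mat.",
    ground_truth="The cat is sitting on the mat.",
)
print(result["gleu_score"])  # a float in [0, 1]; the exact value depends on tokenization
```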
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

```diff
@@ -1,38 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
+
 from nltk.translate.meteor_score import meteor_score
-from
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase


-class
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
-
-        ensure_nltk_data_downloaded()
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
-
-        return {
-            "meteor_score": score,
-        }
-
-
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
     Calculates the METEOR score for a given response and ground truth.

@@ -68,10 +46,41 @@ class MeteorScoreEvaluator:
     id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self.
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        super().__init__()

-
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
+        )
+
+        return {
+            "meteor_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.

@@ -82,9 +91,21 @@ class MeteorScoreEvaluator:
         :return: The METEOR score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
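As the diff shows, the alpha/beta/gamma knobs keep their previous defaults; only the internals moved into `_do_eval`. A minimal usage sketch (inputs are illustrative):

```python
from azure.ai.evaluation import MeteorScoreEvaluator

# alpha, beta and gamma shown explicitly with their defaults from the signature above.
meteor_eval = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
result = meteor_eval(
    response="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is the capital of Japan.",
)
print(result["meteor_score"])  # a float in [0, 1]
```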
azure/ai/evaluation/_evaluators/_qa/_qa.py

```diff
@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from
-from typing import Callable, Dict, List, Union
+from typing import Union

-from
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase

 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -15,7 +16,7 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator


-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.

@@ -46,9 +47,7 @@ class QAEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self, model_config, **kwargs):
-
-
-        self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
+        evaluators = [
             GroundednessEvaluator(model_config),
             RelevanceEvaluator(model_config),
             CoherenceEvaluator(model_config),
@@ -56,8 +55,31 @@ class QAEvaluator:
             SimilarityEvaluator(model_config),
             F1ScoreEvaluator(),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
+
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
+        """
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """

-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluates question-answering scenario.

@@ -72,22 +94,5 @@ class QAEvaluator:
         :return: The scores for QA scenario.
         :rtype: Dict[str, Union[str, float]]
         """
-
-
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+
+        return super().__call__(*args, **kwargs)
```
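The hand-rolled ThreadPoolExecutor fan-out is replaced by `MultiEvaluatorBase`, which runs the sub-evaluators and merges their result dictionaries. A minimal usage sketch; the `model_config` values are placeholders you must replace with your own Azure OpenAI endpoint, key, and deployment:

```python
from azure.ai.evaluation import QAEvaluator

# Placeholder Azure OpenAI configuration -- substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

qa_eval = QAEvaluator(model_config=model_config)
scores = qa_eval(
    query="Where is Tokyo?",
    response="Tokyo is in Japan.",
    context="Tokyo is the capital of Japan.",
    ground_truth="Tokyo is located in Japan.",
)
# One merged dict containing the keys from every sub-evaluator listed in __init__.
print(scores)
```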
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

```diff
@@ -3,9 +3,11 @@
 # ---------------------------------------------------------
 from enum import Enum

-from
+from typing import Dict
+from typing_extensions import overload, override

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+from azure.ai.evaluation._evaluators._common import EvaluatorBase


 class RougeType(Enum):
@@ -32,21 +34,7 @@ class RougeType(Enum):
     """Overlap of L-grams (L consecutive words) between generated and reference text."""


-class
-    def __init__(self, rouge_type: RougeType):
-        self._rouge_type = rouge_type
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
-        return {
-            "rouge_precision": metrics.precision,
-            "rouge_recall": metrics.recall,
-            "rouge_f1_score": metrics.fmeasure,
-        }
-
-
-class RougeScoreEvaluator:
+class RougeScoreEvaluator(EvaluatorBase):
     """
     Calculates the ROUGE score for a given response and ground truth.

@@ -76,10 +64,32 @@ class RougeScoreEvaluator:
     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, rouge_type: RougeType):
-        self.
+        self._rouge_type = rouge_type
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a rouge score evaluation result.

-
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+        return {
+            "rouge_precision": metrics.precision,
+            "rouge_recall": metrics.recall,
+            "rouge_f1_score": metrics.fmeasure,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the ROUGE score between the response and the ground truth.

@@ -90,9 +100,20 @@ class RougeScoreEvaluator:
         :return: The ROUGE score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate route score.
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The ROUGE score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
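The evaluator still returns the three-key dict shown in `_do_eval`. A minimal usage sketch, assuming `RougeType` is imported from the package root alongside the evaluator (inputs are illustrative):

```python
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)
result = rouge_eval(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Keys match the dict built in _do_eval above.
print(result)  # {"rouge_precision": ..., "rouge_recall": ..., "rouge_f1_score": ...}
```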