azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +80 -29
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +6 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_simulator.py +51 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -2,114 +2,31 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-import json
 import logging
-import math
 import os
-from typing import
+from typing import Dict, List, Union
+from typing_extensions import overload, override

-from
-from
-
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from ..._common.math import list_mean_nan_safe
-from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation

 logger = logging.getLogger(__name__)

-try:
-    from .._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"

+class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """
+    Evaluates retrieval score for a given query and context or a multi-turn conversation, including reasoning.

-
-
-    _PROMPTY_FILE = "retrieval.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+    The retrieval measure assesses the AI system's performance in retrieving information
+    for additional context (e.g. a RAG scenario).

-
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
+    Retrieval scores range from 1 to 5, with 1 being the worst and 5 being the best.

-
-
-
-
-
-        if conversation:
-            # Extract queries, responses and contexts from conversation
-            queries = []
-            responses = []
-            contexts = []
-
-            conversation = conversation.get("messages", None)
-
-            for each_turn in conversation:
-                role = each_turn["role"]
-                if role == "user":
-                    queries.append(each_turn["content"])
-                elif role == "assistant":
-                    responses.append(each_turn["content"])
-                    if "context" in each_turn:
-                        if "citations" in each_turn["context"]:
-                            citations = json.dumps(each_turn["context"]["citations"])
-                            contexts.append(citations)
-                        elif isinstance(each_turn["context"], str):
-                            contexts.append(each_turn["context"])
-
-            # Evaluate each turn
-            per_turn_scores = []
-            per_turn_reasons = []
-            for turn_num, turn_query in enumerate(queries):
-                try:
-                    if turn_num >= len(queries):
-                        turn_query = ""
-                    context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-                    llm_output = await self._flow(
-                        query=turn_query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-                    )
-                    score, reason = parse_quality_evaluator_reason_score(llm_output)
-                    per_turn_scores.append(score)
-                    per_turn_reasons.append(reason)
-
-                except Exception as e:  # pylint: disable=broad-exception-caught
-                    logger.warning(
-                        "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
-                    )
-
-                    per_turn_scores.append(math.nan)
-                    per_turn_reasons.append("")
-
-            mean_per_turn_score = list_mean_nan_safe(per_turn_scores)
-
-            return {
-                "retrieval": mean_per_turn_score,
-                "gpt_retrieval": mean_per_turn_score,
-                "evaluation_per_turn": {
-                    "gpt_retrieval": per_turn_scores,
-                    "retrieval": per_turn_scores,
-                    "retrieval_reason": per_turn_reasons,
-                },
-            }
-        llm_output = await self._flow(query=query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs)
-        score, reason = parse_quality_evaluator_reason_score(llm_output)
-
-        return {
-            "retrieval": score,
-            "retrieval_reason": reason,
-            "gpt_retrieval": score,
-        }
-
-
-class RetrievalEvaluator:
-    """
-    Initialize an evaluator configured for a specific Azure OpenAI model.
+    High retrieval scores indicate that the AI system has successfully extracted and ranked
+    the most relevant information at the top, without introducing bias from external knowledge
+    and ignoring factual correctness. Conversely, low retrieval scores suggest that the AI system
+    has failed to surface the most relevant context chunks at the top of the list
+    and/or introduced bias and ignored factual correctness.

     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
@@ -117,45 +34,68 @@ class RetrievalEvaluator:
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        result = chat_eval(conversation=conversation)
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gpt_retrieval": 3.0,
-            "retrieval": 3.0,
-            "evaluation_per_turn": {
-                "gpt_retrieval": [1.0, 2.0, 3.0],
-                "retrieval": [1.0, 2.0, 3.0],
-                "retrieval_reason": ["<reasoning for score 1>", "<reasoning for score 2>", "<reasoning for score 3>"]
-            }
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START retrieval_evaluator]
+            :end-before: [END retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RetrievalEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-
-
+    _PROMPTY_FILE = "retrieval.prompty"
+    _RESULT_KEY = "retrieval"
+
+    id = "azureml://registries/azureml/models/Retrieval-Evaluator/versions/1"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config):  # pylint: disable=super-init-not-called
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        context: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluates retrieval for a given a query and context
+
+        :keyword query: The query to be evaluated. Mutually exclusive with `conversation` parameter.
+        :paramtype query: Optional[str]
+        :keyword context: The context to be evaluated. Mutually exclusive with `conversation` parameter.
+        :paramtype context: Optional[str]
+        :return: The scores for Chat scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword conversation: The conversation to be evaluated.
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The scores for Chat scenario.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """

-
+    @override
+    def __call__(self, *args, **kwargs):  # pylint: disable=docstring-missing-param
         """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation,
         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
         the evaluator will aggregate the results of each turn.
@@ -167,31 +107,6 @@ class RetrievalEvaluator:
         :keyword conversation: The conversation to be evaluated.
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The scores for Chat scenario.
-        :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        :rtype: :rtype: Dict[str, Union[float, Dict[str, List[str, float]]]]
         """
-
-            msg = "Either a pair of 'query'/'context' or 'conversation' must be provided."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.RETRIEVAL_EVALUATOR,
-            )
-
-        if (query or context) and conversation:
-            msg = "Either a pair of 'query'/'context' or 'conversation' must be provided, but not both."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.RETRIEVAL_EVALUATOR,
-            )
-
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, context=context, conversation=conversation, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(*args, **kwargs)
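In 1.0.1 the retrieval evaluator is a PromptyEvaluatorBase subclass constructed from a model_config and called through the query/context or conversation overloads shown above. A minimal usage sketch, assuming the package-level RetrievalEvaluator export; the endpoint, key, deployment, and sample turns below are illustrative placeholders, not taken from this diff:

    from azure.ai.evaluation import RetrievalEvaluator

    # Placeholder Azure OpenAI configuration; substitute real endpoint/deployment values.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    retrieval_eval = RetrievalEvaluator(model_config=model_config)

    # Single-turn overload: a query plus the retrieved context.
    single_turn = retrieval_eval(
        query="What is the capital of Japan?",
        context="Tokyo is the capital of Japan.",
    )
    # Per the docstring, both "retrieval" and the legacy "gpt_retrieval" keys are returned,
    # along with a reasoning string.

    # Conversation overload: per-turn scores are aggregated, with per-turn detail under
    # "evaluation_per_turn".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is the capital of Japan."},
        ]
    }
    multi_turn = retrieval_eval(conversation=conversation)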
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -6,10 +6,9 @@ from enum import Enum
 from promptflow._utils.async_utils import async_run_allowing_running_loop

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
-from azure.core import CaseInsensitiveEnumMeta


-class RougeType(
+class RougeType(Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
@@ -38,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
         self._rouge_type = rouge_type

     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
         return {
             "rouge_precision": metrics.precision,
             "rouge_recall": metrics.recall,
@@ -49,34 +48,34 @@ class _AsyncRougeScoreEvaluator:

 class RougeScoreEvaluator:
     """
-
+    Calculates the ROUGE score for a given response and ground truth.

-    ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
-
-    ROUGE
-
-
+    The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
+    generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
+    ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
+    the generated text matches the reference text. Rouge type options are "rouge1" (Unigram overlap), "rouge2"
+    (Bigram overlap), "rouge3" (Trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), "rougeL"
+    (L-graph overlap)

-
+    Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
+    other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
+    information from the reference text.

-
+    ROUGE scores range from 0 to 1, with higher scores indicating better quality.

-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:

-
-
-
-
-
-
-            "rouge_recall": 1.0,
-            "rouge_f1_score": 1.0
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
     """

+    id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, rouge_type: RougeType):
         self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)

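The `.value` fix above is the functional change: RougeType is now a plain Enum, so the member's string value must be passed to the vendored rouge_scorer rather than the member itself. A short sketch of calling the evaluator described in the new docstring; the RougeType.ROUGE_L member name is taken from the package's public enum rather than from this hunk, and the sample strings come from the removed docstring example:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # Pick the ROUGE variant; "rougeL" measures longest-common-subsequence overlap.
    rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)

    result = rouge_eval(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Returns "rouge_precision", "rouge_recall", and "rouge_f1_score", each in [0, 1].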
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -1,22 +1,26 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import
-from typing_extensions import override
+from typing import List, Union, Dict
+from typing_extensions import overload, override

 from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation


 @experimental
-class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     """
-
-
+    Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+    including reasoning.

-
-
+    The groundedness measure calls Azure AI Evaluation service to assess how well the AI-generated answer is grounded
+    in the source context. Even if the responses from LLM are factually correct, they'll be considered ungrounded if
+    they can't be verified against the provided sources (such as your input source or your database).
+
+    Service-based groundedness scores are boolean values, where True indicates that the response is grounded.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -26,64 +30,24 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any

-
-
-    .. code-block:: python
-
-        azure_ai_project = {
-            "subscription_id": "<subscription_id>",
-            "resource_group_name": "<resource_group_name>",
-            "project_name": "<project_name>",
-        }
-        credential = DefaultAzureCredential()
-
-        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
-        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "groundedness_pro_label": True,
-            "reason": "'All Contents are grounded"
-        }
-
-    **Usage with conversation input**
-
-    .. code-block:: python
+    .. admonition:: Example:

-
-
-
-
-
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

-
-        conversation = {
-            "messages": [
-                {"role": "user", "content": "What is the capital of France?"},
-                {"role": "assistant", "content": "Paris.", "context": "Paris."}
-                {"role": "user", "content": "What is the capital of Germany?"},
-                {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
-            ]
-        }
-        result = eval_fn(conversation=conversation)
+    .. note::

-
-
-    .. code-block:: python
-
-        {
-            "groundedness_pro_label": 1.0,
-            "evaluation_per_turn": {
-                "groundedness_pro_label": [True, True],
-                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
-            }
-        }
+        If this evaluator is supplied to the `evaluate` function, the aggregated metric
+        for the groundedness pro label will be "groundedness_pro_passing_rate".
     """

+    id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(
         self,
@@ -91,7 +55,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
         azure_ai_project,
         **kwargs,
     ):
-        self._passing_score =
+        self._passing_score = 5  # TODO update once the binarization PR is merged
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
@@ -100,14 +64,48 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
             **kwargs,
         )

-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        response: str,
+        context: str,
+        query: str,
+    ) -> Dict[str, Union[str, bool]]:
+        """Evaluate groundedness for a given query/response/context
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-
-
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+        """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has
+        more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+        available in the output under the "evaluation_per_turn" key.
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a
@@ -128,7 +126,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
         :return: The relevance score.
         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)

     @override
     async def _do_eval(self, eval_input: Dict):
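The new overloads mirror the usage that the removed docstring demonstrated. A sketch of the single-turn call, reusing the removed example's placeholders for the Azure AI project; the output key names are assumed to follow the "groundedness_pro" prefix set in __init__:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    # Placeholder project details; substitute your own subscription, resource group, and project.
    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    groundedness_pro_eval = GroundednessProEvaluator(
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),
    )

    # Single-turn overload: query, response, and the grounding context.
    result = groundedness_pro_eval(
        query="What's the capital of France?",
        response="Paris",
        context="Paris is the capital of France.",
    )
    # Expect a boolean "groundedness_pro_label" plus a reason string; when run through
    # `evaluate`, the aggregate appears as "groundedness_pro_passing_rate" per the note above.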
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -80,36 +80,42 @@ class _AsyncSimilarityEvaluator:

 class SimilarityEvaluator:
     """
-
+    Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.

-
-
-
+    The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+    AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+    the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+    the semantic meaning and context of the sentences.

-
+    Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+    tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+    text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.

-
+    Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.

-
-
-
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital.")
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]

-
+    .. admonition:: Example:

-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

-
-            "similarity": 3.0,
-            "gpt_similarity": 3.0,
-        }
+    .. note::

-
-
-
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

+    id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, model_config):
         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

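SimilarityEvaluator keeps its callable-wrapper shape but now documents the query/response/ground-truth inputs and publishes a cloud `id`. (Note that the Example admonition added in this hunk still points at the rouge_score_evaluator sample markers.) A brief sketch, assuming the package-level export and a placeholder model configuration; the response and ground-truth strings are the ones from the removed docstring example:

    from azure.ai.evaluation import SimilarityEvaluator

    # Placeholder Azure OpenAI configuration; substitute real endpoint/deployment values.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    similarity_eval = SimilarityEvaluator(model_config=model_config)

    result = similarity_eval(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Returns "similarity" and the legacy "gpt_similarity" key, each on the 1-5 scale.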