azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
Files changed (61)
  1. azure/ai/evaluation/_common/_experimental.py +4 -0
  2. azure/ai/evaluation/_common/math.py +62 -2
  3. azure/ai/evaluation/_common/rai_service.py +80 -29
  4. azure/ai/evaluation/_common/utils.py +50 -16
  5. azure/ai/evaluation/_constants.py +1 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  7. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  8. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
  9. azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
  10. azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
  11. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  12. azure/ai/evaluation/_evaluate/_utils.py +6 -4
  13. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
  19. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  20. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  21. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  22. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  23. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  24. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  25. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  26. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  28. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  29. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  30. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  31. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  32. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  33. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  34. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  36. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  37. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  38. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  39. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  41. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  42. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  44. azure/ai/evaluation/_exceptions.py +2 -0
  45. azure/ai/evaluation/_model_configurations.py +65 -14
  46. azure/ai/evaluation/_version.py +1 -1
  47. azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
  48. azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
  49. azure/ai/evaluation/simulator/_constants.py +11 -1
  50. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  53. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  54. azure/ai/evaluation/simulator/_simulator.py +51 -45
  55. azure/ai/evaluation/simulator/_utils.py +25 -7
  56. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
  57. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  59. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
  60. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
  61. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -2,114 +2,31 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- import json
  import logging
- import math
  import os
- from typing import Optional
+ from typing import Dict, List, Union
+ from typing_extensions import overload, override

- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from promptflow.core import AsyncPrompty
-
- from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
- from ..._common.math import list_mean_nan_safe
- from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+ from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation

  logger = logging.getLogger(__name__)

- try:
-     from .._user_agent import USER_AGENT
- except ImportError:
-     USER_AGENT = "None"

+ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """
+     Evaluates retrieval score for a given query and context or a multi-turn conversation, including reasoning.

- class _AsyncRetrievalScoreEvaluator:
-     # Constants must be defined within eval's directory to be save/loadable
-     _PROMPTY_FILE = "retrieval.prompty"
-     _LLM_CALL_TIMEOUT = 600
-     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+     The retrieval measure assesses the AI system's performance in retrieving information
+     for additional context (e.g. a RAG scenario).

-     def __init__(self, model_config: dict):
-         prompty_model_config = construct_prompty_model_config(
-             validate_model_config(model_config),
-             self._DEFAULT_OPEN_API_VERSION,
-             USER_AGENT,
-         )
+     Retrieval scores range from 1 to 5, with 1 being the worst and 5 being the best.

-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-     async def __call__(self, *, query, context, conversation, **kwargs):
-         if conversation:
-             # Extract queries, responses and contexts from conversation
-             queries = []
-             responses = []
-             contexts = []
-
-             conversation = conversation.get("messages", None)
-
-             for each_turn in conversation:
-                 role = each_turn["role"]
-                 if role == "user":
-                     queries.append(each_turn["content"])
-                 elif role == "assistant":
-                     responses.append(each_turn["content"])
-                     if "context" in each_turn:
-                         if "citations" in each_turn["context"]:
-                             citations = json.dumps(each_turn["context"]["citations"])
-                             contexts.append(citations)
-                         elif isinstance(each_turn["context"], str):
-                             contexts.append(each_turn["context"])
-
-             # Evaluate each turn
-             per_turn_scores = []
-             per_turn_reasons = []
-             for turn_num, turn_query in enumerate(queries):
-                 try:
-                     if turn_num >= len(queries):
-                         turn_query = ""
-                     context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-                     llm_output = await self._flow(
-                         query=turn_query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-                     )
-                     score, reason = parse_quality_evaluator_reason_score(llm_output)
-                     per_turn_scores.append(score)
-                     per_turn_reasons.append(reason)
-
-                 except Exception as e: # pylint: disable=broad-exception-caught
-                     logger.warning(
-                         "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
-                     )
-
-                     per_turn_scores.append(math.nan)
-                     per_turn_reasons.append("")
-
-             mean_per_turn_score = list_mean_nan_safe(per_turn_scores)
-
-             return {
-                 "retrieval": mean_per_turn_score,
-                 "gpt_retrieval": mean_per_turn_score,
-                 "evaluation_per_turn": {
-                     "gpt_retrieval": per_turn_scores,
-                     "retrieval": per_turn_scores,
-                     "retrieval_reason": per_turn_reasons,
-                 },
-             }
-         llm_output = await self._flow(query=query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs)
-         score, reason = parse_quality_evaluator_reason_score(llm_output)
-
-         return {
-             "retrieval": score,
-             "retrieval_reason": reason,
-             "gpt_retrieval": score,
-         }
-
-
- class RetrievalEvaluator:
-     """
-     Initialize an evaluator configured for a specific Azure OpenAI model.
+     High retrieval scores indicate that the AI system has successfully extracted and ranked
+     the most relevant information at the top, without introducing bias from external knowledge
+     and ignoring factual correctness. Conversely, low retrieval scores suggest that the AI system
+     has failed to surface the most relevant context chunks at the top of the list
+     and/or introduced bias and ignored factual correctness.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
@@ -117,45 +34,68 @@ class RetrievalEvaluator:
      :return: A function that evaluates and generates metrics for "chat" scenario.
      :rtype: Callable

-     **Usage**
-
-     .. code-block:: python
-
-         chat_eval = RetrievalEvaluator(model_config)
-         conversation = {
-             "messages": [
-                 {"role": "user", "content": "What is the value of 2 + 2?"},
-                 {
-                     "role": "assistant", "content": "2 + 2 = 4",
-                     "context": "From 'math_doc.md': Information about additions: 1 + 2 = 3, 2 + 2 = 4"
-                 }
-             ]
-         }
-         result = chat_eval(conversation=conversation)
-
-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "gpt_retrieval": 3.0,
-             "retrieval": 3.0,
-             "evaluation_per_turn": {
-                 "gpt_retrieval": [1.0, 2.0, 3.0],
-                 "retrieval": [1.0, 2.0, 3.0],
-                 "retrieval_reason": ["<reasoning for score 1>", "<reasoning for score 2>", "<reasoning for score 3>"]
-             }
-         }
-
-     Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-     To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START retrieval_evaluator]
+             :end-before: [END retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a RetrievalEvaluator.
+
+     .. note::
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
      """

-     def __init__(self, model_config):
-         self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config)
+     _PROMPTY_FILE = "retrieval.prompty"
+     _RESULT_KEY = "retrieval"
+
+     id = "azureml://registries/azureml/models/Retrieval-Evaluator/versions/1"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config): # pylint: disable=super-init-not-called
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         context: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluates retrieval for a given a query and context
+
+         :keyword query: The query to be evaluated. Mutually exclusive with `conversation` parameter.
+         :paramtype query: Optional[str]
+         :keyword context: The context to be evaluated. Mutually exclusive with `conversation` parameter.
+         :paramtype context: Optional[str]
+         :return: The scores for Chat scenario.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn.
+
+         :keyword conversation: The conversation to be evaluated.
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The scores for Chat scenario.
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """

-     def __call__(self, *, query: Optional[str] = None, context: Optional[str] = None, conversation=None, **kwargs):
+     @override
+     def __call__(self, *args, **kwargs): # pylint: disable=docstring-missing-param
          """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation,
          or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
          the evaluator will aggregate the results of each turn.
@@ -167,31 +107,6 @@ class RetrievalEvaluator:
          :keyword conversation: The conversation to be evaluated.
          :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
          :return: The scores for Chat scenario.
-         :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         :rtype: :rtype: Dict[str, Union[float, Dict[str, List[str, float]]]]
          """
-         if (query is None or context is None) and conversation is None:
-             msg = "Either a pair of 'query'/'context' or 'conversation' must be provided."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.MISSING_FIELD,
-                 target=ErrorTarget.RETRIEVAL_EVALUATOR,
-             )
-
-         if (query or context) and conversation:
-             msg = "Either a pair of 'query'/'context' or 'conversation' must be provided, but not both."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.INVALID_VALUE,
-                 target=ErrorTarget.RETRIEVAL_EVALUATOR,
-             )
-
-         return async_run_allowing_running_loop(
-             self._async_evaluator, query=query, context=context, conversation=conversation, **kwargs
-         )
-
-     def _to_async(self):
-         return self._async_evaluator
+         return super().__call__(*args, **kwargs)
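In practical terms, these hunks swap the promptflow-backed _AsyncRetrievalScoreEvaluator for a PromptyEvaluatorBase subclass with typed __call__ overloads. A minimal usage sketch of the 1.0.1 surface, assuming RetrievalEvaluator is importable from the public azure.ai.evaluation namespace and treating the model configuration values below as placeholders:

    from azure.ai.evaluation import RetrievalEvaluator

    # Placeholder Azure OpenAI model configuration; AzureOpenAIModelConfiguration is a TypedDict,
    # so a plain dict with these keys is expected to be accepted.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    retrieval_eval = RetrievalEvaluator(model_config)

    # Single-turn overload: query/context keywords, mutually exclusive with `conversation`.
    single_turn = retrieval_eval(
        query="What is the value of 2 + 2?",
        context="From 'math_doc.md': Information about additions: 1 + 2 = 3, 2 + 2 = 4",
    )

    # Conversation overload: per-turn scores are aggregated and also returned
    # under the "evaluation_per_turn" key.
    multi_turn = retrieval_eval(conversation={
        "messages": [
            {"role": "user", "content": "What is the value of 2 + 2?"},
            {"role": "assistant", "content": "2 + 2 = 4",
             "context": "From 'math_doc.md': Information about additions: 1 + 2 = 3, 2 + 2 = 4"},
        ]
    })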
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -6,10 +6,9 @@ from enum import Enum
  from promptflow._utils.async_utils import async_run_allowing_running_loop

  from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
- from azure.core import CaseInsensitiveEnumMeta


- class RougeType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+ class RougeType(Enum):
      """
      Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
      """
@@ -38,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
          self._rouge_type = rouge_type

      async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
-         metrics = scorer.score(ground_truth, response)[self._rouge_type]
+         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+         metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
          return {
              "rouge_precision": metrics.precision,
              "rouge_recall": metrics.recall,
@@ -49,34 +48,34 @@

  class RougeScoreEvaluator:
      """
-     Evaluator for computes the ROUGE scores between two strings.
+     Calculates the ROUGE score for a given response and ground truth.

-     ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic
-     summarization and machine translation. It measures the overlap between generated text and reference summaries.
-     ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text
-     summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text
-     coherence and relevance are critical.
+     The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
+     generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
+     ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
+     the generated text matches the reference text. Rouge type options are "rouge1" (Unigram overlap), "rouge2"
+     (Bigram overlap), "rouge3" (Trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), "rougeL"
+     (L-graph overlap)

-     **Usage**
+     Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
+     other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
+     information from the reference text.

-     .. code-block:: python
+     ROUGE scores range from 0 to 1, with higher scores indicating better quality.

-         eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
-         result = eval_fn(
-             response="Tokyo is the capital of Japan.",
-             ground_truth="The capital of Japan is Tokyo.")
+     .. admonition:: Example:

-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "rouge_precision": 1.0,
-             "rouge_recall": 1.0,
-             "rouge_f1_score": 1.0
-         }
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START rouge_score_evaluator]
+             :end-before: [END rouge_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
      """

+     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
      def __init__(self, rouge_type: RougeType):
          self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)

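Because RougeType no longer subclasses str (the CaseInsensitiveEnumMeta base is dropped), the evaluator now passes rouge_type.value to the vendored scorer; the caller-facing surface is unchanged. A minimal sketch, reusing the example carried by the docstring this diff removes:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # The enum member is still what you pass in; the .value lookup happens internally.
    rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
    result = rouge_eval(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Expected keys: "rouge_precision", "rouge_recall", "rouge_f1_score", each in [0, 1].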
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -1,22 +1,26 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Optional, Dict
- from typing_extensions import override
+ from typing import List, Union, Dict
+ from typing_extensions import overload, override

  from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation


  @experimental
- class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
      """
-     Initialize a Groundedness Pro evaluator for determine if the response is grounded
-     in the query and context.
+     Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+     including reasoning.

-     If this evaluator is supplied to the `evaluate` function, the aggregated metric
-     for the groundedness pro label will be "groundedness_pro_passing_rate".
+     The groundedness measure calls Azure AI Evaluation service to assess how well the AI-generated answer is grounded
+     in the source context. Even if the responses from LLM are factually correct, they'll be considered ungrounded if
+     they can't be verified against the provided sources (such as your input source or your database).
+
+     Service-based groundedness scores are boolean values, where True indicates that the response is grounded.

      :param credential: The credential for connecting to Azure AI project. Required
      :type credential: ~azure.core.credentials.TokenCredential
@@ -26,64 +30,24 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
      :param kwargs: Additional arguments to pass to the evaluator.
      :type kwargs: Any

-     **Usage**
-
-     .. code-block:: python
-
-         azure_ai_project = {
-             "subscription_id": "<subscription_id>",
-             "resource_group_name": "<resource_group_name>",
-             "project_name": "<project_name>",
-         }
-         credential = DefaultAzureCredential()
-
-         eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
-         result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
-
-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "groundedness_pro_label": True,
-             "reason": "'All Contents are grounded"
-         }
-
-     **Usage with conversation input**
-
-     .. code-block:: python
+     .. admonition:: Example:

-         azure_ai_project = {
-             "subscription_id": "<subscription_id>",
-             "resource_group_name": "<resource_group_name>",
-             "project_name": "<project_name>",
-         }
-         credential = DefaultAzureCredential()
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START groundedness_pro_evaluator]
+             :end-before: [END groundedness_pro_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

-         eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
-         conversation = {
-             "messages": [
-                 {"role": "user", "content": "What is the capital of France?"},
-                 {"role": "assistant", "content": "Paris.", "context": "Paris."}
-                 {"role": "user", "content": "What is the capital of Germany?"},
-                 {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
-             ]
-         }
-         result = eval_fn(conversation=conversation)
+     .. note::

-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "groundedness_pro_label": 1.0,
-             "evaluation_per_turn": {
-                 "groundedness_pro_label": [True, True],
-                 "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
-             }
-         }
+         If this evaluator is supplied to the `evaluate` function, the aggregated metric
+         for the groundedness pro label will be "groundedness_pro_passing_rate".
      """

+     id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
      @override
      def __init__(
          self,
@@ -91,7 +55,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
          azure_ai_project,
          **kwargs,
      ):
-         self._passing_score = 3 # TODO update once the binarization PR is merged
+         self._passing_score = 5 # TODO update once the binarization PR is merged
          self._output_prefix = "groundedness_pro"
          super().__init__(
              eval_metric=EvaluationMetrics.GROUNDEDNESS,
@@ -100,14 +64,48 @@
              **kwargs,
          )

-     @override
+     @overload
+     def __call__(
+         self,
+         *,
+         response: str,
+         context: str,
+         query: str,
+     ) -> Dict[str, Union[str, bool]]:
+         """Evaluate groundedness for a given query/response/context
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context to be evaluated.
+         :paramtype context: str
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :return: The relevance score.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @overload
      def __call__(
          self,
          *,
-         query: Optional[str] = None,
-         response: Optional[str] = None,
-         context: Optional[str] = None,
-         conversation=None,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+         """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has
+         more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+         available in the output under the "evaluation_per_turn" key.
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The relevance score.
+         :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+         """
+
+     @override
+     def __call__( # pylint: disable=docstring-missing-param
+         self,
+         *args,
          **kwargs,
      ):
          """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a
@@ -128,7 +126,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
          :return: The relevance score.
          :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
          """
-         return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+         return super().__call__(*args, **kwargs)

      @override
      async def _do_eval(self, eval_input: Dict):
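These hunks replace the inline docstring samples with literalinclude directives, add typed overloads, route the public __call__ through an *args/**kwargs pass-through, and bump _passing_score from 3 to 5. A minimal single-turn sketch, assuming keyword construction with a credential and an Azure AI project dict (placeholder values taken from the removed docstring example):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    # Placeholder Azure AI project details; substitute your own values.
    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    groundedness_pro = GroundednessProEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    # Single-turn overload: returns a boolean "groundedness_pro_label" together with a reason.
    result = groundedness_pro(
        query="What is the capital of France?",
        response="Paris.",
        context="Paris is the capital of France.",
    )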
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -80,36 +80,42 @@ class _AsyncSimilarityEvaluator:

  class SimilarityEvaluator:
      """
-     Initialize a similarity evaluator configured for a specific Azure OpenAI model.
+     Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.

-     :param model_config: Configuration for the Azure OpenAI model.
-     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-         ~azure.ai.evaluation.OpenAIModelConfiguration]
+     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+     the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+     the semantic meaning and context of the sentences.

-     **Usage**
+     Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+     tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+     text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.

-     .. code-block:: python
+     Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.

-         eval_fn = SimilarityEvaluator(model_config)
-         result = eval_fn(
-             query="What is the capital of Japan?",
-             response="The capital of Japan is Tokyo.",
-             ground_truth="Tokyo is Japan's capital.")
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]

-     **Output format**
+     .. admonition:: Example:

-     .. code-block:: python
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START rouge_score_evaluator]
+             :end-before: [END rouge_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

-         {
-             "similarity": 3.0,
-             "gpt_similarity": 3.0,
-         }
+     .. note::

-     Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-     To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
      """

+     id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
      def __init__(self, model_config):
          self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
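SimilarityEvaluator keeps wrapping the unchanged _AsyncSimilarityEvaluator, so only the docstring and the cloud evaluator id move in this hunk (note that the new Example admonition in the released file points at the rouge_score_evaluator sample markers rather than a similarity-specific sample). A minimal call sketch based on the usage block the old docstring carried, with placeholder model configuration values:

    from azure.ai.evaluation import SimilarityEvaluator

    # Placeholder Azure OpenAI model configuration; substitute real endpoint/deployment values.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    similarity_eval = SimilarityEvaluator(model_config)
    result = similarity_eval(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Output carries both "similarity" and the legacy "gpt_similarity" key.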