azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -2,70 +2,101 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import os
- from typing import Optional
+ from typing import Dict, Union, List

- from typing_extensions import override
+ from typing_extensions import overload, override

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation


- class CoherenceEvaluator(PromptyEvaluatorBase):
+ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """
- Initialize a coherence evaluator configured for a specific Azure OpenAI model.
+ Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+ The coherence measure assesses the ability of the language model to generate text that reads naturally,
+ flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+ and user-friendliness of a model's generated responses in real-world applications.

  :param model_config: Configuration for the Azure OpenAI model.
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
  ~azure.ai.evaluation.OpenAIModelConfiguration]

- **Usage**
-
- .. code-block:: python
-
- eval_fn = CoherenceEvaluator(model_config)
- result = eval_fn(
- query="What is the capital of Japan?",
- response="The capital of Japan is Tokyo.")
+ .. admonition:: Example:

- **Output format**
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+ :start-after: [START coherence_evaluator]
+ :end-before: [END coherence_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call a CoherenceEvaluator with a query and response.

- .. code-block:: python
+ .. note::

- {
- "coherence": 1.0,
- "gpt_coherence": 1.0,
- }
-
- Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
- To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
- however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+ To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+ To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+ however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
  """

  _PROMPTY_FILE = "coherence.prompty"
  _RESULT_KEY = "coherence"

+ id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
  @override
  def __init__(self, model_config):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

- @override
+ @overload
+ def __call__(
+ self,
+ *,
+ query: str,
+ response: str,
+ ) -> Dict[str, Union[str, float]]:
+ """Evaluate coherence for given input of query, response
+
+ :keyword query: The query to be evaluated.
+ :paramtype query: str
+ :keyword response: The response to be evaluated.
+ :paramtype response: str
+ :return: The coherence score.
+ :rtype: Dict[str, float]
+ """
+
+ @overload
  def __call__(
  self,
  *,
- query: Optional[str] = None,
- response: Optional[str] = None,
- conversation=None,
+ conversation: Conversation,
+ ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+ """Evaluate coherence for a conversation
+
+ :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected
+ to be dictionaries with keys "content", "role", and possibly "context".
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+ :return: The coherence score.
+ :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+ """
+
+ @override
+ def __call__( # pylint: disable=docstring-missing-param
+ self,
+ *args,
  **kwargs,
  ):
  """Evaluate coherence. Accepts either a query and response for a single evaluation,
  or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
  turns, the evaluator will aggregate the results of each turn.

+ :keyword query: The query to be evaluated.
+ :paramtype query: str
  :keyword response: The response to be evaluated.
  :paramtype response: Optional[str]
- :keyword context: The context to be evaluated.
- :paramtype context: Optional[str]
  :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
  key "messages". Conversation turns are expected
  to be dictionaries with keys "content" and "role".
@@ -73,4 +104,4 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
  :return: The relevance score.
  :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
  """
- return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+ return super().__call__(*args, **kwargs)
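Taken together, these changes replace the old keyword-only signature with typed overloads: CoherenceEvaluator can now be called with a query/response pair or with a whole conversation. A minimal usage sketch (not taken from the package samples; the endpoint, key, and deployment values are placeholders):

    from azure.ai.evaluation import CoherenceEvaluator

    # Placeholder AzureOpenAIModelConfiguration values; substitute real ones.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    coherence = CoherenceEvaluator(model_config)

    # Single-turn overload: query + response keyword arguments.
    single_turn = coherence(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
    )

    # Conversation overload: a dict with a "messages" list of role/content turns.
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "The capital of Japan is Tokyo."},
        ]
    }
    aggregated = coherence(conversation=conversation)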
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -7,11 +7,12 @@ from abc import ABC, abstractmethod
  from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final

  from promptflow._utils.async_utils import async_run_allowing_running_loop
- from typing_extensions import ParamSpec, TypeAlias
+ from typing_extensions import ParamSpec, TypeAlias, get_overloads

  from azure.ai.evaluation._common.math import list_mean
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._common.utils import remove_optional_singletons
+ from azure.ai.evaluation._model_configurations import Conversation

  P = ParamSpec("P")
  T = TypeVar("T")
@@ -88,7 +89,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  # This needs to be overridden just to change the function header into something more informative,
  # and to be able to add a more specific docstring. The actual function contents should just be
  # super().__call__(<inputs>)
- def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+ def __call__( # pylint: disable=docstring-missing-param
+ self,
+ *args,
+ **kwargs,
+ ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
  one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
  The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -127,11 +132,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  :rtype: List[str]
  """

+ overloads = get_overloads(self.__call__)
+ if not overloads:
+ call_signatures = [inspect.signature(self.__call__)]
+ else:
+ call_signatures = [inspect.signature(overload) for overload in overloads]
  call_signature = inspect.signature(self.__call__)
  singletons = []
- for param in call_signature.parameters:
- if param not in self._not_singleton_inputs:
- singletons.append(param)
+ for call_signature in call_signatures:
+ params = call_signature.parameters
+ if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+ continue
+ # exclude self since it is not a singleton input
+ singletons.extend([p for p in params if p != "self"])
  return singletons

  def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
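The rewritten _derive_singleton_inputs now gathers parameter names from the registered __call__ overloads instead of the single concrete signature. A standalone sketch of that introspection pattern, using a hypothetical Demo class rather than the real evaluator hierarchy:

    import inspect
    from typing_extensions import get_overloads, overload

    class Demo:
        @overload
        def __call__(self, *, query: str, response: str) -> dict: ...

        @overload
        def __call__(self, *, conversation: dict) -> dict: ...

        def __call__(self, *args, **kwargs):
            # Concrete implementation; the overloads above only describe call shapes.
            return {}

    names = set()
    for overload_fn in get_overloads(Demo.__call__):
        names.update(p for p in inspect.signature(overload_fn).parameters if p != "self")
    print(sorted(names))  # ['conversation', 'query', 'response']

    # The library version additionally skips any overload whose parameters include a
    # non-singleton input such as "conversation", so only query/response survive there.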
@@ -190,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

  return converter

+ def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+ """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+ This uses the inputs derived from the _derive_singleton_inputs function to determine which
+ aspects of a conversation ought to be extracted.
+
+ :return: The function that will be used to convert conversations to evaluable inputs.
+ :rtype: Callable
+ """
+
+ def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+ messages = cast(List[Dict[str, Any]], conversation["messages"])
+ # Extract user messages, assistant messages from conversation
+ user_messages: List[Dict[str, Any]] = []
+ assistant_messages: List[Dict[str, Any]] = []
+ system_messages: List[Dict[str, Any]] = []
+
+ # Convert conversation slice into queries and responses.
+ # Assume that 'user' role is asking queries and 'assistant' role is responding.
+ if self._eval_last_turn and len(messages) > 1:
+ messages = messages[-2:]
+
+ for each_turn in messages:
+ role = each_turn["role"]
+ if role == "user":
+ user_messages.append(each_turn)
+ elif role == "assistant":
+ assistant_messages.append(each_turn)
+ elif role == "system":
+ system_messages.append(each_turn)
+
+ # validation
+ if len(user_messages) != len(assistant_messages):
+ raise EvaluationException(
+ message="Mismatched number of user and assistant messages.",
+ internal_message=("Mismatched number of user and assistant messages."),
+ )
+ if len(assistant_messages) > 1:
+ raise EvaluationException(
+ message="Conversation can have only one assistant message.",
+ internal_message=("Conversation can have only one assistant message."),
+ )
+ eval_conv_inputs = []
+ for user_msg, assist_msg in zip(user_messages, assistant_messages):
+ conv_messages = []
+ if len(system_messages) == 1:
+ conv_messages.append(system_messages[0])
+ conv_messages.append(user_msg)
+ conv_messages.append(assist_msg)
+ eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+ return eval_conv_inputs
+
+ return multi_modal_converter
+
  def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
  """Convert an arbitrary input into a list of inputs for evaluators.
  It is assumed that evaluators generally make use of their inputs in one of two ways.
@@ -198,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  values.

  The self._singleton_inputs list assigned during initialization is used to find and extract
- singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+ singleton keywords, and self._allow_conversation_input is used to determine if a conversation
  is a valid input.

  If both conversations and singletons are allowed, the function will raise an exception if both
@@ -229,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  )
  # Handle Conversation
  if conversation is not None:
+ if self._is_multi_modal_conversation(conversation):
+ return self._derive_multi_modal_conversation_converter()(conversation)
  return self._derive_conversation_converter()(conversation)
  # Handle Singletons
  required_singletons = remove_optional_singletons(self, singletons)
@@ -243,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  target=ErrorTarget.CONVERSATION,
  )

+ def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+ if "messages" not in conversation:
+ return False
+ messages = conversation["messages"]
+ if not isinstance(messages, list):
+ return False
+ for message in messages:
+ if "content" in message:
+ content = message.get("content", "")
+ if isinstance(content, list):
+ if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+ return True
+ return False
+
  def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
  """Aggregate the evaluation results of each conversation turn into a single result.

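The new _is_multi_modal_conversation helper decides routing purely from message shape: a conversation counts as multi-modal when some message's "content" is a list containing an image_url item that carries a "url". A hypothetical payload that would take the multi-modal path (the URL and text are invented):

    multimodal_conversation = {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": "You are an AI assistant."}]},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this picture."},
                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                ],
            },
            {"role": "assistant", "content": [{"type": "text", "text": "A cat sitting on a mat."}]},
        ]
    }
    # For input like this, _convert_kwargs_to_eval_input routes through
    # _derive_multi_modal_conversation_converter, yielding one
    # {"conversation": Conversation(messages=[system, user, assistant])} eval input.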
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,12 +4,13 @@

  import math
  import re
- from typing import Dict, Union
+ from typing import Dict, TypeVar, Union

  from promptflow.core import AsyncPrompty
  from typing_extensions import override

  from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
  from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
  from . import EvaluatorBase

@@ -18,8 +19,10 @@ try:
  except ImportError:
  USER_AGENT = "None"

+ T = TypeVar("T")

- class PromptyEvaluatorBase(EvaluatorBase[float]):
+
+ class PromptyEvaluatorBase(EvaluatorBase[T]):
  """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
  make use of a prompty file, and return their results as a dictionary, with a single key-value pair
  linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -45,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
  self._prompty_file = prompty_file
  super().__init__(eval_last_turn=eval_last_turn)

+ subclass_name = self.__class__.__name__
+ user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
  prompty_model_config = construct_prompty_model_config(
  validate_model_config(model_config),
  self._DEFAULT_OPEN_API_VERSION,
- USER_AGENT,
+ user_agent,
  )

  self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -67,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
  :return: The evaluation result.
  :rtype: Dict
  """
+ if "query" not in eval_input and "response" not in eval_input:
+ raise EvaluationException(
+ message="Only text conversation inputs are supported.",
+ internal_message="Only text conversation inputs are supported.",
+ blame=ErrorBlame.USER_ERROR,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.CONVERSATION,
+ )
  llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

  score = math.nan
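The user-agent change simply tags outgoing prompty calls with the concrete evaluator subclass. A quick illustration of the resulting string, assuming a hypothetical base USER_AGENT value:

    USER_AGENT = "azure-ai-evaluation/1.1.0"  # hypothetical base value
    subclass_name = "CoherenceEvaluator"
    user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
    # -> "azure-ai-evaluation/1.1.0 (type=evaluator subtype=CoherenceEvaluator)"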
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -1,7 +1,7 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Dict, Optional, Union
+ from typing import Dict, TypeVar, Union

  from typing_extensions import override

@@ -11,14 +11,15 @@ from azure.ai.evaluation._common.constants import (
  Tasks,
  _InternalAnnotationTasks,
  )
- from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import EvaluationException
+ from azure.ai.evaluation._common.utils import validate_conversation
  from azure.core.credentials import TokenCredential

  from . import EvaluatorBase

- T = Union[str, float]
+ T = TypeVar("T")


  class RaiServiceEvaluatorBase(EvaluatorBase[T]):
@@ -50,12 +51,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  self._credential = credential

  @override
- def __call__(
+ def __call__( # pylint: disable=docstring-missing-param
  self,
- *,
- query: Optional[str] = None,
- response: Optional[str] = None,
- conversation=None,
+ *args,
  **kwargs,
  ):
  """Evaluate either a query and response or a conversation. Must supply either a query AND response,
@@ -71,7 +69,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
  :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
  """
- return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+ return super().__call__(*args, **kwargs)

  @override
  async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
@@ -84,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  :return: The evaluation result.
  :rtype: Dict
  """
+ if "query" in eval_input and "response" in eval_input:
+ return await self._evaluate_query_response(eval_input)
+
+ conversation = eval_input.get("conversation", None)
+ return await self._evaluate_conversation(conversation)
+
+ async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+ """
+ Evaluates content according to this evaluator's metric.
+ :keyword conversation: The conversation contains list of messages to be evaluated.
+ Each message should have "role" and "content" keys.
+
+ :param conversation: The conversation to evaluate.
+ :type conversation: ~azure.ai.evaluation.Conversation
+ :return: The evaluation score computation based on the Content Safety metric (self.metric).
+ :rtype: Dict[str, Union[float, str]]
+ """
+ # validate inputs
+ validate_conversation(conversation)
+ messages = conversation["messages"]
+ # Run score computation based on supplied metric.
+ result = await evaluate_with_rai_service_multimodal(
+ messages=messages,
+ metric_name=self._eval_metric,
+ project_scope=self._azure_ai_project,
+ credential=self._credential,
+ )
+ return result
+
+ async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
  query = eval_input.get("query", None)
  response = eval_input.get("response", None)
  if query is None or response is None:
@@ -108,12 +136,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  )
  input_data["context"] = context

- return await evaluate_with_rai_service(
+ return await evaluate_with_rai_service( # type: ignore
  metric_name=self._eval_metric,
  data=input_data,
  project_scope=self._azure_ai_project,
  credential=self._credential,
  annotation_task=self._get_task(),
+ evaluator_name=self.__class__.__name__,
  )

  def _get_task(self):
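With _do_eval split this way, RAI-service-backed evaluators dispatch on input shape: a query/response pair goes to the existing text annotation path, while a multi-modal conversation goes to the new multimodal endpoint. A hedged sketch using ViolenceEvaluator as a representative subclass; the project values are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }
    violence = ViolenceEvaluator(DefaultAzureCredential(), azure_ai_project)

    # Text input: _evaluate_query_response -> evaluate_with_rai_service.
    text_result = violence(query="Describe the scene.", response="A quiet park at dawn.")

    # Image-bearing conversation: _evaluate_conversation -> evaluate_with_rai_service_multimodal.
    image_conversation = {
        "messages": [
            {"role": "user", "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ]},
            {"role": "assistant", "content": [{"type": "text", "text": "A city skyline."}]},
        ]
    }
    conversation_result = violence(conversation=image_conversation)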
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -2,13 +2,14 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  from concurrent.futures import as_completed
- from typing import Callable, Dict, List, Optional, Union
+ from typing import Callable, Dict, List, Union

  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
- from typing_extensions import override
+ from typing_extensions import overload, override

- from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._common._experimental import experimental

  from ._hate_unfairness import HateUnfairnessEvaluator
  from ._self_harm import SelfHarmEvaluator
@@ -17,61 +18,36 @@ from ._violence import ViolenceEvaluator


  @experimental
- class ContentSafetyEvaluator(EvaluatorBase):
+ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
  """
- Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
+ Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
  :param azure_ai_project: The scope of the Azure AI project.
  It contains subscription id, resource group, and project name.
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
- :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
- :type eval_last_turn: bool
  :param kwargs: Additional arguments to pass to the evaluator.
  :type kwargs: Any
  :return: A function that evaluates content-safety metrics for "question-answering" scenario.
- :rtype: Callable
-
- **Usage**
-
- .. code-block:: python
-
- azure_ai_project = {
- "subscription_id": "<subscription_id>",
- "resource_group_name": "<resource_group_name>",
- "project_name": "<project_name>",
- }
- eval_fn = ContentSafetyEvaluator(azure_ai_project)
- result = eval_fn(
- query="What is the capital of France?",
- response="Paris.",
- )
-
- **Output format**
-
- .. code-block:: python
-
- {
- "violence": "Medium",
- "violence_score": 5.0,
- "violence_reason": "Some reason",
- "sexual": "Medium",
- "sexual_score": 5.0,
- "sexual_reason": "Some reason",
- "self_harm": "Medium",
- "self_harm_score": 5.0,
- "self_harm_reason": "Some reason",
- "hate_unfairness": "Medium",
- "hate_unfairness_score": 5.0,
- "hate_unfairness_reason": "Some reason"
- }
+
+ .. admonition:: Example:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+ :start-after: [START content_safety_evaluator]
+ :end-before: [END content_safety_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call a ContentSafetyEvaluator.
  """

+ id = "content_safety"
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
  # TODO address 3579092 to re-enabled parallel evals.
- def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
- super().__init__(eval_last_turn=eval_last_turn)
- self._parallel = kwargs.pop("parallel", False)
+ def __init__(self, credential, azure_ai_project, **kwargs):
+ super().__init__()
+ self._parallel = kwargs.pop("_parallel", True)
  self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
  ViolenceEvaluator(credential, azure_ai_project),
  SexualEvaluator(credential, azure_ai_project),
@@ -79,13 +55,43 @@ class ContentSafetyEvaluator(EvaluatorBase):
  HateUnfairnessEvaluator(credential, azure_ai_project),
  ]

- @override
+ @overload
+ def __call__(
+ self,
+ *,
+ query: str,
+ response: str,
+ ) -> Dict[str, Union[str, float]]:
+ """Evaluate a collection of content safety metrics for the given query/response pair
+
+ :keyword query: The query to be evaluated.
+ :paramtype query: str
+ :keyword response: The response to be evaluated.
+ :paramtype response: str
+ :return: The content safety scores.
+ :rtype: Dict[str, Union[str, float]]
+ """
+
+ @overload
  def __call__(
  self,
  *,
- query: Optional[str] = None,
- response: Optional[str] = None,
- conversation=None,
+ conversation: Conversation,
+ ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+ """Evaluate a collection of content safety metrics for a conversation
+
+ :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected
+ to be dictionaries with keys "content", "role", and possibly "context".
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+ :return: The content safety scores.
+ :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
+ """
+
+ @override
+ def __call__( # pylint: disable=docstring-missing-param
+ self,
+ *args,
  **kwargs,
  ):
  """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
@@ -100,9 +106,9 @@ class ContentSafetyEvaluator(EvaluatorBase):
  to be dictionaries with keys "content", "role", and possibly "context".
  :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
  :return: The evaluation result.
- :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
+ :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
  """
- return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+ return super().__call__(*args, **kwargs)

  @override
  async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
@@ -124,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase):
  with ThreadPoolExecutor() as executor:
  # pylint: disable=no-value-for-parameter
  futures = {
- executor.submit(query=query, response=response, conversation=conversation): evaluator
+ executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
  for evaluator in self._evaluators
  }
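For reference, the constructor change means callers now pass a credential and there is no eval_last_turn parameter, and the four sub-evaluators run in parallel by default (controlled by the private "_parallel" kwarg). A hedged usage sketch with placeholder project values, mirroring the example removed from the docstring:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    safety = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = safety(query="What is the capital of France?", response="Paris.")
    # Per the old docstring's output example, expect keys of the form
    # "<metric>", "<metric>_score", and "<metric>_reason" for violence, sexual,
    # self_harm, and hate_unfairness.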