azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries; the information is provided for informational purposes only.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic.

Files changed (93)
  1. azure/ai/evaluation/__init__.py +23 -1
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
  3. azure/ai/evaluation/_common/constants.py +9 -2
  4. azure/ai/evaluation/_common/math.py +29 -0
  5. azure/ai/evaluation/_common/rai_service.py +222 -93
  6. azure/ai/evaluation/_common/utils.py +328 -19
  7. azure/ai/evaluation/_constants.py +16 -8
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
  10. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
  11. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  13. azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
  14. azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
  16. azure/ai/evaluation/_evaluate/_utils.py +77 -25
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
  19. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  20. azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
  21. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
  22. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
  24. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
  25. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
  26. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
  27. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
  28. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
  29. azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
  30. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  31. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
  32. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  33. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  34. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
  35. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  46. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
  47. azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
  48. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
  49. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  50. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
  51. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  52. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  53. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  55. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
  56. azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
  57. azure/ai/evaluation/_exceptions.py +26 -6
  58. azure/ai/evaluation/_http_utils.py +203 -132
  59. azure/ai/evaluation/_model_configurations.py +23 -6
  60. azure/ai/evaluation/_vendor/__init__.py +3 -0
  61. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  62. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  63. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  64. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  65. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  66. azure/ai/evaluation/_version.py +1 -1
  67. azure/ai/evaluation/simulator/__init__.py +2 -1
  68. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  69. azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
  70. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  71. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  72. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  73. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  74. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  75. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  76. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
  78. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  80. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  81. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  82. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
  83. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  84. azure/ai/evaluation/simulator/_simulator.py +222 -169
  85. azure/ai/evaluation/simulator/_tracing.py +4 -4
  86. azure/ai/evaluation/simulator/_utils.py +6 -6
  87. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
  88. azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
  89. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  90. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  91. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  92. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
  93. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -2,19 +2,56 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- from typing import List, Dict, Callable, Any
  import inspect
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final

- from abc import ABC
-
- import numpy as np
  from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from typing_extensions import ParamSpec, TypeAlias
+
+ from azure.ai.evaluation._common.math import list_mean
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._common.utils import remove_optional_singletons
+
+ P = ParamSpec("P")
+ T = TypeVar("T")
+ T_EvalValue = TypeVar("T_EvalValue")
+
+
+ class DerivedEvalInput(TypedDict, total=False):
+ """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+ query: Dict[str, Any]
+ response: Dict[str, Any]
+ context: str
+
+
+ AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+ """TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+ .. code-block:: python

- from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ foo: AggregateResult[float] = {
+ "evaluation_per_turn": {
+ "coherence": [1.0, 2.0, 3.0]
+ },
+ "coherence": 2.0
+ }
+ """
+
+ DoEvalResult: TypeAlias = Dict[str, T]
+ """TypeAlias that models the return value of EvaluatorBase._do_eval
+
+ .. code-block:: python
+
+ foo: DoEvalResult[float] = {
+ "coherence": 2.0
+ }
+ """


  # TODO exception target pass down?
- class EvaluatorBase(ABC):
+ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  """Base class for all evaluators that are capable of accepting either a group of single values,
  or conversation as input. All such evaluators need to implement two functions of their own:
  - _convert_conversation_to_eval_input
@@ -51,7 +88,7 @@ class EvaluatorBase(ABC):
  # This needs to be overridden just to change the function header into something more informative,
  # and to be able to add a more specific docstring. The actual function contents should just be
  # super().__call__(<inputs>)
- def __call__(self, **kwargs) -> Dict:
+ def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
  one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
  The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -60,13 +97,12 @@ class EvaluatorBase(ABC):
  :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
  :type kwargs: Dict
  :return: The evaluation result
- :rtype: Dict
+ :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
  """
  return async_run_allowing_running_loop(self._async_evaluator, **kwargs)

- # Probably the only thing that can't be simplified. Each evaluator, or at least each family
- # of evaluators, will need to implement their own version of this function.
- async def _do_eval(self, eval_input: Any) -> Dict:
+ @abstractmethod
+ async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
  """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
  In the default case, all required inputs are assumed to be within eval_input, as user-friendly
  typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -75,13 +111,8 @@ class EvaluatorBase(ABC):
  :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
  :type eval_input: Any
  :return: A single evaluation result
- :rtype: Dict
-
+ :rtype: DoEvalResult[T_EvalValue]
  """
- raise EvaluationException(
- message="Not implemented",
- internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
- )

  # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -103,7 +134,7 @@ class EvaluatorBase(ABC):
  singletons.append(param)
  return singletons

- def _derive_conversation_converter(self) -> Callable:
+ def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
  """Produce the function that will be used to convert conversations to a list of evaluable inputs.
  This uses the inputs derived from the _derive_singleton_inputs function to determine which
  aspects of a conversation ought to be extracted.
@@ -115,12 +146,12 @@ class EvaluatorBase(ABC):
  include_query = "query" in self._singleton_inputs
  include_response = "response" in self._singleton_inputs

- def converter(conversation: Dict) -> List:
- messages = conversation["messages"]
+ def converter(conversation: Dict) -> List[DerivedEvalInput]:
+ messages = cast(List[Dict[str, Any]], conversation["messages"])
  global_context = conversation.get("context", None)
  # Extract queries, responses from conversation
- queries = []
- responses = []
+ queries: List[Dict[str, Any]] = []
+ responses: List[Dict[str, Any]] = []

  # Convert conversation slice into queries and responses.
  # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -142,16 +173,16 @@ class EvaluatorBase(ABC):
  response_context = response.get("context", None)
  if global_context:
  context["global_context"] = global_context
- if query_context and not include_query:
+ if query_context and include_query:
  context["query_context"] = query_context
- if response_context and not include_response:
+ if response_context and include_response:
  context["response_context"] = response_context

- eval_input = {}
+ eval_input: DerivedEvalInput = {}
  if include_query:
- eval_input["query"] = query
+ eval_input["query"] = query.get("content", "")
  if include_response:
- eval_input["response"] = response
+ eval_input["response"] = response.get("content", "")
  if include_context:
  eval_input["context"] = str(context)
  eval_inputs.append(eval_input)
@@ -159,7 +190,7 @@ class EvaluatorBase(ABC):

  return converter

- def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
  """Convert an arbitrary input into a list of inputs for evaluators.
  It is assumed that evaluators generally make use of their inputs in one of two ways.
  Either they receive a collection of keyname inputs that are all single values
@@ -189,9 +220,9 @@ class EvaluatorBase(ABC):
  singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
  # Check that both conversation and other inputs aren't set
  if conversation is not None and any(singletons.values()):
+ msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
  raise EvaluationException(
- message="Invalid input",
- internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+ message=msg,
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.CONVERSATION,
@@ -200,18 +231,19 @@ class EvaluatorBase(ABC):
  if conversation is not None:
  return self._derive_conversation_converter()(conversation)
  # Handle Singletons
- if all(value is not None for value in singletons.values()):
- return [singletons] # TODO loosen requirements to allow for optional singletons?
+ required_singletons = remove_optional_singletons(self, singletons)
+ if all(value is not None for value in required_singletons.values()):
+ return [singletons]
  # Missing input
+ msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
  raise EvaluationException(
- message="Missing input",
- internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+ message=msg,
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.CONVERSATION,
  )

- def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict:
+ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
  """Aggregate the evaluation results of each conversation turn into a single result.

  Exact implementation might need to vary slightly depending on the results produced.
@@ -224,11 +256,11 @@ class EvaluatorBase(ABC):
  values (including non-numerics) located in under the "evaluation_per_turn" key,
  which each sub-key being a metric and each sub-value being a the list of that metric's
  per-turn values.
- :rtype: Dict
+ :rtype: AggregateResult[T_EvalValue]
  """

- aggregated = {}
- evaluation_per_turn = {}
+ aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+ evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}

  # Go over each turn, and rotate the results into a
  # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,19 +273,18 @@ class EvaluatorBase(ABC):
  # Find and average all numeric values
  for metric, values in evaluation_per_turn.items():
  if all(isinstance(value, (int, float)) for value in values):
- aggregated[metric] = np.mean(values)
+ aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
  # Slap the per-turn results back in.
  aggregated["evaluation_per_turn"] = evaluation_per_turn
-
  return aggregated

- async def _real_call(self, **kwargs):
+ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """The asynchronous call where real end-to-end evaluation logic is performed.

  :keyword kwargs: The inputs to evaluate.
  :type kwargs: Dict
  :return: The evaluation result.
- :rtype: Dict
+ :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
  """
  # Convert inputs into list of evaluable inputs.
  eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
@@ -270,9 +301,8 @@ class EvaluatorBase(ABC):
  # Otherwise, aggregate results.
  return self._aggregate_results(per_turn_results=per_turn_results)

- # ~~~ METHODS THAT SHOULD NEVER BE OVERRIDDEN BY CHILDREN~~~
-
- def _to_async(self):
+ @final
+ def _to_async(self) -> "AsyncEvaluatorBase":
  return self._async_evaluator


@@ -286,7 +316,7 @@ class AsyncEvaluatorBase:

  # Don't look at my shame. Nothing to see here....
  # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
- # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+ # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
  # are just not passed into this function instead of ending up in kwargs.
  # Since we want this to be relatively call-agnostic, we just account for every input that any children
  # are known to throw at this, mash them into kwargs, and then pass them into the real call.
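
Taken together, these hunks turn EvaluatorBase into a generic, partially abstract base: _do_eval is now an @abstractmethod returning DoEvalResult[T_EvalValue], and conversation inputs are reduced to DerivedEvalInput dictionaries before evaluation. A minimal sketch of a subclass under the new contract is below; the import path, the eval_last_turn keyword, and the evaluator name are illustrative assumptions, since these are private classes rather than a documented extension point.

# Illustrative sketch only (not from the diff): a toy subclass under the new
# generic/abstract contract. Import path and constructor keyword are assumptions.
from typing import Any, Dict, Optional

from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase


class ResponseLengthEvaluator(EvaluatorBase[float]):
    """Toy evaluator that scores a response by its character count."""

    def __init__(self, *, eval_last_turn: bool = False):
        super().__init__(eval_last_turn=eval_last_turn)

    # Overridden only to expose named keyword inputs, as the base-class comments
    # recommend; the body defers to the shared conversation/singleton handling.
    def __call__(self, *, response: Optional[str] = None, conversation=None, **kwargs):
        return super().__call__(response=response, conversation=conversation, **kwargs)

    # Required now that _do_eval is an @abstractmethod; must return a
    # DoEvalResult[float], i.e. a Dict[str, float].
    async def _do_eval(self, eval_input: Dict[str, Any]) -> Dict[str, float]:
        response = eval_input.get("response") or ""
        return {"response_length": float(len(response))}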
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -2,26 +2,24 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

+ import math
  import re
- from typing import Dict
-
- from typing_extensions import override
-
-
- import numpy as np
+ from typing import Dict, Union

  from promptflow.core import AsyncPrompty
+ from typing_extensions import override

- from ..._common.utils import construct_prompty_model_config
+ from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+ from . import EvaluatorBase

  try:
  from ..._user_agent import USER_AGENT
  except ImportError:
- USER_AGENT = None
- from . import EvaluatorBase
+ USER_AGENT = "None"


- class PromptyEvaluatorBase(EvaluatorBase):
+ class PromptyEvaluatorBase(EvaluatorBase[float]):
  """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
  make use of a prompty file, and return their results as a dictionary, with a single key-value pair
  linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -39,17 +37,17 @@ class PromptyEvaluatorBase(EvaluatorBase):
  :type ignore_queries: bool
  """

- LLM_CALL_TIMEOUT = 600
- DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+ _LLM_CALL_TIMEOUT = 600
+ _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

- def __init__(self, *, result_key: str, prompty_file: str, model_config: Dict, eval_last_turn: bool = False):
+ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
  self._result_key = result_key
  self._prompty_file = prompty_file
  super().__init__(eval_last_turn=eval_last_turn)

  prompty_model_config = construct_prompty_model_config(
- model_config,
- self.DEFAULT_OPEN_API_VERSION,
+ validate_model_config(model_config),
+ self._DEFAULT_OPEN_API_VERSION,
  USER_AGENT,
  )

@@ -59,7 +57,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
  # defining a default here.

  @override
- async def _do_eval(self, eval_input: Dict) -> Dict:
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
  """Do a relevance evaluation.

  :param eval_input: The input to the evaluator. Expected to contain
@@ -69,11 +67,20 @@ class PromptyEvaluatorBase(EvaluatorBase):
  :return: The evaluation result.
  :rtype: Dict
  """
- llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
+ llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

- score = np.nan
+ score = math.nan
  if llm_output:
+ # Parse out score and reason from evaluators known to possess them.
+ if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+ score, reason = parse_quality_evaluator_reason_score(llm_output)
+ return {
+ self._result_key: float(score),
+ f"gpt_{self._result_key}": float(score),
+ f"{self._result_key}_reason": reason,
+ }
  match = re.search(r"\d", llm_output)
  if match:
  score = float(match.group())
- return {self._result_key: float(score)}
+ return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+ return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
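
The effect of this change on _do_eval's output is easier to see as data. Roughly, and assuming "coherence" is one of the keys listed in PROMPT_BASED_REASON_EVALUATORS (the diff does not show that constant's contents), the returned dictionaries now look like this:

# Result shapes produced by the updated _do_eval (illustrative values only).
# Reason-capable evaluators return the score, a legacy "gpt_"-prefixed alias,
# and the reason string parsed by parse_quality_evaluator_reason_score:
with_reason = {
    "coherence": 4.0,
    "gpt_coherence": 4.0,            # backwards-compatible alias
    "coherence_reason": "The response stays on topic and flows logically.",
}

# Other prompty-based evaluators return only the score and its alias; the score
# falls back to math.nan when no digit can be parsed from the LLM output:
score_only = {
    "fluency": 3.0,
    "gpt_fluency": 3.0,
}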
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -1,48 +1,53 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from typing import Dict, Optional, Union

- from typing import Dict, Optional
  from typing_extensions import override

- from azure.identity import DefaultAzureCredential
- from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.constants import (
+ EvaluationMetrics,
+ _InternalEvaluationMetrics,
+ Tasks,
+ _InternalAnnotationTasks,
+ )
  from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import EvaluationException
+ from azure.core.credentials import TokenCredential
+
  from . import EvaluatorBase

+ T = Union[str, float]
+

- class RaiServiceEvaluatorBase(EvaluatorBase):
+ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
  This includes content safety evaluators, protected material evaluators, and others. These evaluators
  are all assumed to be of the "query and response or conversation" input variety.

- param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
- to specify which evaluation to perform.
- type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
- param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+ :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+ to specify which evaluation to perform.
+ :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+ :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
  aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
  aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
  when this occurs. Default is False, resulting full conversation evaluation and aggregation.
- type eval_last_turn: bool
+ :type eval_last_turn: bool
  """

  @override
  def __init__(
  self,
- eval_metric: EvaluationMetrics,
+ eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
  azure_ai_project: dict,
- credential: Optional[dict] = None,
+ credential: TokenCredential,
  eval_last_turn: bool = False,
  ):
  super().__init__(eval_last_turn=eval_last_turn)
  self._eval_metric = eval_metric
- self._azure_ai_project = azure_ai_project
- if credential is None:
- # Use DefaultCredential if no credential is provided
- self._credential = DefaultAzureCredential()
- else:
- self._credential = credential
+ self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+ self._credential = credential

  @override
  def __call__(
@@ -50,8 +55,8 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
  *,
  query: Optional[str] = None,
  response: Optional[str] = None,
- conversation: Optional[dict] = None,
- **kwargs
+ conversation=None,
+ **kwargs,
  ):
  """Evaluate either a query and response or a conversation. Must supply either a query AND response,
  or a conversation, but not both.
@@ -63,14 +68,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
  :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
  key "messages", and potentially a global context under the key "context". Conversation turns are expected
  to be dictionaries with keys "content", "role", and possibly "context".
- :paramtype conversation: Optional[Dict]
- :return: The evaluation result.
- :rtype: Dict
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+ :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
  """
  return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

  @override
- async def _do_eval(self, eval_input: Dict):
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
  """Perform the evaluation using the Azure AI RAI service.
  The exact evaluation performed is determined by the evaluation metric supplied
  by the child class initializer.
@@ -90,10 +94,43 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
  + " This should have failed earlier."
  ),
  )
+ input_data = {"query": query, "response": response}
+
+ if "context" in self._singleton_inputs:
+ context = eval_input.get("context", None)
+ if context is None:
+ raise EvaluationException(
+ message="Not implemented",
+ internal_message=(
+ "Attempted context-based evaluation without supplying context."
+ + " This should have failed earlier."
+ ),
+ )
+ input_data["context"] = context
+
  return await evaluate_with_rai_service(
  metric_name=self._eval_metric,
- query=query,
- response=response,
+ data=input_data,
  project_scope=self._azure_ai_project,
  credential=self._credential,
+ annotation_task=self._get_task(),
  )
+
+ def _get_task(self):
+ """Get the annotation task for the current evaluation metric.
+ The annotation task is used by the RAI service script to determine a the message format
+ of the API call, and how the output is processed, among other things.
+
+ :return: The annotation task for the evaluator's self._eval_metric value.
+ :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+ """
+ if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+ return Tasks.GROUNDEDNESS
+ if self._eval_metric == EvaluationMetrics.XPIA:
+ return Tasks.XPIA
+ if self._eval_metric == _InternalEvaluationMetrics.ECI:
+ return _InternalAnnotationTasks.ECI
+ if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+ return Tasks.PROTECTED_MATERIAL
+ return Tasks.CONTENT_HARM
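
Concrete RAI-service evaluators now have to adapt to two contract changes from these hunks: credential is required (the DefaultAzureCredential fallback is gone, and the project dict is validated), and the request is sent as a single data payload with an annotation_task chosen by _get_task. The sketch below shows roughly how a subclass initializer might wire into the new base class; the member name EvaluationMetrics.VIOLENCE and the import paths are assumptions, since the diff only shows the base class.

# Sketch only: a hypothetical concrete evaluator under the new base-class contract.
# EvaluationMetrics.VIOLENCE is an assumed member name, used here for illustration.
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
from azure.core.credentials import TokenCredential


class ViolenceLikeEvaluator(RaiServiceEvaluatorBase):
    def __init__(self, credential: TokenCredential, azure_ai_project: dict, eval_last_turn: bool = False):
        # credential is now required (no DefaultAzureCredential fallback), and
        # azure_ai_project is checked by validate_azure_ai_project in the base class.
        super().__init__(
            eval_metric=EvaluationMetrics.VIOLENCE,
            azure_ai_project=azure_ai_project,
            credential=credential,
            eval_last_turn=eval_last_turn,
        )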
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -2,32 +2,34 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  from concurrent.futures import as_completed
+ from typing import Callable, Dict, List, Optional, Union

  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from typing_extensions import override

- try:
- from ._hate_unfairness import HateUnfairnessEvaluator
- from ._self_harm import SelfHarmEvaluator
- from ._sexual import SexualEvaluator
- from ._violence import ViolenceEvaluator
- except ImportError:
- from _hate_unfairness import HateUnfairnessEvaluator
- from _self_harm import SelfHarmEvaluator
- from _sexual import SexualEvaluator
- from _violence import ViolenceEvaluator
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase

+ from ._hate_unfairness import HateUnfairnessEvaluator
+ from ._self_harm import SelfHarmEvaluator
+ from ._sexual import SexualEvaluator
+ from ._violence import ViolenceEvaluator

- class ContentSafetyEvaluator:
+
+ @experimental
+ class ContentSafetyEvaluator(EvaluatorBase):
  """
  Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.

+ :param credential: The credential for connecting to Azure AI project. Required
+ :type credential: ~azure.core.credentials.TokenCredential
  :param azure_ai_project: The scope of the Azure AI project.
  It contains subscription id, resource group, and project name.
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
- :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
- Default is True.
- :param credential: The credential for connecting to Azure AI project.
- :type credential: ~azure.core.credentials.TokenCredential
+ :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
+ :type eval_last_turn: bool
+ :param kwargs: Additional arguments to pass to the evaluator.
+ :type kwargs: Any
  :return: A function that evaluates content-safety metrics for "question-answering" scenario.
  :rtype: Callable

@@ -66,33 +68,63 @@ class ContentSafetyEvaluator:
  }
  """

- def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
- self._parallel = parallel
- self._evaluators = [
- ViolenceEvaluator(azure_ai_project, credential),
- SexualEvaluator(azure_ai_project, credential),
- SelfHarmEvaluator(azure_ai_project, credential),
- HateUnfairnessEvaluator(azure_ai_project, credential),
+ # TODO address 3579092 to re-enabled parallel evals.
+ def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
+ super().__init__(eval_last_turn=eval_last_turn)
+ self._parallel = kwargs.pop("parallel", False)
+ self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+ ViolenceEvaluator(credential, azure_ai_project),
+ SexualEvaluator(credential, azure_ai_project),
+ SelfHarmEvaluator(credential, azure_ai_project),
+ HateUnfairnessEvaluator(credential, azure_ai_project),
  ]

- def __call__(self, *, query: str, response: str, **kwargs):
+ @override
+ def __call__(
+ self,
+ *,
+ query: Optional[str] = None,
+ response: Optional[str] = None,
+ conversation=None,
+ **kwargs,
+ ):
+ """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
+ This inputs must supply either a query AND response, or a conversation, but not both.
+
+ :keyword query: The query to evaluate.
+ :paramtype query: Optional[str]
+ :keyword response: The response to evaluate.
+ :paramtype response: Optional[str]
+ :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected
+ to be dictionaries with keys "content", "role", and possibly "context".
+ :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+ :return: The evaluation result.
+ :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
  """
- Evaluates content-safety metrics for "question-answering" scenario.
-
- :keyword query: The query to be evaluated.
- :paramtype query: str
- :keyword response: The response to be evaluated.
- :paramtype response: str
- :keyword parallel: Whether to evaluate in parallel.
- :paramtype parallel: bool
- :return: The scores for content-safety.
- :rtype: dict
+ return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+ @override
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+ """Perform the evaluation using the Azure AI RAI service.
+ The exact evaluation performed is determined by the evaluation metric supplied
+ by the child class initializer.
+
+ :param eval_input: The input to the evaluation function.
+ :type eval_input: Dict
+ :return: The evaluation result.
+ :rtype: Dict
  """
- results = {}
+ query = eval_input.get("query", None)
+ response = eval_input.get("response", None)
+ conversation = eval_input.get("conversation", None)
+ results: Dict[str, Union[str, float]] = {}
+ # TODO fix this to not explode on empty optional inputs (PF SKD error)
  if self._parallel:
  with ThreadPoolExecutor() as executor:
+ # pylint: disable=no-value-for-parameter
  futures = {
- executor.submit(evaluator, query=query, response=response, **kwargs): evaluator
+ executor.submit(query=query, response=response, conversation=conversation): evaluator
  for evaluator in self._evaluators
  }

@@ -100,7 +132,7 @@ class ContentSafetyEvaluator:
  results.update(future.result())
  else:
  for evaluator in self._evaluators:
- result = evaluator(query=query, response=response, **kwargs)
+ result = evaluator(query=query, response=response, conversation=conversation)
  results.update(result)

  return results
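
For callers, the visible changes are the constructor signature (credential comes first and is required, and "parallel" is demoted to a keyword argument in **kwargs that now defaults to False) and the ability to pass a whole conversation instead of a query/response pair. A hedged usage sketch follows, assuming ContentSafetyEvaluator is exported from the package root and that the AzureAIProject dictionary uses the field names shown:

# Usage sketch under the new signature; the azure_ai_project key names and the
# top-level import are assumptions based on the docstrings above.
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)

# Either a single query/response pair...
pair_result = evaluator(query="What is the capital of France?", response="Paris.")

# ...or a whole conversation (but not both at once).
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
conversation_result = evaluator(conversation=conversation)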