azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. azure/ai/evaluation/__init__.py +1 -15
  2. azure/ai/evaluation/_azure/_clients.py +24 -8
  3. azure/ai/evaluation/_azure/_models.py +2 -2
  4. azure/ai/evaluation/_common/utils.py +8 -8
  5. azure/ai/evaluation/_constants.py +21 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  7. azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  8. azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
  9. azure/ai/evaluation/_evaluate/_utils.py +27 -0
  10. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  11. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  13. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  14. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  15. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  16. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  17. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  18. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  21. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  22. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  23. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  24. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  25. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  26. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  27. azure/ai/evaluation/_exceptions.py +0 -1
  28. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  29. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
  30. azure/ai/evaluation/_version.py +2 -1
  31. azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
  32. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  33. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  34. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  35. azure/ai/evaluation/simulator/_simulator.py +21 -13
  36. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
  37. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  46. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
  47. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
  48. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,15 +4,18 @@
 
  import inspect
  from abc import ABC, abstractmethod
- from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+ from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
  from promptflow._utils.async_utils import async_run_allowing_running_loop
  from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
- from azure.ai.evaluation._common.math import list_mean
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._common.utils import remove_optional_singletons
+ from azure.ai.evaluation._constants import _AggregationType
  from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._common._experimental import experimental
+
+ from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
  P = ParamSpec("P")
  T = TypeVar("T")

@@ -25,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
  query: Dict[str, Any]
  response: Dict[str, Any]
  context: str
+ ground_truth: str
 
 
  AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]

@@ -69,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  :type not_singleton_inputs: List[str]
  :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
  :type eval_last_turn: bool
+ :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+ to produce a single result.
+ Default is ~azure.ai.evaluation._AggregationType.MEAN.
+ :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+ :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+ overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+ :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
  """
 
  # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -80,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  *,
  not_singleton_inputs: List[str] = ["conversation", "kwargs"],
  eval_last_turn: bool = False,
+ conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+ conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
  ):
  self._not_singleton_inputs = not_singleton_inputs
  self._eval_last_turn = eval_last_turn
  self._singleton_inputs = self._derive_singleton_inputs()
  self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+ self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+ if conversation_aggregator_override is not None:
+ # Type ignore since we already checked for None, but mypy doesn't know that.
+ self._conversation_aggregation_function = conversation_aggregator_override # type: ignore[assignment]
 
  # This needs to be overridden just to change the function header into something more informative,
  # and to be able to add a more specific docstring. The actual function contents should just be

@@ -158,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  include_context = "context" in self._singleton_inputs
  include_query = "query" in self._singleton_inputs
  include_response = "response" in self._singleton_inputs
+ include_ground_truth = "ground_truth" in self._singleton_inputs
 
  def converter(conversation: Dict) -> List[DerivedEvalInput]:
  messages = cast(List[Dict[str, Any]], conversation["messages"])

@@ -198,6 +216,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  eval_input["response"] = response.get("content", "")
  if include_context:
  eval_input["context"] = str(context)
+ if include_ground_truth:
+ eval_input["ground_truth"] = response.get("ground_truth", "")
  eval_inputs.append(eval_input)
  return eval_inputs
 

@@ -355,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  # Find and average all numeric values
  for metric, values in evaluation_per_turn.items():
  if all(isinstance(value, (int, float)) for value in values):
- aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+ aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
  # Slap the per-turn results back in.
  aggregated["evaluation_per_turn"] = evaluation_per_turn
  return aggregated

@@ -383,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  # Otherwise, aggregate results.
  return self._aggregate_results(per_turn_results=per_turn_results)
 
+ # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
  @final
  def _to_async(self) -> "AsyncEvaluatorBase":
  return self._async_evaluator
 
+ @experimental
+ @final
+ def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+ """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+ multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+ multi-turn conversation into a single top-level result.
+
+ :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+ results of a conversation to produce a single result.
+ :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+ """
+ self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+ @experimental
+ @final
+ def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+ """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+ of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+ evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+ suit your needs, but use with caution.
+
+ :param aggregator: The function to use to aggregate per-turn results.
+ :type aggregator: Callable[[List[float]], float]
+ """
+ self._conversation_aggregation_function = aggregator
+
+ @experimental
+ @final
+ def _get_conversation_aggregator_type(self) -> _AggregationType:
+ """Get the current conversation aggregation type used by this evaluator. This refers to the
+ method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+ is inputted into an evaluator that evaluates each turn individually). The individual inputs
+ are combined by the function implied here to produce a single overall result.
+
+ :return: The conversation aggregation type.
+ :rtype: ~azure.ai.evaluation._AggregationType
+ """
+ return GetAggregatorType(self._conversation_aggregation_function)
+
 
  class AsyncEvaluatorBase:
  """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions

@@ -402,7 +463,9 @@ class AsyncEvaluatorBase:
  # are just not passed into this function instead of ending up in kwargs.
  # Since we want this to be relatively call-agnostic, we just account for every input that any children
  # are known to throw at this, mash them into kwargs, and then pass them into the real call.
- async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+ async def __call__(
+ self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+ ):
  if conversation is not None:
  kwargs["conversation"] = conversation
  if query is not None:

@@ -411,4 +474,6 @@ class AsyncEvaluatorBase:
  kwargs["response"] = response
  if context is not None:
  kwargs["context"] = context
+ if ground_truth is not None:
+ kwargs["ground_truth"] = ground_truth
  return await self._real_call(**kwargs)
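
The hunks above add configurable conversation aggregation to EvaluatorBase. The sketch below is illustrative only and not part of the diff: it shows how a hypothetical subclass might opt into MAX aggregation via the new conversation_aggregation_type keyword. The class name and toy metric are invented, and the private import paths are assumptions based on the file list above.

    # Illustrative sketch (not part of the diff): a hypothetical evaluator that reports
    # the worst per-turn score of a conversation instead of the default mean.
    from typing import Dict

    from typing_extensions import overload, override

    from azure.ai.evaluation._constants import _AggregationType
    from azure.ai.evaluation._evaluators._common import EvaluatorBase


    class ResponseLengthEvaluator(EvaluatorBase[float]):
        def __init__(self):
            # conversation_aggregator_override=<callable> could be passed instead to inject
            # a fully custom aggregator (reported back as _AggregationType.CUSTOM).
            super().__init__(conversation_aggregation_type=_AggregationType.MAX)

        @override
        async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
            # Toy per-turn metric: length of the response string, purely for illustration.
            return {"response_length": float(len(eval_input.get("response", "")))}

        @overload
        def __call__(self, *, response: str) -> Dict[str, float]: ...

        @override
        def __call__(self, *args, **kwargs):
            return super().__call__(*args, **kwargs)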

azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py (new file)

@@ -0,0 +1,61 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from concurrent.futures import as_completed
+ from typing import TypeVar, Dict, List
+
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from typing_extensions import override
+
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+ T = TypeVar("T")
+
+
+ class MultiEvaluatorBase(EvaluatorBase[T]):
+ """
+ Base class for evaluators that contain and run multiple other evaluators to produce a
+ suite of metrics.
+
+ Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+ :param evaluators: The list of evaluators to run when this evaluator is called.
+ :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+ :param kwargs: Additional arguments to pass to the evaluator.
+ :type kwargs: Any
+ :return: An evaluator that runs multiple other evaluators and combines their results.
+ """
+
+ def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+ super().__init__()
+ self._parallel = kwargs.pop("_parallel", True)
+ self._evaluators = evaluators
+
+ @override
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+ """Run each evaluator, possibly in parallel, and combine the results into
+ a single large dictionary containing each evaluation. Inputs are passed
+ directly to each evaluator without additional processing.
+
+
+ :param eval_input: The input to the evaluation function.
+ :type eval_input: Dict
+ :return: The evaluation result.
+ :rtype: Dict
+ """
+ results: Dict[str, T] = {}
+ if self._parallel:
+ with ThreadPoolExecutor() as executor:
+ # pylint: disable=no-value-for-parameter
+ futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+ for future in as_completed(futures):
+ results.update(future.result())
+ else:
+ for evaluator in self._evaluators:
+ result = evaluator(**eval_input)
+ # Ignore is to avoid mypy getting upset over the amount of duck-typing
+ # that's going on to shove evaluators around like this.
+ results.update(result) # type: ignore[arg-type]
+
+ return results
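
For context, here is a hedged sketch of how a composite evaluator could build on the new MultiEvaluatorBase, mirroring the ContentSafetyEvaluator change later in this diff. The TextQualityEvaluator name and the choice of BLEU/ROUGE children are assumptions for illustration and are not part of the package.

    # Illustrative sketch (not from the diff): composing two existing evaluators into one
    # result dictionary; MultiEvaluatorBase fans eval_input out to each child, in parallel
    # by default (_parallel=False disables that).
    from typing import Dict, Union

    from typing_extensions import overload, override

    from azure.ai.evaluation import BleuScoreEvaluator, RougeScoreEvaluator, RougeType
    from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase


    class TextQualityEvaluator(MultiEvaluatorBase[Union[str, float]]):
        """Hypothetical composite evaluator that merges BLEU and ROUGE results."""

        def __init__(self, **kwargs):
            evaluators = [
                BleuScoreEvaluator(),
                RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
            ]
            super().__init__(evaluators=evaluators, **kwargs)

        @overload
        def __call__(self, *, response: str, ground_truth: str) -> Dict[str, Union[str, float]]: ...

        @override
        def __call__(self, *args, **kwargs):
            # _do_eval is inherited from MultiEvaluatorBase; only __call__ is defined here.
            return super().__call__(*args, **kwargs)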

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -15,6 +15,7 @@ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, e
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import EvaluationException
  from azure.ai.evaluation._common.utils import validate_conversation
+ from azure.ai.evaluation._constants import _AggregationType
  from azure.core.credentials import TokenCredential
 
  from . import EvaluatorBase

@@ -35,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
  when this occurs. Default is False, resulting full conversation evaluation and aggregation.
  :type eval_last_turn: bool
+ :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+ to produce a single result.
+ Default is ~azure.ai.evaluation._AggregationType.MEAN.
+ :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
  """
 
  @override

@@ -44,8 +49,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  azure_ai_project: dict,
  credential: TokenCredential,
  eval_last_turn: bool = False,
+ conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
  ):
- super().__init__(eval_last_turn=eval_last_turn)
+ super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
  self._eval_metric = eval_metric
  self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
  self._credential = credential

azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py (new file)

@@ -0,0 +1,49 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from typing import Callable, List
+ from azure.ai.evaluation._common.math import list_mean
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._constants import _AggregationType
+
+
+ def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+ if aggregation_type == _AggregationType.SUM:
+ return sum
+ if aggregation_type == _AggregationType.MEAN:
+ return list_mean
+ if aggregation_type == _AggregationType.MAX:
+ return max
+ if aggregation_type == _AggregationType.MIN:
+ return min
+ if aggregation_type == _AggregationType.CUSTOM:
+ msg = (
+ "Cannot 'get' aggregator function associated with custom aggregation enum."
+ + " This enum value should only be outputted as an indicator of an injected"
+ + " aggregation function, not inputted directly"
+ )
+ raise EvaluationException(
+ message=msg,
+ blame=ErrorBlame.UNKNOWN,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.EVALUATE,
+ )
+ raise EvaluationException(
+ message=f"Unaccounted for aggregation type: {aggregation_type}",
+ blame=ErrorBlame.UNKNOWN,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.EVALUATE,
+ )
+
+
+ def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+ if aggregation_function == sum: # pylint: disable=comparison-with-callable
+ return _AggregationType.SUM
+ if aggregation_function == list_mean: # pylint: disable=comparison-with-callable
+ return _AggregationType.MEAN
+ if aggregation_function == max: # pylint: disable=comparison-with-callable
+ return _AggregationType.MAX
+ if aggregation_function == min: # pylint: disable=comparison-with-callable
+ return _AggregationType.MIN
+ return _AggregationType.CUSTOM
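
A brief illustrative sketch (not part of the diff) of how these helpers behave, assuming the private module path shown in the file list above:

    from azure.ai.evaluation._constants import _AggregationType
    from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
        GetAggregator,
        GetAggregatorType,
    )

    aggregate = GetAggregator(_AggregationType.MAX)
    print(aggregate([0.2, 0.9, 0.5]))        # 0.9
    print(GetAggregatorType(aggregate))      # _AggregationType.MAX

    # An unrecognized callable (e.g. a custom function injected via conversation_aggregator_override)
    # maps back to CUSTOM, while GetAggregator(_AggregationType.CUSTOM) raises an EvaluationException.
    print(GetAggregatorType(lambda scores: scores[-1]))  # _AggregationType.CUSTOM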

azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -1,13 +1,11 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from concurrent.futures import as_completed
- from typing import Callable, Dict, List, Union
+ from typing import Dict, List, Union
 
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
  from typing_extensions import overload, override
 
- from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
  from azure.ai.evaluation._common._experimental import experimental
 

@@ -18,7 +16,7 @@ from ._violence import ViolenceEvaluator
 
 
  @experimental
- class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
+ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
  """
  Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 

@@ -44,16 +42,14 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
  id = "content_safety"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
- # TODO address 3579092 to re-enabled parallel evals.
  def __init__(self, credential, azure_ai_project, **kwargs):
- super().__init__()
- self._parallel = kwargs.pop("_parallel", True)
- self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+ evaluators = [
  ViolenceEvaluator(credential, azure_ai_project),
  SexualEvaluator(credential, azure_ai_project),
  SelfHarmEvaluator(credential, azure_ai_project),
  HateUnfairnessEvaluator(credential, azure_ai_project),
  ]
+ super().__init__(evaluators=evaluators, **kwargs)
 
  @overload
  def __call__(

@@ -109,36 +105,3 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
  :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
  """
  return super().__call__(*args, **kwargs)
-
- @override
- async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
- """Perform the evaluation using the Azure AI RAI service.
- The exact evaluation performed is determined by the evaluation metric supplied
- by the child class initializer.
-
- :param eval_input: The input to the evaluation function.
- :type eval_input: Dict
- :return: The evaluation result.
- :rtype: Dict
- """
- query = eval_input.get("query", None)
- response = eval_input.get("response", None)
- conversation = eval_input.get("conversation", None)
- results: Dict[str, Union[str, float]] = {}
- # TODO fix this to not explode on empty optional inputs (PF SKD error)
- if self._parallel:
- with ThreadPoolExecutor() as executor:
- # pylint: disable=no-value-for-parameter
- futures = {
- executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
- for evaluator in self._evaluators
- }
-
- for future in as_completed(futures):
- results.update(future.result())
- else:
- for evaluator in self._evaluators:
- result = evaluator(query=query, response=response, conversation=conversation)
- results.update(result)
-
- return results
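
The caller-facing behavior of ContentSafetyEvaluator is unchanged by this refactor; only the fan-out logic moved into MultiEvaluatorBase. A hedged usage sketch follows (the project values and example strings are placeholders, not from the diff):

    from azure.ai.evaluation import ContentSafetyEvaluator
    from azure.identity import DefaultAzureCredential

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    evaluator = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    # Returns one merged dict with violence, sexual, self_harm, and hate_unfairness results.
    result = evaluator(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )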

azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._constants import _AggregationType
 
 
  @experimental

@@ -71,6 +72,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  eval_metric=EvaluationMetrics.HATE_FAIRNESS,
  azure_ai_project=azure_ai_project,
  credential=credential,
+ conversation_aggregation_type=_AggregationType.MAX,
  )
 
  @overload

azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._constants import _AggregationType
 
 
  @experimental

@@ -65,6 +66,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  eval_metric=EvaluationMetrics.SELF_HARM,
  azure_ai_project=azure_ai_project,
  credential=credential,
+ conversation_aggregation_type=_AggregationType.MAX,
  )
 
  @overload

azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._constants import _AggregationType
 
 
  @experimental

@@ -67,6 +68,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  eval_metric=EvaluationMetrics.SEXUAL,
  azure_ai_project=azure_ai_project,
  credential=credential,
+ conversation_aggregation_type=_AggregationType.MAX,
  )
 
  @overload

azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._constants import _AggregationType
 
 
  @experimental

@@ -67,6 +68,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
  eval_metric=EvaluationMetrics.VIOLENCE,
  azure_ai_project=azure_ai_project,
  credential=credential,
+ conversation_aggregation_type=_AggregationType.MAX,
  )
 
  @overload
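
Each of the four content-safety evaluators above now passes conversation_aggregation_type=_AggregationType.MAX, so a multi-turn conversation reports its worst-scoring turn rather than the per-turn mean used previously. A minimal illustration with made-up per-turn scores:

    # Illustrative sketch (not from the diff): effect of MAX aggregation on a 3-turn conversation.
    per_turn_scores = [0.0, 1.0, 5.0]  # hypothetical per-turn severity scores

    mean_aggregate = sum(per_turn_scores) / len(per_turn_scores)  # 2.0 -- previous default (MEAN)
    max_aggregate = max(per_turn_scores)                          # 5.0 -- new behavior (MAX)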

azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -3,45 +3,44 @@
  # ---------------------------------------------------------
 
  from collections import Counter
- from typing import List
+ from typing import List, Dict
+ from typing_extensions import overload, override
 
- from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
+ class F1ScoreEvaluator(EvaluatorBase):
+ """
+ Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
 
- class _AsyncF1ScoreEvaluator:
- def __init__(self):
- pass
+ F1 Scores range from 0 to 1, with 1 being the best possible score.
 
- async def __call__(self, *, response: str, ground_truth: str, **kwargs):
- """
- Evaluate F1 score.
+ The F1-score computes the ratio of the number of shared words between the model generation and
+ the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+ truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+ precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+ is the ratio of the number of shared words to the total number of words in the ground truth.
 
- :keyword response: The response to be evaluated.
- :paramtype response: str
- :keyword ground_truth: The ground truth to be evaluated.
- :paramtype ground_truth: str
- :return: The F1 score.
- :rtype: Dict[str, float]
- """
- # Validate inputs
- if not (response and response.strip() and response != "None") or not (
- ground_truth and ground_truth.strip() and ground_truth != "None"
- ):
- msg = "Both 'response' and 'ground_truth' must be non-empty strings."
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- error_category=ErrorCategory.MISSING_FIELD,
- error_blame=ErrorBlame.USER_ERROR,
- error_target=ErrorTarget.F1_EVALUATOR,
- )
+ Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+ model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+ information in the response.
 
- # Run f1 score computation.
- f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
 
- return {"f1_score": f1_result}
+ .. admonition:: Example:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+ :start-after: [START f1_score_evaluator]
+ :end-before: [END f1_score_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call an F1ScoreEvaluator.
+ """
+
+ id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+ def __init__(self):
+ super().__init__()
 
  @classmethod
  def _compute_f1_score(cls, response: str, ground_truth: str) -> float:

@@ -103,41 +102,24 @@ class _AsyncF1ScoreEvaluator:
 
  return f1
 
+ @override
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+ """Produce an f1 score evaluation result.
 
- class F1ScoreEvaluator:
- """
- Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
-
- F1 Scores range from 0 to 1, with 1 being the best possible score.
-
- The F1-score computes the ratio of the number of shared words between the model generation and
- the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
- truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
- precision is the ratio of the number of shared words to the total number of words in the generation, and recall
- is the ratio of the number of shared words to the total number of words in the ground truth.
-
- Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
- model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
- information in the response.
-
-
- .. admonition:: Example:
-
- .. literalinclude:: ../samples/evaluation_samples_evaluate.py
- :start-after: [START f1_score_evaluator]
- :end-before: [END f1_score_evaluator]
- :language: python
- :dedent: 8
- :caption: Initialize and call an F1ScoreEvaluator.
- """
-
- id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
- """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+ :param eval_input: The input to the evaluation function.
+ :type eval_input: Dict
+ :return: The evaluation result.
+ :rtype: Dict
+ """
+ ground_truth = eval_input["ground_truth"]
+ response = eval_input["response"]
+ # Run f1 score computation.
+ f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
 
- def __init__(self):
- self._async_evaluator = _AsyncF1ScoreEvaluator()
+ return {"f1_score": f1_result}
 
- def __call__(self, *, response: str, ground_truth: str, **kwargs):
+ @overload # type: ignore
+ def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
  """
  Evaluate F1 score.
 

@@ -149,9 +131,20 @@ class F1ScoreEvaluator:
  :rtype: Dict[str, float]
  """
 
- return async_run_allowing_running_loop(
- self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
- )
+ @override
+ def __call__( # pylint: disable=docstring-missing-param
+ self,
+ *args,
+ **kwargs,
+ ):
+ """
+ Evaluate F1 score.
 
- def _to_async(self):
- return self._async_evaluator
+ :keyword response: The response to be evaluated.
+ :paramtype response: str
+ :keyword ground_truth: The ground truth to be evaluated.
+ :paramtype ground_truth: str
+ :return: The F1 score.
+ :rtype: Dict[str, float]
+ """
+ return super().__call__(*args, **kwargs)
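
F1ScoreEvaluator is refactored onto EvaluatorBase, dropping the private _AsyncF1ScoreEvaluator wrapper; the public call pattern is unchanged. A hedged usage sketch (example strings are invented):

    from azure.ai.evaluation import F1ScoreEvaluator

    f1 = F1ScoreEvaluator()
    result = f1(
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    print(result["f1_score"])  # float in [0, 1]

    # Because the evaluator now derives from EvaluatorBase and ground_truth is a recognized
    # singleton input (per the _base_eval.py changes above), multi-turn conversation input
    # can also be evaluated turn by turn and aggregated into a single result.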