azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (78) hide show
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +201 -16
  6. azure/ai/evaluation/_constants.py +12 -0
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
  15. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  16. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  37. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  38. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
  42. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
  44. azure/ai/evaluation/_exceptions.py +9 -7
  45. azure/ai/evaluation/_http_utils.py +203 -132
  46. azure/ai/evaluation/_model_configurations.py +37 -9
  47. azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
  48. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  49. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  50. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  51. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  52. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  53. azure/ai/evaluation/_version.py +1 -1
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  55. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  56. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  57. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  58. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  59. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  60. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  61. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  62. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  63. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  64. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  65. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
  66. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  67. azure/ai/evaluation/simulator/_simulator.py +127 -117
  68. azure/ai/evaluation/simulator/_tracing.py +4 -4
  69. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
  70. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  71. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
  72. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  73. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  74. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  75. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  76. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  77. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  78. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,76 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ import math
6
+ import re
7
+ from typing import Dict
8
+
9
+ from promptflow.core import AsyncPrompty
10
+ from typing_extensions import override
11
+
12
+ from ..._common.utils import construct_prompty_model_config, validate_model_config
13
+ from . import EvaluatorBase
14
+
15
+ try:
16
+ from ..._user_agent import USER_AGENT
17
+ except ImportError:
18
+ USER_AGENT = "None"
19
+
20
+
21
+ class PromptyEvaluatorBase(EvaluatorBase[float]):
22
+ """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
23
+ make use of a prompty file, and return their results as a dictionary, with a single key-value pair
24
+ linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
25
+ per-turn results are stored in a list under the key "evaluation_per_turn").
26
+
27
+ :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
28
+ a dictionary in the format {result_key: float}.
29
+ :type result_key: str
30
+ :param prompty_file: The path to the prompty file to use for evaluation.
31
+ :type prompty_file: str
32
+ :param model_config: The model configuration to use for evaluation.
33
+ :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
34
+ :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
35
+ Useful since some evaluators of this format are response-only.
36
+ :type ignore_queries: bool
37
+ """
38
+
39
+ LLM_CALL_TIMEOUT = 600
40
+ DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
41
+
42
+ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
43
+ self._result_key = result_key
44
+ self._prompty_file = prompty_file
45
+ super().__init__(eval_last_turn=eval_last_turn)
46
+
47
+ prompty_model_config = construct_prompty_model_config(
48
+ validate_model_config(model_config),
49
+ self.DEFAULT_OPEN_API_VERSION,
50
+ USER_AGENT,
51
+ )
52
+
53
+ self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
54
+
55
+ # __call__ not overridden here because child classes have such varied signatures that there's no point
56
+ # defining a default here.
57
+
58
+ @override
59
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
60
+ """Do a relevance evaluation.
61
+
62
+ :param eval_input: The input to the evaluator. Expected to contain
63
+ whatever inputs are needed for the _flow method, including context
64
+ and other fields depending on the child class.
65
+ :type eval_input: Dict
66
+ :return: The evaluation result.
67
+ :rtype: Dict
68
+ """
69
+ llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
70
+
71
+ score = math.nan
72
+ if llm_output:
73
+ match = re.search(r"\d", llm_output)
74
+ if match:
75
+ score = float(match.group())
76
+ return {self._result_key: float(score)}
@@ -0,0 +1,97 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ from typing import Dict, Optional, Union
5
+
6
+ from typing_extensions import override
7
+
8
+ from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
9
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
10
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project
11
+ from azure.ai.evaluation._exceptions import EvaluationException
12
+ from azure.core.credentials import TokenCredential
13
+
14
+ from . import EvaluatorBase
15
+
16
+
17
+ class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
18
+ """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
19
+ This includes content safety evaluators, protected material evaluators, and others. These evaluators
20
+ are all assumed to be of the "query and response or conversation" input variety.
21
+
22
+ :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
23
+ to specify which evaluation to perform.
24
+ :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
25
+ :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
26
+ aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
27
+ aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
28
+ when this occurs. Default is False, resulting full conversation evaluation and aggregation.
29
+ :type eval_last_turn: bool
30
+ """
31
+
32
+ @override
33
+ def __init__(
34
+ self,
35
+ eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
36
+ azure_ai_project: dict,
37
+ credential: TokenCredential,
38
+ eval_last_turn: bool = False,
39
+ ):
40
+ super().__init__(eval_last_turn=eval_last_turn)
41
+ self._eval_metric = eval_metric
42
+ self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
43
+ self._credential = credential
44
+
45
+ @override
46
+ def __call__(
47
+ self,
48
+ *,
49
+ query: Optional[str] = None,
50
+ response: Optional[str] = None,
51
+ conversation: Optional[dict] = None,
52
+ **kwargs,
53
+ ):
54
+ """Evaluate either a query and response or a conversation. Must supply either a query AND response,
55
+ or a conversation, but not both.
56
+
57
+ :keyword query: The query to evaluate.
58
+ :paramtype query: Optional[str]
59
+ :keyword response: The response to evaluate.
60
+ :paramtype response: Optional[str]
61
+ :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
62
+ key "messages", and potentially a global context under the key "context". Conversation turns are expected
63
+ to be dictionaries with keys "content", "role", and possibly "context".
64
+ :paramtype conversation: Optional[Dict]
65
+ :return: The evaluation result.
66
+ :rtype: Dict[str, Union[str, float]]
67
+ """
68
+ return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
69
+
70
+ @override
71
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
72
+ """Perform the evaluation using the Azure AI RAI service.
73
+ The exact evaluation performed is determined by the evaluation metric supplied
74
+ by the child class initializer.
75
+
76
+ :param eval_input: The input to the evaluation function.
77
+ :type eval_input: Dict
78
+ :return: The evaluation result.
79
+ :rtype: Dict
80
+ """
81
+ query = eval_input.get("query", None)
82
+ response = eval_input.get("response", None)
83
+ if query is None or response is None:
84
+ raise EvaluationException(
85
+ message="Not implemented",
86
+ internal_message=(
87
+ "Reached query/response evaluation without supplying query or response."
88
+ + " This should have failed earlier."
89
+ ),
90
+ )
91
+ return await evaluate_with_rai_service(
92
+ metric_name=self._eval_metric,
93
+ query=query,
94
+ response=response,
95
+ project_scope=self._azure_ai_project,
96
+ credential=self._credential,
97
+ )
@@ -3,8 +3,6 @@
3
3
  # ---------------------------------------------------------
4
4
 
5
5
  from ._content_safety import ContentSafetyEvaluator
6
- from ._content_safety_base import ContentSafetyEvaluatorBase
7
- from ._content_safety_chat import ContentSafetyChatEvaluator
8
6
  from ._hate_unfairness import HateUnfairnessEvaluator
9
7
  from ._self_harm import SelfHarmEvaluator
10
8
  from ._sexual import SexualEvaluator
@@ -16,6 +14,4 @@ __all__ = [
16
14
  "SelfHarmEvaluator",
17
15
  "HateUnfairnessEvaluator",
18
16
  "ContentSafetyEvaluator",
19
- "ContentSafetyChatEvaluator",
20
- "ContentSafetyEvaluatorBase",
21
17
  ]
@@ -2,32 +2,27 @@
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
4
  from concurrent.futures import as_completed
5
+ from typing import Callable, Dict, List, Union
5
6
 
6
7
  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
7
8
 
8
- try:
9
- from ._hate_unfairness import HateUnfairnessEvaluator
10
- from ._self_harm import SelfHarmEvaluator
11
- from ._sexual import SexualEvaluator
12
- from ._violence import ViolenceEvaluator
13
- except ImportError:
14
- from _hate_unfairness import HateUnfairnessEvaluator
15
- from _self_harm import SelfHarmEvaluator
16
- from _sexual import SexualEvaluator
17
- from _violence import ViolenceEvaluator
9
+ from ._hate_unfairness import HateUnfairnessEvaluator
10
+ from ._self_harm import SelfHarmEvaluator
11
+ from ._sexual import SexualEvaluator
12
+ from ._violence import ViolenceEvaluator
18
13
 
19
14
 
20
15
  class ContentSafetyEvaluator:
21
16
  """
22
17
  Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
23
18
 
19
+ :param credential: The credential for connecting to Azure AI project. Required
20
+ :type credential: ~azure.core.credentials.TokenCredential
24
21
  :param azure_ai_project: The scope of the Azure AI project.
25
22
  It contains subscription id, resource group, and project name.
26
23
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
27
24
  :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
28
25
  Default is True.
29
- :param credential: The credential for connecting to Azure AI project.
30
- :type credential: ~azure.core.credentials.TokenCredential
31
26
  :return: A function that evaluates content-safety metrics for "question-answering" scenario.
32
27
  :rtype: Callable
33
28
 
@@ -66,13 +61,13 @@ class ContentSafetyEvaluator:
66
61
  }
67
62
  """
68
63
 
69
- def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
64
+ def __init__(self, credential, azure_ai_project: dict, parallel: bool = True):
70
65
  self._parallel = parallel
71
- self._evaluators = [
72
- ViolenceEvaluator(azure_ai_project, credential),
73
- SexualEvaluator(azure_ai_project, credential),
74
- SelfHarmEvaluator(azure_ai_project, credential),
75
- HateUnfairnessEvaluator(azure_ai_project, credential),
66
+ self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
67
+ ViolenceEvaluator(credential, azure_ai_project),
68
+ SexualEvaluator(credential, azure_ai_project),
69
+ SelfHarmEvaluator(credential, azure_ai_project),
70
+ HateUnfairnessEvaluator(credential, azure_ai_project),
76
71
  ]
77
72
 
78
73
  def __call__(self, *, query: str, response: str, **kwargs):
@@ -86,9 +81,9 @@ class ContentSafetyEvaluator:
86
81
  :keyword parallel: Whether to evaluate in parallel.
87
82
  :paramtype parallel: bool
88
83
  :return: The scores for content-safety.
89
- :rtype: dict
84
+ :rtype: Dict[str, Union[str, float]]
90
85
  """
91
- results = {}
86
+ results: Dict[str, Union[str, float]] = {}
92
87
  if self._parallel:
93
88
  with ThreadPoolExecutor() as executor:
94
89
  futures = {
@@ -2,32 +2,36 @@
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
4
  import logging
5
+ import math
5
6
  from concurrent.futures import as_completed
6
- from typing import Dict, List
7
+ from typing import Callable, Dict, List, TypedDict, Union, cast
7
8
 
8
- import numpy as np
9
9
  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
10
10
 
11
+ from azure.ai.evaluation._common.constants import HarmSeverityLevel
12
+ from azure.ai.evaluation._common.math import list_mean_nan_safe
11
13
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
12
14
 
13
- try:
14
- from ._hate_unfairness import HateUnfairnessEvaluator
15
- from ._self_harm import SelfHarmEvaluator
16
- from ._sexual import SexualEvaluator
17
- from ._violence import ViolenceEvaluator
18
- except ImportError:
19
- from _hate_unfairness import HateUnfairnessEvaluator
20
- from _self_harm import SelfHarmEvaluator
21
- from _sexual import SexualEvaluator
22
- from _violence import ViolenceEvaluator
15
+ from ._hate_unfairness import HateUnfairnessEvaluator
16
+ from ._self_harm import SelfHarmEvaluator
17
+ from ._sexual import SexualEvaluator
18
+ from ._violence import ViolenceEvaluator
23
19
 
24
20
  logger = logging.getLogger(__name__)
25
21
 
26
22
 
23
+ class _EvaluationPerTurn(TypedDict):
24
+ severity: List[str]
25
+ score: List[float]
26
+ reason: List[str]
27
+
28
+
27
29
  class ContentSafetyChatEvaluator:
28
30
  """
29
31
  Initialize a content safety chat evaluator configured to evaluate content safetry metrics for chat scenario.
30
32
 
33
+ :param credential: The credential for connecting to Azure AI project. Required
34
+ :type credential: ~azure.core.credentials.TokenCredential
31
35
  :param azure_ai_project: The scope of the Azure AI project.
32
36
  It contains subscription id, resource group, and project name.
33
37
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
@@ -37,8 +41,6 @@ class ContentSafetyChatEvaluator:
37
41
  :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
38
42
  Default is True.
39
43
  :type parallel: bool
40
- :param credential: The credential for connecting to Azure AI project.
41
- :type credential: ~azure.core.credentials.TokenCredential
42
44
  :return: A function that evaluates and generates metrics for "chat" scenario.
43
45
  :rtype: Callable
44
46
 
@@ -87,24 +89,30 @@ class ContentSafetyChatEvaluator:
87
89
  }
88
90
  """
89
91
 
90
- def __init__(self, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True, credential=None):
92
+ def __init__(
93
+ self,
94
+ credential,
95
+ azure_ai_project: dict,
96
+ eval_last_turn: bool = False,
97
+ parallel: bool = True,
98
+ ):
91
99
  self._eval_last_turn = eval_last_turn
92
100
  self._parallel = parallel
93
- self._evaluators = [
101
+ self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
94
102
  ViolenceEvaluator(azure_ai_project, credential),
95
103
  SexualEvaluator(azure_ai_project, credential),
96
104
  SelfHarmEvaluator(azure_ai_project, credential),
97
105
  HateUnfairnessEvaluator(azure_ai_project, credential),
98
106
  ]
99
107
 
100
- def __call__(self, *, conversation, **kwargs):
108
+ def __call__(self, *, conversation: list, **kwargs):
101
109
  """
102
110
  Evaluates content-safety metrics for "chat" scenario.
103
111
 
104
112
  :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
105
113
  :paramtype conversation: List[Dict]
106
114
  :return: The scores for Chat scenario.
107
- :rtype: dict
115
+ :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]
108
116
  """
109
117
  self._validate_conversation(conversation)
110
118
 
@@ -141,7 +149,7 @@ class ContentSafetyChatEvaluator:
141
149
  }
142
150
 
143
151
  for future in as_completed(future_to_evaluator):
144
- result = future.result()
152
+ result: Dict[str, Union[str, float]] = future.result()
145
153
  current_turn_result.update(result)
146
154
  else:
147
155
  # Sequential execution
@@ -154,7 +162,13 @@ class ContentSafetyChatEvaluator:
154
162
  aggregated = self._aggregate_results(per_turn_results)
155
163
  return aggregated
156
164
 
157
- def _evaluate_turn(self, turn_num, queries, responses, evaluator):
165
+ def _evaluate_turn(
166
+ self,
167
+ turn_num: int,
168
+ queries: List[str],
169
+ responses: List[str],
170
+ evaluator: Callable[..., Dict[str, Union[str, float]]],
171
+ ) -> Dict[str, Union[str, float]]:
158
172
  try:
159
173
  query = queries[turn_num] if turn_num < len(queries) else ""
160
174
  response = responses[turn_num] if turn_num < len(responses) else ""
@@ -171,41 +185,48 @@ class ContentSafetyChatEvaluator:
171
185
  )
172
186
  return {}
173
187
 
174
- def _aggregate_results(self, per_turn_results: List[Dict]):
175
- scores = {}
176
- reasons = {}
177
- levels = {}
188
+ def _aggregate_results(
189
+ self, per_turn_results: List[Dict[str, Union[str, float]]]
190
+ ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]:
191
+ scores: Dict[str, List[float]] = {}
192
+ reasons: Dict[str, List[str]] = {}
193
+ levels: Dict[str, List[str]] = {}
178
194
 
179
195
  for turn in per_turn_results:
180
196
  for metric, value in turn.items():
181
197
  if "_score" in metric:
182
198
  if metric not in scores:
183
199
  scores[metric] = []
184
- scores[metric].append(value)
200
+ scores[metric].append(cast(float, value))
185
201
  elif "_reason" in metric:
186
202
  if metric not in reasons:
187
203
  reasons[metric] = []
188
- reasons[metric].append(value)
204
+ reasons[metric].append(cast(str, value))
189
205
  else:
190
206
  if metric not in levels:
191
207
  levels[metric] = []
192
- levels[metric].append(value)
208
+ levels[metric].append(cast(str, value))
193
209
 
194
- aggregated = {}
195
- evaluation_per_turn = {}
210
+ aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {}
211
+ evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {}
196
212
 
197
213
  for metric, values in levels.items():
198
214
  score_key = f"{metric}_score"
199
215
  reason_key = f"{metric}_reason"
200
216
 
201
- aggregated_score = np.nanmean(scores[score_key])
202
- aggregated[metric] = self._get_harm_severity_level(aggregated_score)
217
+ aggregated_score = list_mean_nan_safe(scores[score_key])
218
+ harm_severity_level = self._get_harm_severity_level(aggregated_score)
219
+ aggregated[metric] = (
220
+ harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level
221
+ )
203
222
  aggregated[score_key] = aggregated_score
204
223
 
205
224
  # Prepare per-turn evaluations
206
- evaluation_per_turn[metric] = {"severity": values}
207
- evaluation_per_turn[metric]["score"] = scores[score_key]
208
- evaluation_per_turn[metric]["reason"] = reasons[reason_key]
225
+ evaluation_per_turn[metric] = {
226
+ "severity": values,
227
+ "score": scores[score_key],
228
+ "reason": reasons[reason_key],
229
+ }
209
230
 
210
231
  aggregated["evaluation_per_turn"] = evaluation_per_turn
211
232
 
@@ -283,19 +304,19 @@ class ContentSafetyChatEvaluator:
283
304
  blame=ErrorBlame.USER_ERROR,
284
305
  )
285
306
 
286
- def _get_harm_severity_level(self, harm_score: float) -> str:
307
+ def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]:
287
308
  HARM_SEVERITY_LEVEL_MAPPING = {
288
- "Very low": [0, 1],
289
- "Low": [2, 3],
290
- "Medium": [4, 5],
291
- "High": [6, 7],
309
+ HarmSeverityLevel.VeryLow: (0, 1),
310
+ HarmSeverityLevel.Low: (2, 3),
311
+ HarmSeverityLevel.Medium: (4, 5),
312
+ HarmSeverityLevel.High: (6, 7),
292
313
  }
293
314
 
294
- if harm_score == np.nan or harm_score is None:
295
- return np.nan
315
+ if math.isnan(harm_score) or harm_score is None:
316
+ return math.nan
296
317
 
297
318
  for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
298
319
  if harm_score_range[0] <= harm_score <= harm_score_range[1]:
299
320
  return harm_level
300
321
 
301
- return np.nan
322
+ return math.nan
@@ -1,39 +1,21 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
- from promptflow._utils.async_utils import async_run_allowing_running_loop
4
+ from typing_extensions import override
5
5
 
6
6
  from azure.ai.evaluation._common.constants import EvaluationMetrics
7
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
7
8
 
8
- try:
9
- from ._content_safety_base import ContentSafetyEvaluatorBase
10
- except ImportError:
11
- from _content_safety_base import ContentSafetyEvaluatorBase
12
9
 
13
-
14
- class _AsyncHateUnfairnessEvaluator(ContentSafetyEvaluatorBase):
15
- def __init__(self, azure_ai_project: dict, credential=None):
16
- # Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great.
17
- # so invert the name.
18
- super().__init__(
19
- metric=EvaluationMetrics.HATE_FAIRNESS,
20
- azure_ai_project=azure_ai_project,
21
- credential=credential,
22
- )
23
-
24
- async def __call__(self, *, query: str, response: str, **kwargs):
25
- return await super().__call__(query=query, response=response, **kwargs)
26
-
27
-
28
- class HateUnfairnessEvaluator:
10
+ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
29
11
  """
30
12
  Initialize a hate-unfairness evaluator for hate unfairness score.
31
13
 
14
+ :param credential: The credential for connecting to Azure AI project. Required
15
+ :type credential: ~azure.core.credentials.TokenCredential
32
16
  :param azure_ai_project: The scope of the Azure AI project.
33
17
  It contains subscription id, resource group, and project name.
34
18
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
35
- :param credential: The credential for connecting to Azure AI project.
36
- :type credential: ~azure.core.credentials.TokenCredential
37
19
 
38
20
  **Usage**
39
21
 
@@ -58,21 +40,16 @@ class HateUnfairnessEvaluator:
58
40
  }
59
41
  """
60
42
 
61
- def __init__(self, azure_ai_project: dict, credential=None) -> None:
62
- self._async_evaluator = _AsyncHateUnfairnessEvaluator(azure_ai_project, credential)
63
-
64
- def __call__(self, *, query: str, response: str, **kwargs):
65
- """
66
- Evaluates hate unfairness content.
67
-
68
- :keyword query: The query to be evaluated.
69
- :paramtype query: str
70
- :keyword response: The response to be evaluated.
71
- :paramtype response: str
72
- :return: The hate unfairness score.
73
- :rtype: dict
74
- """
75
- return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
76
-
77
- def _to_async(self):
78
- return self._async_evaluator
43
+ @override
44
+ def __init__(
45
+ self,
46
+ credential,
47
+ azure_ai_project: dict,
48
+ eval_last_turn: bool = False,
49
+ ):
50
+ super().__init__(
51
+ eval_metric=EvaluationMetrics.HATE_FAIRNESS,
52
+ azure_ai_project=azure_ai_project,
53
+ credential=credential,
54
+ eval_last_turn=eval_last_turn,
55
+ )
@@ -1,37 +1,21 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
- from promptflow._utils.async_utils import async_run_allowing_running_loop
4
+ from typing_extensions import override
5
5
 
6
6
  from azure.ai.evaluation._common.constants import EvaluationMetrics
7
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
7
8
 
8
- try:
9
- from ._content_safety_base import ContentSafetyEvaluatorBase
10
- except ImportError:
11
- from _content_safety_base import ContentSafetyEvaluatorBase
12
9
 
13
-
14
- class _AsyncSelfHarmEvaluator(ContentSafetyEvaluatorBase):
15
- def __init__(self, azure_ai_project: dict, credential=None):
16
- super().__init__(
17
- metric=EvaluationMetrics.SELF_HARM,
18
- azure_ai_project=azure_ai_project,
19
- credential=credential,
20
- )
21
-
22
- async def __call__(self, *, query: str, response: str, **kwargs):
23
- return await super().__call__(query=query, response=response, **kwargs)
24
-
25
-
26
- class SelfHarmEvaluator:
10
+ class SelfHarmEvaluator(RaiServiceEvaluatorBase):
27
11
  """
28
12
  Initialize a self harm evaluator for self harm score.
29
13
 
14
+ :param credential: The credential for connecting to Azure AI project. Required
15
+ :type credential: ~azure.core.credentials.TokenCredential
30
16
  :param azure_ai_project: The scope of the Azure AI project.
31
17
  It contains subscription id, resource group, and project name.
32
18
  :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
33
- :param credential: The credential for connecting to Azure AI project.
34
- :type credential: ~azure.core.credentials.TokenCredential
35
19
 
36
20
  **Usage**
37
21
 
@@ -56,21 +40,16 @@ class SelfHarmEvaluator:
56
40
  }
57
41
  """
58
42
 
59
- def __init__(self, azure_ai_project: dict, credential=None):
60
- self._async_evaluator = _AsyncSelfHarmEvaluator(azure_ai_project, credential)
61
-
62
- def __call__(self, *, query: str, response: str, **kwargs):
63
- """
64
- Evaluates self harm content.
65
-
66
- :keyword query: The query to be evaluated.
67
- :paramtype query: str
68
- :keyword response: The response to be evaluated.
69
- :paramtype response: str
70
- :return: The self harm score.
71
- :rtype: dict
72
- """
73
- return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
74
-
75
- def _to_async(self):
76
- return self._async_evaluator
43
+ @override
44
+ def __init__(
45
+ self,
46
+ credential,
47
+ azure_ai_project: dict,
48
+ eval_last_turn: bool = False,
49
+ ):
50
+ super().__init__(
51
+ eval_metric=EvaluationMetrics.SELF_HARM,
52
+ azure_ai_project=azure_ai_project,
53
+ credential=credential,
54
+ eval_last_turn=eval_last_turn,
55
+ )