azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation was flagged as possibly problematic by the registry.

Files changed (76)
  1. azure/ai/evaluation/__init__.py +4 -4
  2. azure/ai/evaluation/_common/rai_service.py +4 -4
  3. azure/ai/evaluation/_common/utils.py +40 -25
  4. azure/ai/evaluation/_constants.py +13 -0
  5. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
  6. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
  7. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
  8. azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
  9. azure/ai/evaluation/_evaluate/_evaluate.py +88 -63
  10. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
  11. azure/ai/evaluation/_evaluate/_utils.py +29 -22
  12. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  13. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -86
  14. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  15. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  16. azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  23. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  24. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -79
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -85
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
  34. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  35. azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
  36. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -88
  37. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  38. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +17 -29
  40. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  41. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
  42. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -18
  43. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  44. azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -91
  45. azure/ai/evaluation/_exceptions.py +0 -1
  46. azure/ai/evaluation/_http_utils.py +3 -3
  47. azure/ai/evaluation/_model_configurations.py +36 -8
  48. azure/ai/evaluation/_version.py +1 -1
  49. azure/ai/evaluation/simulator/__init__.py +1 -1
  50. azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
  51. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  52. azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
  53. azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
  54. azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
  55. azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
  56. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
  57. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
  58. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
  59. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
  60. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  61. azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
  62. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  63. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  64. azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +166 -88
  65. azure/ai/evaluation/simulator/_tracing.py +21 -24
  66. azure/ai/evaluation/simulator/_utils.py +4 -1
  67. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +144 -14
  68. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +98 -0
  69. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -350
  70. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  71. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -66
  72. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  73. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  74. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +0 -97
  75. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
  76. {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -1,82 +1,14 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
-
  import os
- import re
- from typing import Union
-
- import numpy as np
-
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
- from promptflow.core import AsyncPrompty
-
- from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
- from ..._common.utils import (
-     check_and_add_api_version_for_aoai_model_config,
-     check_and_add_user_agent_for_aoai_model_config,
- )
-
- try:
-     from ..._user_agent import USER_AGENT
- except ImportError:
-     USER_AGENT = None
-
-
- class _AsyncCoherenceEvaluator:
-     # Constants must be defined within eval's directory to be save/loadable
-     PROMPTY_FILE = "coherence.prompty"
-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-     def __init__(self, model_config: dict):
-         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+ from typing import Optional
+ from typing_extensions import override

-         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-         # https://github.com/encode/httpx/discussions/2959
-         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-         check_and_add_user_agent_for_aoai_model_config(
-             model_config,
-             prompty_model_config,
-             USER_AGENT,
-         )
-
-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

-     async def __call__(self, *, query: str, response: str, **kwargs):
-         # Validate input parameters
-         query = str(query or "")
-         response = str(response or "")
-
-         if not (query.strip() and response.strip()):
-             msg = "Both 'query' and 'response' must be non-empty strings."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 error_category=ErrorCategory.INVALID_VALUE,
-                 error_blame=ErrorBlame.USER_ERROR,
-                 error_target=ErrorTarget.COHERENCE_EVALUATOR,
-             )
-
-         # Run the evaluation flow
-         llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-         score = np.nan
-         if llm_output:
-             match = re.search(r"\d", llm_output)
-             if match:
-                 score = float(match.group())
-
-         return {"gpt_coherence": float(score)}
-
-
- class CoherenceEvaluator:
+ class CoherenceEvaluator(PromptyEvaluatorBase):
      """
      Initialize a coherence evaluator configured for a specific Azure OpenAI model.

@@ -102,21 +34,37 @@ class CoherenceEvaluator:
          }
      """

-     def __init__(self, model_config: dict):
-         self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
+     PROMPTY_FILE = "coherence.prompty"
+     RESULT_KEY = "gpt_coherence"

-     def __call__(self, *, query: str, response: str, **kwargs):
-         """
-         Evaluate coherence.
+     @override
+     def __init__(self, model_config: dict):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+     @override
+     def __call__(
+         self,
+         *,
+         query: Optional[str] = None,
+         response: Optional[str] = None,
+         conversation: Optional[dict] = None,
+         **kwargs
+     ):
+         """Evaluate coherence. Accepts either a query and response for a single evaluation,
+         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+         turns, the evaluator will aggregate the results of each turn.

-         :keyword query: The query to be evaluated.
-         :paramtype query: str
          :keyword response: The response to be evaluated.
-         :paramtype response: str
-         :return: The coherence score.
-         :rtype: Dict[str, float]
+         :paramtype response: Optional[str]
+         :keyword context: The context to be evaluated.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages". Conversation turns are expected
+             to be dictionaries with keys "content" and "role".
+         :paramtype conversation: Optional[Dict]
+         :return: The relevance score.
+         :rtype: dict
          """
-         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-     def _to_async(self):
-         return self._async_evaluator
+         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
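
Editor's note: as a usage sketch (not part of the diff), the reworked CoherenceEvaluator can now be driven either with a single query/response pair or with a conversation, per the docstring above. This assumes the class remains exported from the package top level; the endpoint, deployment, and key values are placeholders.

# Illustrative sketch only; configuration values are placeholders.
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
    "api_key": "<your-api-key>",                                   # placeholder
}

coherence = CoherenceEvaluator(model_config)

# Single-turn evaluation: pass a query and a response.
single = coherence(query="What is the capital of France?", response="Paris is the capital of France.")
# Expected shape, per the prompty score parsing: {"gpt_coherence": <float>}

# Multi-turn evaluation: pass a conversation with "messages" instead.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
        {"role": "user", "content": "And of Italy?"},
        {"role": "assistant", "content": "Rome."},
    ]
}
multi = coherence(conversation=conversation)
# Numeric metrics are averaged across turns, with per-turn values under "evaluation_per_turn".
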
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty
@@ -3,11 +3,6 @@ name: Coherence
  description: Evaluates coherence score for QA scenario
  model:
    api: chat
-   configuration:
-     type: azure_openai
-     azure_deployment: ${env:AZURE_DEPLOYMENT}
-     api_key: ${env:AZURE_OPENAI_API_KEY}
-     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
      max_tokens: 1
azure/ai/evaluation/_evaluators/_common/__init__.py (new file)
@@ -0,0 +1,13 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._base_eval import EvaluatorBase
+ from ._base_prompty_eval import PromptyEvaluatorBase
+ from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+ __all__ = [
+     "EvaluatorBase",
+     "PromptyEvaluatorBase",
+     "RaiServiceEvaluatorBase",
+ ]
azure/ai/evaluation/_evaluators/_common/_base_eval.py (new file)
@@ -0,0 +1,302 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from typing import List, Dict, Callable, Any
+ import inspect
+
+ from abc import ABC
+
+ import numpy as np
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+
+ # TODO exception target pass down?
+ class EvaluatorBase(ABC):
+     """Base class for all evaluators that are capable of accepting either a group of single values,
+     or conversation as input. All such evaluators need to implement two functions of their own:
+     - _convert_conversation_to_eval_input
+     - _do_eval
+
+     Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
+     documentation, although ideally the actual child implementation of __call__ should just amount to
+     'super().__init__()'.
+
+
+     :param not_singleton_inputs: A list of strings that represent the names of
+         inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
+         is ["conversation", "kwargs"].
+     :type not_singleton_inputs: List[str]
+     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
+     :type eval_last_turn: bool
+     """
+
+     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+     # Make sure to call super().__init__() in the child class's __init__ method.
+     # pylint: disable=dangerous-default-value
+     def __init__(
+         self,
+         *,
+         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
+         eval_last_turn: bool = False,
+     ):
+         self._not_singleton_inputs = not_singleton_inputs
+         self._eval_last_turn = eval_last_turn
+         self._singleton_inputs = self._derive_singleton_inputs()
+         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+
+     # This needs to be overridden just to change the function header into something more informative,
+     # and to be able to add a more specific docstring. The actual function contents should just be
+     # super().__call__(<inputs>)
+     def __call__(self, **kwargs) -> Dict:
+         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
+         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
+         The actual behavior of this function shouldn't change beyond adding more inputs to the
+         async_run_allowing_running_loop call.
+
+         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
+         :type kwargs: Dict
+         :return: The evaluation result
+         :rtype: Dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
+
+     # Probably the only thing that can't be simplified. Each evaluator, or at least each family
+     # of evaluators, will need to implement their own version of this function.
+     async def _do_eval(self, eval_input: Any) -> Dict:
+         """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
+         In the default case, all required inputs are assumed to be within eval_input, as user-friendly
+         typing is handled above this function in favor of polymorphic simplicity. This function must be
+         asynchronous.
+
+         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
+         :type eval_input: Any
+         :return: A single evaluation result
+         :rtype: Dict
+
+         """
+         raise EvaluationException(
+             message="Not implemented",
+             internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
+         )
+
+     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+     def _derive_singleton_inputs(self) -> List[str]:
+         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
+         when the evaluator is being used in a non-conversation context.
+         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
+         Thankfully this works the way you'd hope, with the call_signature being based on the child
+         function's signature, not the parent's.
+
+         :return: A list of strings representing the names of singleton inputs.
+         :rtype: List[str]
+         """
+
+         call_signature = inspect.signature(self.__call__)
+         singletons = []
+         for param in call_signature.parameters:
+             if param not in self._not_singleton_inputs:
+                 singletons.append(param)
+         return singletons
+
+     def _derive_conversation_converter(self) -> Callable:
+         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
+         This uses the inputs derived from the _derive_singleton_inputs function to determine which
+         aspects of a conversation ought to be extracted.
+
+         :return: The function that will be used to convert conversations to evaluable inputs.
+         :rtype: Callable
+         """
+         include_context = "context" in self._singleton_inputs
+         include_query = "query" in self._singleton_inputs
+         include_response = "response" in self._singleton_inputs
+
+         def converter(conversation: Dict) -> List:
+             messages = conversation["messages"]
+             global_context = conversation.get("context", None)
+             # Extract queries, responses from conversation
+             queries = []
+             responses = []
+
+             # Convert conversation slice into queries and responses.
+             # Assume that 'user' role is asking queries and 'assistant' role is responding.
+             if self._eval_last_turn and len(messages) > 1:
+                 messages = messages[-2:]
+
+             for each_turn in messages:
+                 role = each_turn["role"]
+                 if role == "user":
+                     queries.append(each_turn)
+                 elif role == "assistant":
+                     responses.append(each_turn)
+             # TODO complain if len(queries) != len(responses)?
+             eval_inputs = []
+             for query, response in zip(queries, responses):
+                 context = {}
+                 if include_context:
+                     query_context = query.get("context", None)
+                     response_context = response.get("context", None)
+                     if global_context:
+                         context["global_context"] = global_context
+                     if query_context and not include_query:
+                         context["query_context"] = query_context
+                     if response_context and not include_response:
+                         context["response_context"] = response_context
+
+                 eval_input = {}
+                 if include_query:
+                     eval_input["query"] = query
+                 if include_response:
+                     eval_input["response"] = response
+                 if include_context:
+                     eval_input["context"] = str(context)
+                 eval_inputs.append(eval_input)
+             return eval_inputs
+
+         return converter
+
+     def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+         """Convert an arbitrary input into a list of inputs for evaluators.
+         It is assumed that evaluators generally make use of their inputs in one of two ways.
+         Either they receive a collection of keyname inputs that are all single values
+         (like a query and response), or they receive conversation that iss a list of dictionary
+         values.
+
+         The self._singleton_inputs list assigned during initialization is used to find and extract
+         singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+         is a valid input.
+
+         If both conversations and singletons are allowed, the function will raise an exception if both
+         are inputted.
+
+         This function must be overridden by child classes IF they need to both a conversation and
+         other inputs to be passed in.
+
+         :keyword kwargs: The inputs to convert.
+         :type kwargs: Dict
+         :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+         :rtype: List
+         """
+
+         # Collect inputs
+         conversation = kwargs.get("conversation", None)
+         singletons = {}
+         if len(self._singleton_inputs) > 0:
+             singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+         # Check that both conversation and other inputs aren't set
+         if conversation is not None and any(singletons.values()):
+             raise EvaluationException(
+                 message="Invalid input",
+                 internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 target=ErrorTarget.CONVERSATION,
+             )
+         # Handle Conversation
+         if conversation is not None:
+             return self._derive_conversation_converter()(conversation)
+         # Handle Singletons
+         if all(value is not None for value in singletons.values()):
+             return [singletons]  # TODO loosen requirements to allow for optional singletons?
+         # Missing input
+         raise EvaluationException(
+             message="Missing input",
+             internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+             blame=ErrorBlame.USER_ERROR,
+             category=ErrorCategory.INVALID_VALUE,
+             target=ErrorTarget.CONVERSATION,
+         )
+
+     def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict:
+         """Aggregate the evaluation results of each conversation turn into a single result.
+
+         Exact implementation might need to vary slightly depending on the results produced.
+         Default behavior is to average the all number-based outputs.
+
+         :param per_turn_results: List of evaluation results for each turn in the conversation.
+         :type per_turn_results: List[Dict]
+         :return: A dictionary containing aggregated results, with numeric metrics having their
+             means as top-level values in the dictionary, and all original
+             values (including non-numerics) located in under the "evaluation_per_turn" key,
+             which each sub-key being a metric and each sub-value being a the list of that metric's
+             per-turn values.
+         :rtype: Dict
+         """
+
+         aggregated = {}
+         evaluation_per_turn = {}
+
+         # Go over each turn, and rotate the results into a
+         # metric: List[values] format for the evals_per_turn dictionary.
+         for turn in per_turn_results:
+             for metric, value in turn.items():
+                 if metric not in evaluation_per_turn:
+                     evaluation_per_turn[metric] = []
+                 evaluation_per_turn[metric].append(value)
+
+         # Find and average all numeric values
+         for metric, values in evaluation_per_turn.items():
+             if all(isinstance(value, (int, float)) for value in values):
+                 aggregated[metric] = np.mean(values)
+         # Slap the per-turn results back in.
+         aggregated["evaluation_per_turn"] = evaluation_per_turn
+
+         return aggregated
+
+     async def _real_call(self, **kwargs):
+         """The asynchronous call where real end-to-end evaluation logic is performed.
+
+         :keyword kwargs: The inputs to evaluate.
+         :type kwargs: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # Convert inputs into list of evaluable inputs.
+         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+         per_turn_results = []
+         # Evaluate all inputs.
+         for eval_input in eval_input_list:
+             per_turn_results.append(await self._do_eval(eval_input))
+         # Return results as-is if only one result was produced.
+
+         if len(per_turn_results) == 1:
+             return per_turn_results[0]
+         if len(per_turn_results) == 0:
+             return {}  # TODO raise something?
+         # Otherwise, aggregate results.
+         return self._aggregate_results(per_turn_results=per_turn_results)
+
+     # ~~~ METHODS THAT SHOULD NEVER BE OVERRIDDEN BY CHILDREN~~~
+
+     def _to_async(self):
+         return self._async_evaluator
+
+
+ class AsyncEvaluatorBase:
+     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
+     to ensure that no one ever needs to extend or otherwise modify this class directly.
+     """
+
+     def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
+         self._real_call = real_call
+
+     # Don't look at my shame. Nothing to see here....
+     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
+     # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+     # are just not passed into this function instead of ending up in kwargs.
+     # Since we want this to be relatively call-agnostic, we just account for every input that any children
+     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
+     async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+         if conversation is not None:
+             kwargs["conversation"] = conversation
+         if query is not None:
+             kwargs["query"] = query
+         if response is not None:
+             kwargs["response"] = response
+         if context is not None:
+             kwargs["context"] = context
+         return await self._real_call(**kwargs)
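
Editor's note: to illustrate the subclassing pattern the docstrings above describe, here is a hypothetical minimal child evaluator (not part of the package). The class name and metric are made up for illustration; only the override points come from the base class above.

# Hypothetical example of the EvaluatorBase subclassing pattern; not part of the package.
from typing import Dict, Optional

from azure.ai.evaluation._evaluators._common import EvaluatorBase


class ResponseLengthEvaluator(EvaluatorBase):
    """Toy evaluator that scores the length of each response."""

    def __init__(self):
        super().__init__()  # singleton inputs are derived from __call__'s signature

    async def _do_eval(self, eval_input: Dict) -> Dict:
        # eval_input holds the singleton inputs for one turn. In conversation mode the
        # converter passes the whole turn dict, so unwrap "content" when present.
        response = eval_input["response"]
        text = response["content"] if isinstance(response, dict) else response
        return {"response_length": float(len(text))}

    def __call__(self, *, response: Optional[str] = None, conversation: Optional[dict] = None, **kwargs):
        # The override only reshapes the signature; the base class routes the inputs.
        return super().__call__(response=response, conversation=conversation, **kwargs)

With a multi-turn conversation, the base class would average the numeric "response_length" values and keep the per-turn list under "evaluation_per_turn", per _aggregate_results above.
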
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py (new file)
@@ -0,0 +1,79 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import re
+ from typing import Dict
+
+ from typing_extensions import override
+
+
+ import numpy as np
+
+ from promptflow.core import AsyncPrompty
+
+ from ..._common.utils import construct_prompty_model_config
+
+ try:
+     from ..._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = None
+ from . import EvaluatorBase
+
+
+ class PromptyEvaluatorBase(EvaluatorBase):
+     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
+     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
+     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
+     per-turn results are stored in a list under the key "evaluation_per_turn").
+
+     :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
+         a dictionary in the format {result_key: float}.
+     :type result_key: str
+     :param prompty_file: The path to the prompty file to use for evaluation.
+     :type prompty_file: str
+     :param model_config: The model configuration to use for evaluation.
+     :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
+         Useful since some evaluators of this format are response-only.
+     :type ignore_queries: bool
+     """
+
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, *, result_key: str, prompty_file: str, model_config: Dict, eval_last_turn: bool = False):
+         self._result_key = result_key
+         self._prompty_file = prompty_file
+         super().__init__(eval_last_turn=eval_last_turn)
+
+         prompty_model_config = construct_prompty_model_config(
+             model_config,
+             self.DEFAULT_OPEN_API_VERSION,
+             USER_AGENT,
+         )
+
+         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+
+     # __call__ not overridden here because child classes have such varied signatures that there's no point
+     # defining a default here.
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict:
+         """Do a relevance evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain
+             whatever inputs are needed for the _flow method, including context
+             and other fields depending on the child class.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
+
+         score = np.nan
+         if llm_output:
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+         return {self._result_key: float(score)}
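
Editor's note on the score parsing above: the prompty flow's reply is reduced to the first digit found in the text, and a missing match yields NaN. A small standalone sketch of that behavior:

# Sketch of the score-parsing behavior used in _do_eval above.
import re

import numpy as np


def parse_score(llm_output: str) -> float:
    score = np.nan
    if llm_output:
        match = re.search(r"\d", llm_output)  # first digit in the reply
        if match:
            score = float(match.group())
    return float(score)


print(parse_score("4"))         # 4.0
print(parse_score("Score: 4"))  # 4.0
print(parse_score(""))          # nan
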
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py (new file)
@@ -0,0 +1,99 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from typing import Dict, Optional
+ from typing_extensions import override
+
+ from azure.identity import DefaultAzureCredential
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import EvaluationException
+ from . import EvaluatorBase
+
+
+ class RaiServiceEvaluatorBase(EvaluatorBase):
+     """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
+     This includes content safety evaluators, protected material evaluators, and others. These evaluators
+     are all assumed to be of the "query and response or conversation" input variety.
+
+     param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+         to specify which evaluation to perform.
+     type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+     param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+         aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
+         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
+         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
+     type eval_last_turn: bool
+     """
+
+     @override
+     def __init__(
+         self,
+         eval_metric: EvaluationMetrics,
+         azure_ai_project: dict,
+         credential: Optional[dict] = None,
+         eval_last_turn: bool = False,
+     ):
+         super().__init__(eval_last_turn=eval_last_turn)
+         self._eval_metric = eval_metric
+         self._azure_ai_project = azure_ai_project
+         if credential is None:
+             # Use DefaultCredential if no credential is provided
+             self._credential = DefaultAzureCredential()
+         else:
+             self._credential = credential
+
+     @override
+     def __call__(
+         self,
+         *,
+         query: Optional[str] = None,
+         response: Optional[str] = None,
+         conversation: Optional[dict] = None,
+         **kwargs
+     ):
+         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
+         or a conversation, but not both.
+
+         :keyword query: The query to evaluate.
+         :paramtype query: Optional[str]
+         :keyword response: The response to evaluate.
+         :paramtype response: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[Dict]
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict):
+         """Perform the evaluation using the Azure AI RAI service.
+         The exact evaluation performed is determined by the evaluation metric supplied
+         by the child class initializer.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         query = eval_input.get("query", None)
+         response = eval_input.get("response", None)
+         if query is None or response is None:
+             raise EvaluationException(
+                 message="Not implemented",
+                 internal_message=(
+                     "Reached query/response evaluation without supplying query or response."
+                     + " This should have failed earlier."
+                 ),
+             )
+         return await evaluate_with_rai_service(
+             metric_name=self._eval_metric,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
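
Editor's note: a concrete RAI-service evaluator is expected to reduce to little more than a metric choice plus a reshaped __call__. The following is a hypothetical sketch of that pattern, not the shipped code; the real implementations in this release live under azure/ai/evaluation/_evaluators/_content_safety/ (see the files list above), and the EvaluationMetrics member name used here is an assumption.

# Hypothetical sketch of a RaiServiceEvaluatorBase subclass; not the package's actual implementation.
from typing import Optional

from typing_extensions import override

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


class ExampleViolenceEvaluator(RaiServiceEvaluatorBase):
    @override
    def __init__(self, azure_ai_project: dict, credential=None):
        super().__init__(
            eval_metric=EvaluationMetrics.VIOLENCE,  # assumed member name; the actual constant may differ
            azure_ai_project=azure_ai_project,
            credential=credential,
        )

    @override
    def __call__(self, *, query: Optional[str] = None, response: Optional[str] = None,
                 conversation: Optional[dict] = None, **kwargs):
        # Reshape the signature for documentation; the base class handles routing and the RAI service call.
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
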
azure/ai/evaluation/_evaluators/_content_safety/__init__.py
@@ -3,8 +3,6 @@
  # ---------------------------------------------------------

  from ._content_safety import ContentSafetyEvaluator
- from ._content_safety_base import ContentSafetyEvaluatorBase
- from ._content_safety_chat import ContentSafetyChatEvaluator
  from ._hate_unfairness import HateUnfairnessEvaluator
  from ._self_harm import SelfHarmEvaluator
  from ._sexual import SexualEvaluator
@@ -16,6 +14,4 @@ __all__ = [
      "SelfHarmEvaluator",
      "HateUnfairnessEvaluator",
      "ContentSafetyEvaluator",
-     "ContentSafetyChatEvaluator",
-     "ContentSafetyEvaluatorBase",
  ]
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -5,8 +5,6 @@ from concurrent.futures import as_completed

  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

- from azure.ai.evaluation._model_configurations import AzureAIProject
-
  try:
      from ._hate_unfairness import HateUnfairnessEvaluator
      from ._self_harm import SelfHarmEvaluator