azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic.

Files changed (100):
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
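
For orientation, the sketch below is a minimal, hypothetical quick-start against the evaluator surface added in 1.0.0b1. The import path assumes ChatEvaluator is re-exported from the package root (the azure/ai/evaluation/__init__.py changes are not shown in this section), and the model configuration keys follow the AzureOpenAIModelConfiguration type referenced in the diffs below; endpoint, deployment, and key values are placeholders.

    # Hypothetical quick-start; the import path and config keys are assumptions,
    # not confirmed by the portion of the diff shown here.
    from azure.ai.evaluation import ChatEvaluator  # assumed re-export from the package root

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<chat-model-deployment>",                 # placeholder
        "api_key": "<api-key>",                                        # placeholder
    }

    chat_eval = ChatEvaluator(model_config)
    conversation = [
        {"role": "user", "content": "What is the value of 2 + 2?"},
        {
            "role": "assistant",
            "content": "2 + 2 = 4",
            "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},
        },
    ]
    result = chat_eval(conversation=conversation)
    print(result["gpt_coherence"], result["evaluation_per_turn"])
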
azure/ai/evaluation/_evaluators/_chat/_chat.py
@@ -0,0 +1,350 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import json
+ import logging
+ from concurrent.futures import as_completed
+ from typing import Dict, List, Union
+
+ import numpy as np
+
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ from .._coherence import CoherenceEvaluator
+ from .._fluency import FluencyEvaluator
+ from .._groundedness import GroundednessEvaluator
+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from .._relevance import RelevanceEvaluator
+ from .retrieval import RetrievalChatEvaluator
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChatEvaluator:
+     """
+     Initialize a chat evaluator configured for a specific Azure OpenAI model.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
+         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
+     :type eval_last_turn: bool
+     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
+         Default is True.
+     :type parallel: bool
+     :return: A function that evaluates and generates metrics for "chat" scenario.
+     :rtype: Callable
+
+     **Usage**
+
+     .. code-block:: python
+
+         chat_eval = ChatEvaluator(model_config)
+         conversation = [
+             {"role": "user", "content": "What is the value of 2 + 2?"},
+             {"role": "assistant", "content": "2 + 2 = 4", "context": {
+                 "citations": [
+                     {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
+                 ]
+             }
+             }
+         ]
+         result = chat_eval(conversation=conversation)
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "evaluation_per_turn": {
+                 "gpt_retrieval": [1.0, 2.0],
+                 "gpt_groundedness": [5.0, 2.0],
+                 "gpt_relevance": [3.0, 5.0],
+                 "gpt_coherence": [1.0, 2.0],
+                 "gpt_fluency": [3.0, 5.0]
+             }
+             "gpt_retrieval": 1.5,
+             "gpt_groundedness": 3.5,
+             "gpt_relevance": 4.0,
+             "gpt_coherence": 1.5,
+             "gpt_fluency": 4.0
+         }
+     """
+
+     def __init__(
+         self,
+         model_config: dict,
+         eval_last_turn: bool = False,
+         parallel: bool = True,
+     ):
+         self._eval_last_turn = eval_last_turn
+         self._parallel = parallel
+
+         # TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
+         self._rag_evaluators = [
+             GroundednessEvaluator(model_config),
+             RelevanceEvaluator(model_config),
+         ]
+         self._non_rag_evaluators = [
+             CoherenceEvaluator(model_config),
+             FluencyEvaluator(model_config),
+         ]
+         # TODO: Temporary workaround to close the gap of missing retrieval score
+         # https://msdata.visualstudio.com/Vienna/_workitems/edit/3186644
+         # For long term, we need to add a built-in evaluator for retrieval after prompt is generalized for QA and Chat
+         self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)
+
+     def __call__(self, *, conversation, **kwargs):
+         """
+         Evaluates chat scenario.
+
+         :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
+             "context" key is optional for assistant's turn and should have "citations" key with list of citations.
+         :paramtype conversation: List[Dict]
+         :return: The scores for Chat scenario.
+         :rtype: dict
+         """
+         self._validate_conversation(conversation)
+
+         # Extract queries, responses and contexts from conversation
+         queries = []
+         responses = []
+         contexts = []
+
+         if self._eval_last_turn:
+             # Process only the last two turns if _eval_last_turn is True
+             conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
+         else:
+             conversation_slice = conversation
+
+         for each_turn in conversation_slice:
+             role = each_turn["role"]
+             if role == "user":
+                 queries.append(each_turn["content"])
+             elif role == "assistant":
+                 responses.append(each_turn["content"])
+                 if "context" in each_turn and "citations" in each_turn["context"]:
+                     citations = json.dumps(each_turn["context"]["citations"])
+                     contexts.append(citations)
+
+         # Select evaluators to be used for evaluation
+         compute_rag_based_metrics = True
+         if len(responses) != len(contexts):
+             safe_message = (
+                 "Skipping rag based metrics as we need citations or "
+                 "retrieved_documents in context key of every assistant's turn"
+             )
+             logger.warning(safe_message)
+             compute_rag_based_metrics = False
+
+         selected_evaluators = []
+         selected_evaluators.extend(self._non_rag_evaluators)
+         if compute_rag_based_metrics:
+             selected_evaluators.extend(self._rag_evaluators)
+
+         # Evaluate each turn
+         per_turn_results = []
+         for turn_num in range(len(queries)):
+             current_turn_result = {}
+
+             if self._parallel:
+                 # Parallel execution
+                 with ThreadPoolExecutor() as executor:
+                     future_to_evaluator = {
+                         executor.submit(
+                             self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
+                         ): evaluator
+                         for evaluator in selected_evaluators
+                     }
+
+                     for future in as_completed(future_to_evaluator):
+                         result = future.result()
+                         current_turn_result.update(result)
+             else:
+                 # Sequential execution
+                 for evaluator in selected_evaluators:
+                     async_evaluator = evaluator._to_async()
+                     result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
+                     current_turn_result.update(result)
+
+             per_turn_results.append(current_turn_result)
+
+         # Aggregate results
+         # Final aggregated results for a conversation will look like:
+         # "gpt_groundedness": 2.0, # Mean of all groundedness scores
+         # "evaluation_per_turn": {
+         #     "gpt_groundedness": {
+         #         "score": [1.0, ...],
+         #         "reason": ["reason1", ...],
+         #     },
+         # },
+         # }
+         aggregated = self._aggregate_results(per_turn_results)
+
+         # Run RetrievalChatEvaluator and merge the results
+         if compute_rag_based_metrics:
+             retrieval_score = self._retrieval_chat_evaluator(conversation=conversation_slice)
+             aggregated["gpt_retrieval"] = retrieval_score["gpt_retrieval"]
+             aggregated["evaluation_per_turn"]["gpt_retrieval"] = retrieval_score["evaluation_per_turn"]["gpt_retrieval"]
+             aggregated = dict(sorted(aggregated.items()))
+
+         return aggregated
+
+     def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
+         try:
+             query = queries[turn_num] if turn_num < len(queries) else ""
+             response = responses[turn_num] if turn_num < len(responses) else ""
+             context = contexts[turn_num] if turn_num < len(contexts) else ""
+
+             score = evaluator(query=query, response=response, context=context)
+
+             return score
+         except Exception as e: # pylint: disable=broad-exception-caught
+             logger.warning(
+                 f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
+             )
+             return {}
+
+     def _aggregate_results(self, per_turn_results: List[Dict]):
+         scores = {}
+         reasons = {}
+
+         for turn in per_turn_results:
+             for metric, value in turn.items():
+                 if "reason" in metric:
+                     if metric not in reasons:
+                         reasons[metric] = []
+                     reasons[metric].append(value)
+                 else:
+                     if metric not in scores:
+                         scores[metric] = []
+                     scores[metric].append(value)
+
+         aggregated = {}
+         evaluation_per_turn = {}
+
+         for metric, values in scores.items():
+             aggregated[metric] = np.nanmean(values)
+
+             # Prepare per-turn evaluations
+             evaluation_per_turn[metric] = {"score": values}
+             reason_key = f"{metric}_reason"
+             if reason_key in reasons:
+                 evaluation_per_turn[metric]["reason"] = reasons[reason_key]
+
+         aggregated["evaluation_per_turn"] = evaluation_per_turn
+
+         return aggregated
+
+     def _validate_conversation(self, conversation: List[Dict]):
+         if conversation is None or not isinstance(conversation, list):
+             msg = "conversation must be a list of dictionaries"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.CHAT_EVALUATOR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         expected_role = "user"
+         for turn_num, turn in enumerate(conversation):
+             one_based_turn_num = turn_num + 1
+
+             if not isinstance(turn, dict):
+                 msg = f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}"
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.CHAT_EVALUATOR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+
+             if "role" not in turn or "content" not in turn:
+                 msg = f"Each turn in 'conversation' must have 'role' and 'content' keys. Turn number: {one_based_turn_num}"
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.CHAT_EVALUATOR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+
+             if turn["role"] != expected_role:
+                 msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.CHAT_EVALUATOR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+
+             if not isinstance(turn["content"], str):
+                 msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}"
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.CHAT_EVALUATOR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+
+             if turn["role"] == "assistant" and "context" in turn:
+                 if not isinstance(turn["context"], dict):
+                     msg = f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}"
+                     raise EvaluationException(
+                         message=msg,
+                         internal_message=msg,
+                         target=ErrorTarget.CHAT_EVALUATOR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         blame=ErrorBlame.USER_ERROR,
+                     )
+
+                 if "citations" not in turn["context"]:
+                     msg = f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
+                     raise EvaluationException(
+                         message=msg,
+                         internal_message=msg,
+                         target=ErrorTarget.CHAT_EVALUATOR,
+                         category=ErrorCategory.MISSING_FIELD,
+                         blame=ErrorBlame.USER_ERROR,
+                     )
+
+                 if not isinstance(turn["context"]["citations"], list):
+                     msg = f"'citations' in context must be a list. Turn number: {one_based_turn_num}"
+                     raise EvaluationException(
+                         message=msg,
+                         internal_message=msg,
+                         target=ErrorTarget.CHAT_EVALUATOR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         blame=ErrorBlame.USER_ERROR,
+                     )
+
+                 for citation_num, citation in enumerate(turn["context"]["citations"]):
+                     if not isinstance(citation, dict):
+                         msg = f"Each citation in 'citations' must be a dictionary. Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
+                         raise EvaluationException(
+                             message=msg,
+                             internal_message=msg,
+                             target=ErrorTarget.CHAT_EVALUATOR,
+                             category=ErrorCategory.INVALID_VALUE,
+                             blame=ErrorBlame.USER_ERROR,
+                         )
+
+             # Toggle expected role for the next turn
+             expected_role = "user" if expected_role == "assistant" else "assistant"
+
+         # Ensure the conversation ends with an assistant's turn
+         if expected_role != "user":
+             msg = "The conversation must end with an assistant's turn."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.CHAT_EVALUATOR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
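
To make the aggregation step above concrete, here is a small self-contained sketch (toy data, plain Python plus numpy) that mirrors what _aggregate_results does: per-metric scores are averaged with np.nanmean, while the raw per-turn values and any *_reason entries are kept under evaluation_per_turn.

    import numpy as np

    # Made-up per-turn results in the shape produced by the per-turn evaluator calls above.
    per_turn_results = [
        {"gpt_coherence": 1.0, "gpt_groundedness": 5.0, "gpt_groundedness_reason": "fully grounded"},
        {"gpt_coherence": 2.0, "gpt_groundedness": np.nan, "gpt_groundedness_reason": "evaluator failed"},
    ]

    scores, reasons = {}, {}
    for turn in per_turn_results:
        for metric, value in turn.items():
            bucket = reasons if "reason" in metric else scores
            bucket.setdefault(metric, []).append(value)

    aggregated = {metric: np.nanmean(values) for metric, values in scores.items()}
    aggregated["evaluation_per_turn"] = {metric: {"score": values} for metric, values in scores.items()}
    for reason_key, values in reasons.items():
        aggregated["evaluation_per_turn"][reason_key.replace("_reason", "")]["reason"] = values

    # aggregated["gpt_coherence"] == 1.5, and the NaN groundedness score is ignored by nanmean.

In the released code, the conversation-level gpt_retrieval score from RetrievalChatEvaluator is then merged into this aggregate, as the __call__ body above shows.
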
azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._retrieval import RetrievalChatEvaluator
+
+ __all__ = [
+     "RetrievalChatEvaluator",
+ ]

azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py
@@ -0,0 +1,163 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import json
+ import logging
+ import os
+ import re
+ from typing import Union
+
+ import numpy as np
+
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from promptflow.core import AsyncPrompty
+
+ from ...._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ...._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ try:
+     from ...._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = None
+
+
+ class _AsyncRetrievalChatEvaluator:
+     # Constants must be defined within eval's directory to be save/loadable
+     PROMPTY_FILE = "retrieval.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, model_config: dict):
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
+             USER_AGENT,
+         )
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+     async def __call__(self, *, conversation, **kwargs):
+         # Extract queries, responses and contexts from conversation
+         queries = []
+         responses = []
+         contexts = []
+
+         for each_turn in conversation:
+             role = each_turn["role"]
+             if role == "user":
+                 queries.append(each_turn["content"])
+             elif role == "assistant":
+                 responses.append(each_turn["content"])
+                 if "context" in each_turn and "citations" in each_turn["context"]:
+                     citations = json.dumps(each_turn["context"]["citations"])
+                     contexts.append(citations)
+
+         # Evaluate each turn
+         per_turn_scores = []
+         history = []
+         for turn_num, query in enumerate(queries):
+             try:
+                 query = query if turn_num < len(queries) else ""
+                 answer = responses[turn_num] if turn_num < len(responses) else ""
+                 context = contexts[turn_num] if turn_num < len(contexts) else ""
+
+                 history.append({"user": query, "assistant": answer})
+
+                 llm_output = await self._flow(
+                     query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+                 )
+                 score = np.nan
+                 if llm_output:
+                     parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
+                     if len(parsed_score_response) > 0:
+                         score = float(parsed_score_response[0].replace("'", "").strip())
+
+                 per_turn_scores.append(score)
+
+             except Exception as e: # pylint: disable=broad-exception-caught
+                 logger.warning(
+                     f"Evaluator {self.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}"
+                 )
+
+                 per_turn_scores.append(np.nan)
+
+         return {
+             "gpt_retrieval": np.nanmean(per_turn_scores),
+             "evaluation_per_turn": {
+                 "gpt_retrieval": {
+                     "score": per_turn_scores,
+                 }
+             },
+         }
+
+
+ class RetrievalChatEvaluator:
+     """
+     Initialize an evaluator configured for a specific Azure OpenAI model.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :return: A function that evaluates and generates metrics for "chat" scenario.
+     :rtype: Callable
+     **Usage**
+
+     .. code-block:: python
+
+         chat_eval = RetrievalChatEvaluator(model_config)
+         conversation = [
+             {"role": "user", "content": "What is the value of 2 + 2?"},
+             {"role": "assistant", "content": "2 + 2 = 4", "context": {
+                 "citations": [
+                     {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
+                 ]
+             }
+             }
+         ]
+         result = chat_eval(conversation=conversation)
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "gpt_retrieval": 3.0
+             "evaluation_per_turn": {
+                 "gpt_retrieval": {
+                     "score": [1.0, 2.0, 3.0]
+                 }
+             }
+         }
+     """
+
+     def __init__(self, model_config: dict):
+         self._async_evaluator = _AsyncRetrievalChatEvaluator(model_config)
+
+     def __call__(self, *, conversation, **kwargs):
+         """Evaluates retrieval score chat scenario.
+
+         :keyword conversation: The conversation to be evaluated.
+         :paramtype conversation: List[Dict]
+         :return: The scores for Chat scenario.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
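
One behavior of _AsyncRetrievalChatEvaluator worth noting: a turn that fails or cannot be parsed contributes np.nan rather than 0, and the conversation-level gpt_retrieval score uses np.nanmean, so failed turns are simply excluded from the average instead of dragging it toward zero. A tiny illustration with made-up scores:

    import numpy as np

    per_turn_scores = [4.0, np.nan, 5.0]  # second turn failed or its output was unparseable
    print(np.nanmean(per_turn_scores))    # 4.5 -> the NaN turn is skipped
    print(np.mean(per_turn_scores))       # nan -> what a plain mean would return
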
azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty
@@ -0,0 +1,48 @@
+ ---
+ name: Retrieval
+ description: Evaluates retrieval score for Chat scenario
+ model:
+   api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+   parameters:
+     temperature: 0.0
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   query:
+     type: string
+   history:
+     type: string
+   documents:
+     type: string
+
+ ---
+ system:
+ A chat history between user and bot is shown below
+ A list of documents is shown below in json format, and each document has one unique id.
+ These listed documents are used as context to answer the given question.
+ The task is to score the relevance between the documents and the potential answer to the given question in the range of 1 to 5.
+ 1 means none of the documents is relevant to the question at all. 5 means either one of the document or combination of a few documents is ideal for answering the given question.
+ Think through step by step:
+ - Summarize each given document first
+ - Determine the underlying intent of the given question, when the question is ambiguous, refer to the given chat history
+ - Measure how suitable each document to the given question, list the document id and the corresponding relevance score.
+ - Summarize the overall relevance of given list of documents to the given question after # Overall Reason, note that the answer to the question can solely from single document or a combination of multiple documents.
+ - Finally, output "# Result" followed by a score from 1 to 5.
+
+ # Question
+ {{ query }}
+ # Chat History
+ {{ history }}
+ # Documents
+ ===BEGIN RETRIEVED DOCUMENTS===
+ {{ documents }}
+ ===END RETRIEVED DOCUMENTS===
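
The prompt above instructs the model to end with a "# Result" line followed by a 1-5 score, and _retrieval.py recovers that score by splitting on "# Result" and taking the first integer it finds. A minimal sketch of that contract, using a made-up model response:

    import re

    # Hypothetical model output following the prompt's "# Overall Reason" / "# Result" instructions.
    llm_output = (
        "# Overall Reason\n"
        "Document math_doc.md directly answers the question.\n"
        "# Result\n"
        "4\n"
    )

    score = float("nan")
    parsed = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
    if parsed:
        score = float(parsed[0])
    print(score)  # 4.0
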
azure/ai/evaluation/_evaluators/_coherence/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._coherence import CoherenceEvaluator
+
+ __all__ = ["CoherenceEvaluator"]