azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. (The original page links to more details here.)

Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
- import os
4
+ import os, logging
5
5
  from typing import Dict, List, Optional, Union
6
6
 
7
7
  from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
9
9
 
10
10
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
11
11
  from azure.ai.evaluation._model_configurations import Conversation
12
- from ..._common.utils import construct_prompty_model_config, validate_model_config
12
+ from ..._common.utils import (
13
+ ErrorBlame,
14
+ ErrorTarget,
15
+ EvaluationException,
16
+ ErrorCategory,
17
+ construct_prompty_model_config,
18
+ validate_model_config,
19
+ )
13
20
 
14
21
  try:
15
22
  from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
21
28
  return "None"
22
29
 
23
30
 
31
+ logger = logging.getLogger(__name__)
32
+
33
+
24
34
  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
25
35
  """
26
36
  Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
78
88
  _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
79
89
  _RESULT_KEY = "groundedness"
80
90
  _OPTIONAL_PARAMS = ["query"]
91
+ _SUPPORTED_TOOLS = ["file_search"]
81
92
 
82
- id = "azureml://registries/azureml/models/Groundedness-Evaluator/versions/4"
93
+ id = "azureai://built-in/evaluators/groundedness"
83
94
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
84
95
 
85
96
  @override
86
- def __init__(self, model_config, *, threshold=3, **kwargs):
97
+ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
87
98
  current_dir = os.path.dirname(__file__)
88
99
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY) # Default to no query
89
100
 
@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
93
104
  prompty_file=prompty_path,
94
105
  result_key=self._RESULT_KEY,
95
106
  threshold=threshold,
107
+ credential=credential,
96
108
  _higher_is_better=self._higher_is_better,
97
109
  )
98
110
  self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
120
132
  :rtype: Dict[str, float]
121
133
  """
122
134
 
135
+ @overload
136
+ def __call__(
137
+ self,
138
+ *,
139
+ query: str,
140
+ response: List[dict],
141
+ tool_definitions: List[dict],
142
+ ) -> Dict[str, Union[str, float]]:
143
+ """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
144
+
145
+ :keyword query: The query to be evaluated.
146
+ :paramtype query: str
147
+ :keyword response: The response from the agent to be evaluated.
148
+ :paramtype response: List[dict]
149
+ :keyword tool_definitions: The tool definitions used by the agent.
150
+ :paramtype tool_definitions: List[dict]
151
+ :return: The groundedness score.
152
+ :rtype: Dict[str, Union[str, float]]
153
+ """
154
+
123
155
  @overload
124
156
  def __call__(
125
157
  self,
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
174
206
  self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
175
207
 
176
208
  return super().__call__(*args, **kwargs)
209
+
210
+ async def _real_call(self, **kwargs):
211
+ """The asynchronous call where real end-to-end evaluation logic is performed.
212
+
213
+ :keyword kwargs: The inputs to evaluate.
214
+ :type kwargs: Dict
215
+ :return: The evaluation result.
216
+ :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
217
+ """
218
+ # Convert inputs into list of evaluable inputs.
219
+ try:
220
+ return await super()._real_call(**kwargs)
221
+ except EvaluationException as ex:
222
+ if ex.category == ErrorCategory.NOT_APPLICABLE:
223
+ return {
224
+ self._result_key: self._NOT_APPLICABLE_RESULT,
225
+ f"{self._result_key}_result": "pass",
226
+ f"{self._result_key}_threshold": self.threshold,
227
+ f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
228
+ }
229
+ else:
230
+ raise ex
231
+
232
+ def _convert_kwargs_to_eval_input(self, **kwargs):
233
+ if "context" in kwargs or "conversation" in kwargs:
234
+ return super()._convert_kwargs_to_eval_input(**kwargs)
235
+
236
+ query = kwargs.get("query")
237
+ response = kwargs.get("response")
238
+ tool_definitions = kwargs.get("tool_definitions")
239
+
240
+ if not query or not response or not tool_definitions:
241
+ msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
242
+ raise EvaluationException(
243
+ message=msg,
244
+ blame=ErrorBlame.USER_ERROR,
245
+ category=ErrorCategory.INVALID_VALUE,
246
+ target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
247
+ )
248
+
249
+ context = self._get_context_from_agent_response(response, tool_definitions)
250
+ if not context:
251
+ raise EvaluationException(
252
+ message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
253
+ blame=ErrorBlame.USER_ERROR,
254
+ category=ErrorCategory.NOT_APPLICABLE,
255
+ target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
256
+ )
257
+
258
+ return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
259
+
260
+ def _get_context_from_agent_response(self, response, tool_definitions):
261
+ context = ""
262
+ try:
263
+ logger.debug("Extracting context from response")
264
+ tool_calls = self._parse_tools_from_response(response=response)
265
+ logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
266
+ if tool_calls:
267
+ for tool_call in tool_calls:
268
+ if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
269
+ tool_name = tool_call.get("name")
270
+ for tool in tool_definitions:
271
+ if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
272
+ if tool_name == "file_search":
273
+ tool_result = tool_call.get("tool_result")
274
+ if tool_result:
275
+ for result in tool_result:
276
+ content_list = result.get("content")
277
+ if content_list:
278
+ for content in content_list:
279
+ text = content.get("text")
280
+ if text:
281
+ context = context + "\n" + str(text)
282
+ except Exception as ex:
283
+ logger.debug(f"Error extracting context from agent response : {str(ex)}")
284
+ context = ""
285
+
286
+ return context if context else None
@@ -57,15 +57,21 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
57
57
  _MAX_INTENT_RESOLUTION_SCORE = 5
58
58
  _DEFAULT_INTENT_RESOLUTION_THRESHOLD = 3
59
59
 
60
- id = None
60
+ id = "azureai://built-in/evaluators/intent_resolution"
61
61
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
62
62
 
63
63
  @override
64
- def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
64
+ def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
65
65
  current_dir = os.path.dirname(__file__)
66
66
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
67
67
  self.threshold = threshold
68
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
68
+ super().__init__(
69
+ model_config=model_config,
70
+ prompty_file=prompty_path,
71
+ result_key=self._RESULT_KEY,
72
+ credential=credential,
73
+ **kwargs,
74
+ )
69
75
 
70
76
  @overload
71
77
  def __call__(
@@ -65,7 +65,7 @@ class MeteorScoreEvaluator(EvaluatorBase):
65
65
  :caption: Initialize with threshold and call a MeteorScoreEvaluator.
66
66
  """
67
67
 
68
- id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
68
+ id = "azureai://built-in/evaluators/meteor_score"
69
69
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
70
70
 
71
71
  @override
@@ -50,19 +50,26 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
50
50
 
51
51
  """
52
52
 
53
- id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
53
+ id = "azureai://built-in/evaluators/protected_material"
54
54
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
55
+ _OPTIONAL_PARAMS = ["query"]
55
56
 
56
57
  @override
57
58
  def __init__(
58
59
  self,
59
60
  credential,
60
61
  azure_ai_project,
62
+ **kwargs,
61
63
  ):
64
+ # Set default for evaluate_query if not provided
65
+ if "evaluate_query" not in kwargs:
66
+ kwargs["evaluate_query"] = True
67
+
62
68
  super().__init__(
63
69
  eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
64
70
  azure_ai_project=azure_ai_project,
65
71
  credential=credential,
72
+ **kwargs,
66
73
  )
67
74
 
68
75
  @overload
@@ -74,7 +74,7 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
74
74
  however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
75
75
  """
76
76
 
77
- id = "qa"
77
+ id = "azureai://built-in/evaluators/qa"
78
78
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
79
79
 
80
80
  def __init__(
@@ -1,15 +1,21 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
-
4
+ import logging
5
+ import math
5
6
  import os
6
7
  from typing import Dict, Union, List
7
8
 
8
9
  from typing_extensions import overload, override
9
10
 
11
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
12
+ from ..._common.utils import reformat_conversation_history, reformat_agent_response
13
+
10
14
  from azure.ai.evaluation._model_configurations import Conversation
11
15
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
12
16
 
17
+ logger = logging.getLogger(__name__)
18
+
13
19
 
14
20
  class RelevanceEvaluator(PromptyEvaluatorBase):
15
21
  """
@@ -69,11 +75,11 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
69
75
  _PROMPTY_FILE = "relevance.prompty"
70
76
  _RESULT_KEY = "relevance"
71
77
 
72
- id = "azureml://registries/azureml/models/Relevance-Evaluator/versions/4"
78
+ id = "azureai://built-in/evaluators/relevance"
73
79
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
74
80
 
75
81
  @override
76
- def __init__(self, model_config, *, threshold=3):
82
+ def __init__(self, model_config, *, credential=None, threshold=3):
77
83
  current_dir = os.path.dirname(__file__)
78
84
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
79
85
  self._threshold = threshold
@@ -83,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
83
89
  prompty_file=prompty_path,
84
90
  result_key=self._RESULT_KEY,
85
91
  threshold=threshold,
92
+ credential=credential,
86
93
  _higher_is_better=self._higher_is_better,
87
94
  )
88
95
 
@@ -141,3 +148,49 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
141
148
  :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
142
149
  """
143
150
  return super().__call__(*args, **kwargs)
151
+
152
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
153
+ """Do a relevance evaluation.
154
+
155
+ :param eval_input: The input to the evaluator. Expected to contain
156
+ whatever inputs are needed for the _flow method, including context
157
+ and other fields depending on the child class.
158
+ :type eval_input: Dict
159
+ :return: The evaluation result.
160
+ :rtype: Dict
161
+ """
162
+ if "query" not in eval_input and "response" not in eval_input:
163
+ raise EvaluationException(
164
+ message="Only text conversation inputs are supported.",
165
+ internal_message="Only text conversation inputs are supported.",
166
+ blame=ErrorBlame.USER_ERROR,
167
+ category=ErrorCategory.INVALID_VALUE,
168
+ target=ErrorTarget.CONVERSATION,
169
+ )
170
+ if not isinstance(eval_input["query"], str):
171
+ eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
172
+ if not isinstance(eval_input["response"], str):
173
+ eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
174
+ llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
175
+ score = math.nan
176
+
177
+ if isinstance(llm_output, dict):
178
+ score = float(llm_output.get("score", math.nan))
179
+ reason = llm_output.get("explanation", "")
180
+ # Parse out score and reason from evaluators known to possess them.
181
+ binary_result = self._get_binary_result(score)
182
+ return {
183
+ self._result_key: float(score),
184
+ f"gpt_{self._result_key}": float(score),
185
+ f"{self._result_key}_reason": reason,
186
+ f"{self._result_key}_result": binary_result,
187
+ f"{self._result_key}_threshold": self._threshold,
188
+ }
189
+
190
+ binary_result = self._get_binary_result(score)
191
+ return {
192
+ self._result_key: float(score),
193
+ f"gpt_{self._result_key}": float(score),
194
+ f"{self._result_key}_result": binary_result,
195
+ f"{self._result_key}_threshold": self._threshold,
196
+ }
@@ -10,91 +10,172 @@ model:
10
10
  presence_penalty: 0
11
11
  frequency_penalty: 0
12
12
  response_format:
13
- type: text
13
+ type: json_object
14
14
 
15
15
  inputs:
16
16
  query:
17
17
  type: string
18
18
  response:
19
19
  type: string
20
-
21
20
  ---
21
+
22
22
  system:
23
- # Instruction
24
- ## Goal
25
- ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
26
- - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
27
- - **Data**: Your input data include QUERY and RESPONSE.
28
- - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
23
+ You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the QUERY using the definitions provided.
29
24
 
30
25
  user:
31
- # Definition
32
- **Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information.
33
-
34
- # Ratings
35
- ## [Relevance: 1] (Irrelevant Response)
36
- **Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed.
37
-
38
- **Examples:**
39
- **Query:** What is the team preparing for?
40
- **Response:** I went grocery shopping yesterday evening.
41
-
42
- **Query:** When will the company's new product line launch?
43
- **Response:** International travel can be very rewarding and educational.
26
+ ROLE
27
+ ====
28
+ You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to a QUERY using the Relevance definitions provided.
44
29
 
45
- ## [Relevance: 2] (Incorrect Response)
46
- **Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information.
30
+ INPUT
31
+ =====
32
+ QUERY: {{query}}
33
+ RESPONSE: {{response}}
47
34
 
48
- **Examples:**
49
- **Query:** When was the merger between the two firms finalized?
50
- **Response:** The merger was finalized on April 10th.
35
+ TASK
36
+ ====
37
+ Output a JSON object with:
38
+ 1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the query.
39
+ 2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.
51
40
 
52
- **Query:** Where and when will the solar eclipse be visible?
53
- **Response:** The solar eclipse will be visible in Asia on December 14th.
41
+ The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
42
+ Response format exactly as follows:
54
43
 
55
- ## [Relevance: 3] (Incomplete Response)
56
- **Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information.
44
+ {
45
+ "explanation": "<15-60 words>",
46
+ "score": <1-5>
47
+ }
57
48
 
58
- **Examples:**
59
- **Query:** What type of food does the new restaurant offer?
60
- **Response:** The restaurant offers Italian food like pasta.
61
49
 
62
- **Query:** What topics will the conference cover?
63
- **Response:** The conference will cover renewable energy and climate change.
50
+ EVALUATION STEPS
51
+ ================
52
+ A. Read the QUERY and RESPONSE carefully.
53
+ B. Compare the RESPONSE against the rubric below:
54
+ - Does the response directly address the query?
55
+ - Is the information complete, partial, or off-topic?
56
+ - Is it vague, generic, or insightful?
57
+ C. Match the response to the best score from the rubric.
58
+ D. Provide a short explanation and the score using the required format.
64
59
 
65
- ## [Relevance: 4] (Complete Response)
66
- **Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information.
60
+ SCORING RUBRIC
61
+ ==============
67
62
 
68
- **Examples:**
69
- **Query:** What type of food does the new restaurant offer?
70
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.
63
+ ### Score 1 - Irrelevant Response
64
+ Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.
71
65
 
72
- **Query:** What topics will the conference cover?
73
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices.
66
+ **Example A**
67
+ QUERY: What is the team preparing for?
68
+ RESPONSE: I went grocery shopping yesterday evening.
74
69
 
75
- ## [Relevance: 5] (Comprehensive Response with Insights)
76
- **Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding.
70
+ Expected Output:
71
+ {
72
+ "explanation": "The response is entirely off-topic and doesn't address the question.",
73
+ "score": 1
74
+ }
77
75
 
78
- **Examples:**
79
- **Query:** What type of food does the new restaurant offer?
80
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.
81
76
 
82
- **Query:** What topics will the conference cover?
83
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues.
77
+ **Example B**
78
+ QUERY: When will the company's new product line launch?
79
+ RESPONSE: International travel can be very rewarding and educational.
84
80
 
81
+ Expected Output:
82
+ {
83
+ "explanation": "The response is completely irrelevant to the product launch question.",
84
+ "score": 1
85
+ }
86
+
87
+
88
+ ### Score 2 – Related but Unhelpful / Superficial
89
+ Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.
90
+
91
+ **Example A**
92
+ QUERY: What is the event about?
93
+ RESPONSE: It’s something important.
94
+
95
+ Expected Output:
96
+ {
97
+ "explanation": "The response vaguely refers to the query topic but lacks specific or informative content.",
98
+ "score": 2
99
+ }
100
+
101
+ **Example B**
102
+ QUERY: What’s the weather in Paris?
103
+ RESPONSE: I tried to find the forecast but the query failed.
104
+
105
+ Expected Output:
106
+ {
107
+ "explanation": "The response acknowledges the query but provides no usable weather information. It is related but unhelpful.",
108
+ "score": 2
109
+ }
110
+
111
+ ### Score 3 - Partially Relevant / Incomplete
112
+ Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.
113
+
114
+ **Example A**
115
+ QUERY: What amenities does the new apartment complex provide?
116
+ RESPONSE: The apartment complex has a gym.
117
+
118
+ Expected Output:
119
+ {
120
+ "explanation": "The response mentions one amenity but does not provide a fuller list or clarify whether other standard features (like parking or security) are included. It partially addresses the query but lacks completeness.",
121
+ "score": 3
122
+ }
123
+
124
+ **Example B**
125
+ QUERY: What services does the premium membership include?
126
+ RESPONSE: It includes priority customer support.
127
+
128
+ Expected Output:
129
+ {
130
+ "explanation": "The response identifies one service but omits other likely components of a premium membership (e.g., exclusive content or discounts). The information is relevant but incomplete.",
131
+ "score": 3
132
+ }
133
+
134
+
135
+
136
+ ### Score 4 - Fully Relevant / Sufficient Response
137
+ Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.
138
+
139
+ **Example A**
140
+ QUERY: What amenities does the new apartment complex provide?
141
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.
142
+
143
+ Expected Output:
144
+ {
145
+ "explanation": "The response mentions multiple key amenities likely to be relevant to most users. While it may not list every feature, it clearly conveys the core offerings of the complex.",
146
+ "score": 4
147
+ }
85
148
 
149
+ **Example B**
150
+ QUERY: What services does the premium membership include?
151
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.
152
+
153
+ Expected Output:
154
+ {
155
+ "explanation": "The response outlines all major services expected from a premium membership. Even if a minor service is not mentioned, the core value is clearly and fully represented.",
156
+ "score": 4
157
+ }
86
158
 
87
- # Data
88
- QUERY: {{query}}
89
- RESPONSE: {{response}}
90
159
 
160
+ ### Score 5 - Comprehensive Response with Insights
161
+ Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.
91
162
 
92
- # Tasks
93
- ## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
94
- - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
95
- - **Explanation**: a very short explanation of why you think the input Data should get that Score.
96
- - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
163
+ **Example A**
164
+ QUERY: What amenities does the new apartment complex provide?
165
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.
166
+
167
+ Expected Output:
168
+ {
169
+ "explanation": "The response fully lists key amenities and additionally explains how these features contribute to resident experience, enhancing the usefulness of the information.",
170
+ "score": 5
171
+ }
97
172
 
173
+ **Example B**
174
+ QUERY: What services does the premium membership include?
175
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases — tailored for users who want quicker resolutions and first access to new features.
98
176
 
99
- ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
100
- # Output
177
+ Expected Output:
178
+ {
179
+ "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
180
+ "score": 5
181
+ }
@@ -64,7 +64,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
64
64
  _PROMPTY_FILE = "response_completeness.prompty"
65
65
  _RESULT_KEY = "response_completeness"
66
66
 
67
- id = "completeness"
67
+ id = "azureai://built-in/evaluators/response_completeness"
68
68
 
69
69
  _MIN_COMPLETENESS_SCORE = 1
70
70
  _MAX_COMPLETENESS_SCORE = 5
@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
73
73
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
74
74
 
75
75
  @override
76
- def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
76
+ def __init__(
77
+ self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
78
+ ):
77
79
  current_dir = os.path.dirname(__file__)
78
80
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
79
81
  self.threshold = threshold
80
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
82
+ super().__init__(
83
+ model_config=model_config,
84
+ prompty_file=prompty_path,
85
+ result_key=self._RESULT_KEY,
86
+ credential=credential,
87
+ **kwargs,
88
+ )
81
89
 
82
90
  @overload
83
91
  def __call__(
@@ -74,11 +74,11 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
74
74
  _PROMPTY_FILE = "retrieval.prompty"
75
75
  _RESULT_KEY = "retrieval"
76
76
 
77
- id = "azureml://registries/azureml/models/Retrieval-Evaluator/versions/1"
77
+ id = "azureai://built-in/evaluators/retrieval"
78
78
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
79
79
 
80
80
  @override
81
- def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=super-init-not-called
81
+ def __init__(self, model_config, *, threshold: float = 3, credential=None):
82
82
  current_dir = os.path.dirname(__file__)
83
83
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
84
84
  self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
88
88
  prompty_file=prompty_path,
89
89
  result_key=self._RESULT_KEY,
90
90
  threshold=threshold,
91
+ credential=credential,
91
92
  _higher_is_better=self._higher_is_better,
92
93
  )
93
94
 
@@ -90,7 +90,7 @@ class RougeScoreEvaluator(EvaluatorBase):
90
90
  :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
91
91
  """
92
92
 
93
- id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
93
+ id = "azureai://built-in/evaluators/rouge_score"
94
94
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
95
95
 
96
96
  @override
@@ -66,8 +66,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
66
66
  for the groundedness pro label will be "groundedness_pro_passing_rate".
67
67
  """
68
68
 
69
- id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
69
+ id = "azureai://built-in/evaluators/groundedness_pro"
70
70
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
71
+ _OPTIONAL_PARAMS = ["query"]
71
72
 
72
73
  @override
73
74
  def __init__(