azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.
Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
@@ -10,9 +11,12 @@ from typing_extensions import overload, override
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation, Message
-from ..._common.utils import check_score_is_valid
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
+
 @experimental
 class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
@@ -34,13 +38,13 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START intent_resolution_evaluator]
            :end-before: [END intent_resolution_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
               https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """
@@ -53,27 +57,23 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _MAX_INTENT_RESOLUTION_SCORE = 5
     _DEFAULT_INTENT_RESOLUTION_THRESHOLD = 3
 
-    id = None
+    id = "azureai://built-in/evaluators/intent_resolution"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *,
-                 threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
-                 **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path,
-                         result_key=self._RESULT_KEY,
-                         **kwargs)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
 
     @overload
     def __call__(
         self,
         *,
-        query : Union[str, List[dict]],
-        response : Union[str, List[dict]],
-        tool_definitions : Optional[Union[dict, List[dict]]] = None,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
     ) -> Dict[str, Union[str, float]]:
         """Evaluate intent resolution for a given query, response and optional tool definitions.
         The query and response can be either a string or a list of messages.
@@ -135,11 +135,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
             )
+        # reformat query and response to the format expected by the prompty flow
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
         if isinstance(llm_output, dict):
-            score = llm_output.get("resolution_score", math.nan)
-            if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+            score = llm_output.get("score", math.nan)
+            if not check_score_is_valid(
+                score,
+                IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE,
+                IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE,
+            ):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
                     internal_message="Invalid score value.",
@@ -148,19 +156,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 )
             reason = llm_output.get("explanation", "")
             score = float(score)
-            score_result = 'pass' if score >= self.threshold else 'fail'
-
-            #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
-            if 'explanation' in llm_output: llm_output.pop("explanation")
-            if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+            score_result = "pass" if score >= self.threshold else "fail"
 
             response_dict = {
-                f"{self._result_key}" : score,
-                f"{self._result_key}_result" : score_result,
-                f"{self._result_key}_threshold" : self.threshold,
-                f"{self._result_key}_reason" : reason,
-                f"additional_details" : llm_output
-            }
+                f"{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+            }
             return response_dict
         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
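
With this change the evaluator returns a flat result dictionary: the prompty's score and explanation surface as <result_key> and <result_key>_reason, and the old additional_details field is gone. A minimal usage sketch of the new shape, assuming the default result key intent_resolution; the endpoint and deployment values are placeholders, not taken from the diff:

    from azure.ai.evaluation import IntentResolutionEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",  # placeholder
    }

    evaluator = IntentResolutionEvaluator(model_config=model_config, threshold=3)
    result = evaluator(
        query="What are the opening hours of the Eiffel Tower?",
        response="The Eiffel Tower is open from 9:00 AM to 11:00 PM.",
    )

    # Expected keys in 1.10.0 (no "additional_details"):
    #   intent_resolution, intent_resolution_result,
    #   intent_resolution_threshold, intent_resolution_reason
    print(result)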
azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty

@@ -23,139 +23,253 @@ inputs:
     default: "[]"
 ---
 system:
-You are an expert in evaluating the quality of a RESPONSE from an intelligent assistant based on provided definition and Data.
+You are an expert in evaluating the quality of a AGENT_RESPONSE from an intelligent assistant based on provided definition and CONVERSATION_HISTORY.
 
 user:
-# Goal
-Your goal is to assess the quality of the RESPONSE of an assistant in relation to a QUERY from a user, specifically focusing on
-the assistant's ability to understand and resolve the user intent expressed in the QUERY. There is also a field for tool definitions
-describing the functions, if any, that are accessible to the agent and that the agent may invoke in the RESPONSE if necessary.
+ROLE
+====
+You are Intent-Resolution-Judge, an impartial grader that scores how well an AI agent *resolved* the user's intent in a multi-turn conversation.
+You are NOT grading intent recognition. Assume the agent has understood the intent that is expressed; you only judge whether the reply satisfies or completes that intent.
 
-There are two components to intent resolution:
-  - Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry.
-  - Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request.
 
-Note that the QUERY can either be a string with a user request or an entire conversation history including previous requests and responses from the assistant.
-In this case, the assistant's response should be evaluated in the context of the entire conversation but the focus should be on the last intent.
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}
 
-# Data
-QUERY: {{query}}
-RESPONSE: {{response}}
-TOOL_DEFINITIONS: {{tool_definitions}}
+CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message.
+AGENT_RESPONSE is the agent reply to that latest user message.
 
 
-# Ratings
-## [Score: 1] (Response completely unrelated to user intent)
-**Definition:** The agent's response does not address the query at all.
+TASK
+====
+Output a JSON object with:
+1) a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent
+2) an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent.
 
-**Example:**
-  **Query:** How do I bake a chocolate cake?
-  **Response:** The latest smartphone models have incredible features and performance.
-  **Tool Definitions:** []
+The explanation should always precede the score and should clearly justify the score based on the agent's performance in resolving the user's intent.
+Response format exactly as follows:
 
-**Expected output**
 {
-  "explanation": "The agent's response is entirely off-topic, discussing smartphones instead of providing any information about baking a chocolate cake."
-  "conversation_has_intent": true,
-  "agent_perceived_intent": "discussion about smartphone features",
-  "actual_user_intent": "bake a chocolate cake",
-  "correct_intent_detected": false,
-  "intent_resolved": false,
-  "resolution_score": 1,
+  "explanation": "<15-60 words>",
+  "score": <1-5>
 }
 
+EVALUATION STEPS
+================
 
-## [Score: 2] (Response minimally relates to user intent)
-**Definition:** The response shows a token attempt to address the query by mentioning a relevant keyword or concept, but it provides almost no useful or actionable information.
+A. Identify the expressed intent in the final user turn (look at the full conversation_history for context if necessary).
+B. Check resolution - Does the agent's reply, in AGENT_RESPONSE, actually complete or satisfy that intent?
+   - If the agent's response is a direct answer, does it fully address the user's request?
+   - If the agent's response is an action (like scheduling, deleting, etc.), does it confirm completion of that action?
+   - If the agent's response is a clarification or follow-up question, does it lead towards fulfilling the intent?
+   - If the agent response is empty or irrelevant, it does not resolve the intent and should be scored accordingly.
+C. Verify correctness & completeness of the resolution.
+D. Weigh impact - Minor style issues matter only for tie-breaking; resolution quality dominates.
+E. Write a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent, including:
+   - What the user wanted
+   - How well the agent addressed it
+   - Any notable gaps or strengths
+F. Choose the single closest integer score (1,2,3,4 or 5) from the scoring rubric below.
 
-**Example input:**
-  **Query:** How do I bake a chocolate cake?
-  **Response:** Chocolate cake involves some ingredients.
-  **Tool Definitions:** []
+SCORING RUBRIC
+5 - Excellent: Agent fully fulfills the intent with accurate, thorough, relevant action.
+4 - Good: Intent mostly resolved; only minor, non-blocking gaps or imprecision.
+3 - Adequate: Core of intent addressed but notable omissions, vagueness, or slips.
+2 - Poor: Partial or incorrect resolution; answer unlikely to satisfy the intent.
+1 - Very poor: Intent ignored or badly misunderstood; irrelevant or harmful response.
 
-**Expected output**
+
+EXAMPLES
+========
+
+### Score 5 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Give me a Python one-liner to reverse a string.
+
+AGENT_RESPONSE:
+  `s[::-1]` - this slice returns the string in reverse.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Python one-liner to reverse a string. Agent provided the correct code and a brief, helpful explanation, fully resolving the request with notable conciseness and accuracy.",
+  "score": 5
+}
+
+
+### Score 5 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  I need to cancel my 2 pm meeting with Dr. Lee this Friday.
+
+Agent turn 1:
+  To confirm, that's Friday at 14:00 with Dr. Lee in Meeting Room 2, right?
+
+User turn 2:
+  Yes, that one.
+
+AGENT_RESPONSE:
+  Done! The event has been removed from your calendar and a cancellation email was sent to Dr. Lee.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to cancel a meeting. Agent first confirmed details, then successfully completed the cancellation and notified the attendee, fully resolving the intent with excellent clarity and thoroughness.",
+  "score": 5
+}
+
+
+### Score 4 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Translate 'carpe diem' to Portuguese and explain it in one sentence.
+
+AGENT_RESPONSE:
+  'Aproveita o dia.' Significa que deves aproveitar o momento presente.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Portuguese translation of 'carpe diem' and a one-sentence explanation. Agent provided an accurate translation and a concise explanation, mostly resolving the intent adequately.",
+  "score": 4
+}
+
+
+### Score 4 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  I need three book recommendations on deep work.
+
+Agent turn 1:
+  Sure—any particular genre?
+
+User turn 2:
+  Non-fiction, preferably practical.
+
+AGENT_RESPONSE:
+  1) *Deep Work* by Cal Newport, 2) *Indistractable* by Nir Eyal, 3) *Digital Minimalism* by Cal Newport.
+  All are actionable non-fiction.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted three practical non-fiction book recommendations on deep work. Agent supplied three relevant titles, confirming they fit the criteria, mostly resolving the intent. Brief justifications for each pick would improve it.",
+  "score": 4
+}
+
+
+### Score 3 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  Summarise the key plot points of *The Hobbit* in 5 bullet points.
+
+AGENT_RESPONSE:
+  • Bilbo meets Gandalf • They find a ring • They fight a dragon.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a 5-bullet summary of *The Hobbit*'s plot. Agent provided only three bullets, capturing some main events but omitting several pivotal ones and failing to meet the requested count, thus only adequately addressing the intent.",
+  "score": 3
+}
+
+
+### Score 3 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  What's the weather in Tokyo tomorrow?
+
+Agent turn 1:
+  Would you like it in Celsius or Fahrenheit?
+
+User turn 2:
+  Celsius, please.
+
+AGENT_RESPONSE:
+  Looks mostly sunny with a high around 24 °C and a 10 % chance of light showers.
+
+EXPECTED OUTPUT:
 {
-  "explanation": "While the response mentions 'ingredients' related to a chocolate cake, it barely addresses the process or any detailed steps, leaving the query unresolved."
-  "conversation_has_intent": true,
-  "agent_perceived_intent": "mention of ingredients",
-  "actual_user_intent": "bake a chocolate cake",
-  "correct_intent_detected": false,
-  "intent_resolved": false,
-  "resolution_score": 2,
+  "explanation": "User wanted tomorrow's weather in Tokyo (Celsius). Agent provided temperature and rain chance but omitted wind information, offering an adequate but incomplete resolution of the intent.",
+  "score": 3
 }
 
 
-## [Score: 3] (Response partially addresses the user intent but lacks complete details)
-**Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query.
+### Score 2 - Example A (single-turn)
 
-**Example input:**
-  **Query:** How do I bake a chocolate cake?
-  **Response:** Preheat your oven and mix the ingredients before baking the cake.
-  **Tool Definitions:** []
+CONVERSATION_HISTORY:
+User turn 1:
+  "Delete my last email draft."
 
-**Expected output**
+AGENT_RESPONSE:
+  "Here's how to compose a new email…"
+
+EXPECTED OUTPUT:
 {
-  "explanation": "The response outlines a minimal process (preheating and mixing) but omits critical details like ingredient measurements, baking time, and temperature specifics, resulting in only a partial resolution of the query."
-  "conversation_has_intent": true,
-  "agent_perceived_intent": "basic baking process",
-  "actual_user_intent": "bake a chocolate cake",
-  "correct_intent_detected": true,
-  "intent_resolved": false,
-  "resolution_score": 3,
+  "explanation": "User wanted to delete their last email draft. Agent misunderstood the request, offering irrelevant guidance on composing a new email instead of performing the deletion, resulting in a poor resolution.",
+  "score": 2
 }
 
 
-## [Score: 4] (Response addresses the user intent with moderate accuracy but has minor inaccuracies or omissions)
-**Definition:** The response offers a moderately detailed answer that includes several specific elements relevant to the query, yet it still lacks some finer details or complete information.
+### Score 2 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Calculate 15 % of 80."
+
+Agent turn 1:
+  "Do you need the exact number or a rounded estimate?"
 
-**Example input:**
-  **Query:** How do I bake a chocolate cake?
-  **Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes.
-  **Tool Definitions:** []
+User turn 2:
+  "Exact, please."
 
-**Expected output**
+AGENT_RESPONSE:
+  "It's 10.5."
+
+EXPECTED OUTPUT:
 {
-  "explanation": "The response includes specific steps and ingredients, indicating a clear intent to provide a recipe. However, it lacks detailed measurements and finer instructions, which makes the resolution only moderately complete."
-  "conversation_has_intent": true,
-  "agent_perceived_intent": "provide a basic chocolate cake recipe",
-  "actual_user_intent": "bake a chocolate cake",
-  "correct_intent_detected": true,
-  "intent_resolved": false,
-  "resolution_score": 4,
+  "explanation": "User wanted an exact calculation of 15% of 80. Agent provided an incorrect result (10.5 instead of 12), leading to a poor resolution of the user's intent.",
+  "score": 2
 }
 
 
-## [Score: 5] (Response directly addresses the user intent and fully resolves it)
-**Definition:** The response provides a complete, detailed, and accurate answer that fully resolves the user's query with all necessary information and precision.
+### Score 1 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Mute notifications for the next hour."
 
-**Example input:**
-  **Query:** How do I bake a chocolate cake?
-  **Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving.
-  **Tool Definitions:** []
+AGENT_RESPONSE:
+  "Here's a cake recipe! 🍰"
 
-**Expected output**
+EXPECTED OUTPUT:
 {
-  "explanation": "The response delivers a complete and precise recipe with detailed instructions and measurements, fully addressing the user's query about baking a chocolate cake."
-  "conversation_has_intent": true,
-  "agent_perceived_intent": "provide a comprehensive chocolate cake recipe",
-  "actual_user_intent": "bake a chocolate cake",
-  "correct_intent_detected": true,
-  "intent_resolved": true,
-  "resolution_score": 5,
+  "explanation": "User wanted to mute notifications for an hour. Agent responded with a completely irrelevant cake recipe, making no attempt to address the user's intent, resulting in a very poor resolution.",
+  "score": 1
 }
 
 
-# Task
+### Score 1 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+  "Set an alarm for 6 am tomorrow."
+
+Agent turn 1:
+  "Why not sleep in? Alarms are overrated."
 
-Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
-Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
-  - explanation: a string that explains why you think the input Data should get this resolution_score.
-  - conversation_has_intent: true or false
-  - agent_perceived_intent: a string that describes the intent the agent perceived from the user query
-  - actual_user_intent: a string that describes the actual user intent
-  - correct_intent_detected: true or false
-  - intent_resolved: true or false
-  - resolution_score: an integer between 1 and 5 that represents the resolution score
+User turn 2:
+  "I really need the alarm—please set it."
 
+AGENT_RESPONSE:
+  "Alarms are pointless; wake up whenever."
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to set an alarm for 6 am. Agent was dismissive and refused to perform the requested action, completely failing to resolve the user's intent, leading to a very poor resolution.",
+  "score": 1
+}
 
-# Output
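
The judge now returns only an explanation and a score, and the evaluator code above reads "score" instead of "resolution_score". A small sketch of how that contract is consumed, mirroring the parsing logic in the _intent_resolution.py hunks; the values are illustrative:

    import math

    MIN_SCORE, MAX_SCORE, THRESHOLD = 1, 5, 3
    RESULT_KEY = "intent_resolution"

    # Example judge output under the new prompty contract.
    llm_output = {"explanation": "User asked for X; the agent fully delivered it.", "score": 4}

    score = llm_output.get("score", math.nan)  # 1.8.0 read "resolution_score" here
    if not (isinstance(score, (int, float)) and MIN_SCORE <= score <= MAX_SCORE):
        raise ValueError(f"Invalid score value: {score}")

    result = {
        RESULT_KEY: float(score),
        f"{RESULT_KEY}_result": "pass" if float(score) >= THRESHOLD else "fail",
        f"{RESULT_KEY}_threshold": THRESHOLD,
        f"{RESULT_KEY}_reason": llm_output.get("explanation", ""),
    }
    print(result)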
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -46,13 +46,13 @@ class MeteorScoreEvaluator(EvaluatorBase):
            :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START meteor_score_evaluator]
            :end-before: [END meteor_score_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
               https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
@@ -65,7 +65,7 @@ class MeteorScoreEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call a MeteorScoreEvaluator.
     """
 
-    id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/meteor_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
@@ -77,7 +77,6 @@ class MeteorScoreEvaluator(EvaluatorBase):
         self._threshold = threshold
         self._higher_is_better = True
         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
-
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
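
Only the cloud evaluator id changes here; local usage is unaffected. A brief sketch, with parameter and input values chosen purely for illustration:

    from azure.ai.evaluation import MeteorScoreEvaluator

    meteor = MeteorScoreEvaluator(alpha=0.8, threshold=0.5)
    result = meteor(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # score plus the pass/fail and threshold keys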
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

@@ -25,9 +25,9 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     :param credential: The credential required for connecting to the Azure AI project.
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-        resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
 
     .. admonition:: Example:
 
@@ -37,32 +37,39 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
            :language: python
            :dedent: 8
            :caption: Initialize and call a ProtectedMaterialEvaluator.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START protected_material_evaluator]
            :end-before: [END protected_material_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
              https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """
 
-    id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/protected_material"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
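
Two behavioural changes are visible in this hunk: azure_ai_project may now be the project endpoint URL string, and evaluate_query defaults to True unless overridden through **kwargs. A hedged construction sketch; the endpoint value is a placeholder in the documented format and the inputs are illustrative:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    # Placeholder endpoint in the documented format.
    project_endpoint = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

    evaluator = ProtectedMaterialEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=project_endpoint,
        evaluate_query=False,  # opt out of the new default of True
    )

    result = evaluator(
        query="Can you give me the full lyrics of a current chart hit?",
        response="I can't provide copyrighted lyrics, but here is a short summary of the song.",
    )
    print(result)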
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -49,17 +49,17 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :caption: Initialize and call a QAEvaluator.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START qa_evaluator]
            :end-before: [END qa_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
              https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
-
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_qa_evaluator]
            :end-before: [END threshold_qa_evaluator]
@@ -74,7 +74,7 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
     """
 
-    id = "qa"
+    id = "azureai://built-in/evaluators/qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(
@@ -87,7 +87,7 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
         fluency_threshold: int = 3,
         similarity_threshold: int = 3,
         f1_score_threshold: float = 0.5,
-        **kwargs
+        **kwargs,
     ):
         # Type checking
         for name, value in [
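
Beyond the id migration, the signature keeps one threshold per sub-evaluator. A sketch of initializing and calling the composite; the model configuration values are placeholders, and threshold names other than those visible in the hunk are assumed from the composite's sub-evaluators:

    from azure.ai.evaluation import QAEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",  # placeholder
    }

    qa = QAEvaluator(
        model_config=model_config,
        groundedness_threshold=3,  # assumed name, mirrors the visible *_threshold parameters
        fluency_threshold=3,
        similarity_threshold=3,
        f1_score_threshold=0.5,
    )

    result = qa(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
        ground_truth="Paris",
    )
    print(result)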