azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
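
Before relying on any of the behavior changes listed above, it can be worth confirming which version of the package an environment actually has. A minimal sketch using only the standard library (no azure-ai-evaluation APIs are assumed):

```python
# Minimal sketch: check the installed azure-ai-evaluation version before
# assuming the 1.13.0 behavior described in the hunks below.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("azure-ai-evaluation")
except PackageNotFoundError:
    installed = None

if installed is None:
    print("azure-ai-evaluation is not installed")
else:
    major, minor = (int(part) for part in installed.split(".")[:2])
    includes_changes = (major, minor) >= (1, 13)
    print(f"installed {installed}; includes the 1.13.0 changes: {includes_changes}")
```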
@@ -29,11 +29,16 @@ system:
 
 user:
 # Definition
- **Groundedness** refers to how faithfully a response adheres to the information provided in the CONTEXT, ensuring that all content is directly supported by the context without introducing unsupported information or omitting critical details. It evaluates the fidelity and precision of the response in relation to the source material.
+ **Groundedness** refers to how well a response is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the response directly and fully addresses the information without introducing unrelated or incorrect information.
+
+ > Context is the source of truth for evaluating the response.
+ > Evaluate the groundedness of the response message based on the provided context.
 
 # Ratings
- ## [Groundedness: 1] (Completely Ungrounded Response)
- **Definition:** The response is entirely unrelated to the CONTEXT, introducing topics or information that have no connection to the provided material.
+ ## [Groundedness: 1] (Completely Unrelated Response)
+ **Definition:** A response that does not relate to the context in any way.
+ - Does not relate to the context at all.
+ - Talks about the general topic but does not respond to the context.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -42,8 +47,8 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The history of ancient Egypt is fascinating and full of mysteries.
 
- ## [Groundedness: 2] (Contradictory Response)
- **Definition:** The response directly contradicts or misrepresents the information provided in the CONTEXT.
+ ## [Groundedness: 2] (Attempts to Respond but Contains Incorrect Information)
+ **Definition:** A response that attempts to relate to the context but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. Even if some points are correct, the presence of inaccuracies makes the response unreliable.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -52,18 +57,18 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The new smartphone model has a smaller display and shorter battery life.
 
- ## [Groundedness: 3] (Accurate Response with Unsupported Additions)
- **Definition:** The response accurately includes information from the CONTEXT but adds details, opinions, or explanations that are not supported by the provided material.
+ ## [Groundedness: 3] (Accurate but Vague Response)
+ **Definition:** A response that provides accurate information from the context but is overly generic or vague, not meaningfully engaging with the specific details in the context. The information is correct but lacks specificity and detail.
 
 **Examples:**
- **Context:** The company's profits increased by 20% in the last quarter.
- **Response:** The company's profits increased by 20% in the last quarter due to their aggressive marketing strategy.
+ **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
+ **Response:** The company is doing well financially.
 
- **Context:** The new smartphone model features a larger display and improved battery life.
- **Response:** The new smartphone model features a larger display, improved battery life, and comes with a free case.
+ **Context:** The new smartphone model features a larger display, improved battery life, and an upgraded camera system.
+ **Response:** The smartphone has some nice features.
 
- ## [Groundedness: 4] (Incomplete Response Missing Critical Details)
- **Definition:** The response contains information from the CONTEXT but omits essential details that are necessary for a comprehensive understanding of the main point.
+ ## [Groundedness: 4] (Partially Correct Response)
+ **Definition:** A response that provides correct information from the context but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
@@ -73,7 +78,7 @@ user:
 **Response:** The new smartphone model features a larger display and improved battery life.
 
 ## [Groundedness: 5] (Fully Grounded and Complete Response)
- **Definition:** The response is entirely based on the CONTEXT, accurately and thoroughly conveying all essential information without introducing unsupported details or omitting critical points.
+ **Definition:** A response that thoroughly and accurately conveys information from the context, including all relevant details. It directly addresses the context with precise information, demonstrating complete understanding without adding extraneous information.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
@@ -69,7 +69,9 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 model_config=model_config,
 prompty_file=prompty_path,
 result_key=self._RESULT_KEY,
+ threshold=threshold,
 credential=credential,
+ _higher_is_better=True,
 **kwargs,
 )
 
@@ -145,8 +147,10 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
 
- llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ llm_output = prompty_output_dict["llm_output"]
 # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+ score = math.nan
 if isinstance(llm_output, dict):
 score = llm_output.get("score", math.nan)
 if not check_score_is_valid(
@@ -162,16 +166,31 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 )
 reason = llm_output.get("explanation", "")
 score = float(score)
- score_result = "pass" if score >= self.threshold else "fail"
+ score_result = "pass" if score >= self._threshold else "fail"
 
 response_dict = {
 f"{self._result_key}": score,
+ f"gpt_{self._result_key}": score,
 f"{self._result_key}_result": score_result,
- f"{self._result_key}_threshold": self.threshold,
+ f"{self._result_key}_threshold": self._threshold,
 f"{self._result_key}_reason": reason,
+ f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+ f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+ f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+ f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+ f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+ f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+ f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
 }
 return response_dict
 # If llm_output is not a dictionary, return NaN for the score. This should never happen
 if logger:
 logger.warning("LLM output is not a dictionary, returning NaN for the score.")
- return {self._result_key: math.nan}
+
+ binary_result = self._get_binary_result(score)
+ return {
+ self._result_key: float(score),
+ f"gpt_{self._result_key}": float(score),
+ f"{self._result_key}_result": binary_result,
+ f"{self._result_key}_threshold": self._threshold,
+ }
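
The hunk above shows IntentResolutionEvaluator returning a richer per-result payload: a legacy `gpt_`-prefixed alias plus token usage, finish reason, model id, and sample input/output fields. The sketch below shows how a caller might read those keys; the top-level import path, the constructor arguments, and the `intent_resolution` result-key prefix are assumptions inferred from this diff rather than confirmed API details.

```python
# Hedged sketch: reading the extended result payload suggested by the hunk above.
# The "intent_resolution" key prefix and call signature are assumptions.
from azure.ai.evaluation import IntentResolutionEvaluator

model_config = {
    # Placeholder AzureOpenAIModelConfiguration-style values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<judge-deployment>",
    "api_key": "<api-key>",
}

evaluator = IntentResolutionEvaluator(model_config=model_config, threshold=3)
result = evaluator(
    query="Book a table for two at 7pm tonight.",
    response="I have booked a table for two at 7pm tonight at your preferred restaurant.",
)

print(result["intent_resolution"], result["intent_resolution_result"])
# Diagnostic fields added in this diff:
for suffix in ("prompt_tokens", "completion_tokens", "total_tokens", "finish_reason", "model"):
    print(suffix, result.get(f"intent_resolution_{suffix}"))
```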
@@ -35,6 +35,11 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 :param threshold: The threshold for the relevance evaluator. Default is 3.
 :type threshold: int
+ :param credential: The credential for authenticating to Azure AI service.
+ :type credential: ~azure.core.credentials.TokenCredential
+ :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+ This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+ :paramtype is_reasoning_model: bool
 
 .. admonition:: Example:
 
@@ -79,18 +84,17 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
 @override
- def __init__(self, model_config, *, credential=None, threshold=3):
+ def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
- self._threshold = threshold
- self._higher_is_better = True
 super().__init__(
 model_config=model_config,
 prompty_file=prompty_path,
 result_key=self._RESULT_KEY,
 threshold=threshold,
 credential=credential,
- _higher_is_better=self._higher_is_better,
+ _higher_is_better=True,
+ **kwargs,
 )
 
 @overload
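
Both RelevanceEvaluator hunks above route extra keyword arguments through `**kwargs` to the base prompty evaluator, which is how the newly documented `is_reasoning_model` flag reaches the model call. A hedged sketch of enabling it when the judge deployment is an o1/o3-style model; the configuration values are placeholders and the `relevance` result keys follow the prefix used elsewhere in this diff:

```python
# Hedged sketch: passing is_reasoning_model, which the docstring above says adjusts
# parameters such as max_completion_tokens for o1/o3-style judge deployments.
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    # Placeholder AzureOpenAIModelConfiguration-style values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "o3-mini",
    "api_key": "<api-key>",
}

relevance = RelevanceEvaluator(model_config=model_config, threshold=3, is_reasoning_model=True)
result = relevance(
    query="What amenities does the new apartment complex provide?",
    response="The apartment complex provides a gym, swimming pool, and 24/7 security.",
)
print(result["relevance"], result["relevance_result"], result["relevance_reason"])
```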
@@ -171,7 +175,8 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
 if not isinstance(eval_input["response"], str):
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
- llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ llm_output = result.get("llm_output")
 score = math.nan
 
 if isinstance(llm_output, dict):
@@ -182,15 +187,24 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 return {
 self._result_key: float(score),
 f"gpt_{self._result_key}": float(score),
- f"{self._result_key}_reason": reason,
 f"{self._result_key}_result": binary_result,
 f"{self._result_key}_threshold": self._threshold,
+ f"{self._result_key}_reason": reason,
+ f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+ f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+ f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+ f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+ f"{self._result_key}_model": result.get("model_id", ""),
+ f"{self._result_key}_sample_input": result.get("sample_input", ""),
+ f"{self._result_key}_sample_output": result.get("sample_output", ""),
 }
 
+ if logger:
+ logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
 binary_result = self._get_binary_result(score)
 return {
 self._result_key: float(score),
- f"gpt_{self._result_key}": float(score),
 f"{self._result_key}_result": binary_result,
 f"{self._result_key}_threshold": self._threshold,
 }
@@ -20,22 +20,25 @@ inputs:
 ---
 
 system:
- You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the QUERY using the definitions provided.
+ You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the user's queries in the CONVERSATION_HISTORY using the definitions provided.
 
 user:
 ROLE
 ====
- You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to a QUERY using the Relevance definitions provided.
+ You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to the CONVERSATION_HISTORY using the Relevance definitions provided.
 
 INPUT
 =====
- QUERY: {{query}}
+ CONVERSATION_HISTORY: {{query}}
 RESPONSE: {{response}}
 
+ CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message. For single-turn interactions, this will be just the user's query.
+ RESPONSE is the agent's reply to the user's latest message.
+
 TASK
 ====
 Output a JSON object with:
- 1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the query.
+ 1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the user's queries in the CONVERSATION_HISTORY.
 2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.
 
 The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
@@ -49,13 +52,14 @@ Response format exactly as follows:
 
 EVALUATION STEPS
 ================
- A. Read the QUERY and RESPONSE carefully.
- B. Compare the RESPONSE against the rubric below:
- - Does the response directly address the query?
+ A. Read the CONVERSATION_HISTORY and RESPONSE carefully.
+ B. Identify the user's query from the latest message (use conversation history for context if needed).
+ C. Compare the RESPONSE against the rubric below:
+ - Does the response directly address the user's query?
 - Is the information complete, partial, or off-topic?
 - Is it vague, generic, or insightful?
- C. Match the response to the best score from the rubric.
- D. Provide a short explanation and the score using the required format.
+ D. Match the response to the best score from the rubric.
+ E. Provide a short explanation and the score using the required format.
 
 SCORING RUBRIC
 ==============
@@ -64,7 +68,7 @@ SCORING RUBRIC
 Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.
 
 **Example A**
- QUERY: What is the team preparing for?
+ CONVERSATION_HISTORY: What is the team preparing for?
 RESPONSE: I went grocery shopping yesterday evening.
 
 Expected Output:
@@ -75,7 +79,7 @@ Expected Output:
 
 
 **Example B**
- QUERY: When will the company's new product line launch?
+ CONVERSATION_HISTORY: When will the company's new product line launch?
 RESPONSE: International travel can be very rewarding and educational.
 
 Expected Output:
@@ -89,7 +93,7 @@ Expected Output:
 Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.
 
 **Example A**
- QUERY: What is the event about?
+ CONVERSATION_HISTORY: What is the event about?
 RESPONSE: It’s something important.
 
 Expected Output:
@@ -99,7 +103,7 @@ Expected Output:
 }
 
 **Example B**
- QUERY: What’s the weather in Paris?
+ CONVERSATION_HISTORY: What’s the weather in Paris?
 RESPONSE: I tried to find the forecast but the query failed.
 
 Expected Output:
@@ -112,7 +116,7 @@ Expected Output:
 Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.
 
 **Example A**
- QUERY: What amenities does the new apartment complex provide?
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex has a gym.
 
 Expected Output:
@@ -122,7 +126,7 @@ Expected Output:
 }
 
 **Example B**
- QUERY: What services does the premium membership include?
+ CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: It includes priority customer support.
 
 Expected Output:
@@ -137,7 +141,7 @@ Expected Output:
 Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.
 
 **Example A**
- QUERY: What amenities does the new apartment complex provide?
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.
 
 Expected Output:
@@ -147,7 +151,7 @@ Expected Output:
 }
 
 **Example B**
- QUERY: What services does the premium membership include?
+ CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.
 
 Expected Output:
@@ -161,7 +165,7 @@ Expected Output:
 Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.
 
 **Example A**
- QUERY: What amenities does the new apartment complex provide?
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.
 
 Expected Output:
@@ -171,7 +175,7 @@ Expected Output:
 }
 
 **Example B**
- QUERY: What services does the premium membership include?
+ CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases — tailored for users who want quicker resolutions and first access to new features.
 
 Expected Output:
@@ -179,3 +183,16 @@ Expected Output:
 "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
 "score": 5
 }
+
+ ### Multi-turn Conversation Example
+ When evaluating responses in a multi-turn conversation, consider the conversation context to understand the user's intent:
+
+ **Example - Multi-turn Context**
+ CONVERSATION_HISTORY: [{"role":"user","content":"I'm planning a vacation to Europe."},{"role":"assistant","content":"That sounds exciting! What time of year are you thinking of traveling?"},{"role":"user","content":"Probably in July. What's the weather like then?"}]
+ RESPONSE: [{"role":"assistant","content":"July is summer in Europe with generally warm and pleasant weather. Most countries have temperatures between 20-25°C (68-77°F). It's a popular travel time, so expect crowds at major tourist attractions and higher accommodation prices."}]
+
+ Expected Output:
+ {
+ "explanation": "The response directly addresses the weather question while providing valuable context about crowds and pricing that's relevant to vacation planning established in the conversation.",
+ "score": 5
+ }
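
The multi-turn example added to the prompty above pairs with the evaluator-side change that reformats `query` as conversation history before the LLM call. Below is a hedged sketch of scoring a multi-turn exchange; the `conversation={"messages": [...]}` input shape follows the SDK's usual conversation convention and is an assumption, since this diff only shows the prompt template.

```python
# Hedged sketch: evaluating relevance over a multi-turn exchange, mirroring the
# CONVERSATION_HISTORY example added to relevance.prompty above.
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    # Placeholder AzureOpenAIModelConfiguration-style values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<judge-deployment>",
    "api_key": "<api-key>",
}

conversation = {
    "messages": [
        {"role": "user", "content": "I'm planning a vacation to Europe."},
        {"role": "assistant", "content": "That sounds exciting! What time of year are you thinking of traveling?"},
        {"role": "user", "content": "Probably in July. What's the weather like then?"},
        {"role": "assistant", "content": "July is summer in Europe with generally warm, pleasant weather."},
    ]
}

relevance = RelevanceEvaluator(model_config=model_config)
print(relevance(conversation=conversation))
```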
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 
 import os
+ import logging
 import math
 from typing import Dict, List, Union, Optional
 
@@ -14,39 +15,37 @@ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_sco
 from azure.ai.evaluation._model_configurations import Conversation, Message
 from azure.ai.evaluation._common._experimental import experimental
 
+ logger = logging.getLogger(__name__)
+
 
 @experimental
 class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
- """
- Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
- provided ground truth.
+ """Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+ provided ground truth.
+
 The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
 claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
 and relevance of the content provided.
+
 The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
 evaluation of the response's content quality.
+
 Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
 particularly in text generation tasks where conveying all essential details is crucial for clarity,
 context, and correctness.
+
 Completeness scores range from 1 to 5:
+
 1: Fully incomplete — Contains none of the necessary information.
 2: Barely complete — Contains only a small portion of the required information.
 3: Moderately complete — Covers about half of the required content.
 4: Mostly complete — Includes most of the necessary details with minimal omissions.
 5: Fully complete — Contains all key information without any omissions.
+
 :param model_config: Configuration for the Azure OpenAI model.
 :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 
- .. admonition:: Example:
-
- .. literalinclude:: ../samples/evaluation_samples_evaluate.py
- :start-after: [START completeness_evaluator]
- :end-before: [END completeness_evaluator]
- :language: python
- :dedent: 8
- :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
-
 .. admonition:: Example using Azure AI Project URL:
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 
@@ -78,12 +77,14 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 ):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
- self.threshold = threshold
+ self.threshold = threshold # to be removed in favor of _threshold
 super().__init__(
 model_config=model_config,
 prompty_file=prompty_path,
 result_key=self._RESULT_KEY,
+ threshold=threshold,
 credential=credential,
+ _higher_is_better=True,
 **kwargs,
 )
 
@@ -160,20 +161,42 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 target=ErrorTarget.COMPLETENESS_EVALUATOR,
 )
 
- llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ llm_output = result.get("llm_output") if isinstance(result, dict) else result
 
 score = math.nan
- if llm_output:
- score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+ llm_output_is_dict = isinstance(llm_output, dict)
+ if llm_output_is_dict or isinstance(llm_output, str):
+ reason = ""
+ if llm_output_is_dict:
+ score = float(llm_output.get("score", math.nan))
+ reason = llm_output.get("explanation", "")
+ else:
+ score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
 
- score_result = "pass" if score >= self.threshold else "fail"
+ binary_result = self._get_binary_result(score)
 
 # updating the result key and threshold to int based on the schema
 return {
 f"{self._result_key}": int(score),
- f"{self._result_key}_result": score_result,
- f"{self._result_key}_threshold": int(self.threshold),
+ f"{self._result_key}_result": binary_result,
+ f"{self._result_key}_threshold": int(self._threshold),
 f"{self._result_key}_reason": reason,
+ f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+ f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+ f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+ f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+ f"{self._result_key}_model": result.get("model_id", ""),
+ f"{self._result_key}_sample_input": result.get("sample_input", ""),
+ f"{self._result_key}_sample_output": result.get("sample_output", ""),
 }
 
- return {self._result_key: math.nan}
+ if logger:
+ logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+ binary_result = self._get_binary_result(score)
+ return {
+ self._result_key: float(score),
+ f"{self._result_key}_result": binary_result,
+ f"{self._result_key}_threshold": self._threshold,
+ }
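
As with the other prompty-based evaluators in this release, ResponseCompletenessEvaluator now returns a pass/fail verdict derived from `_get_binary_result` plus token-usage diagnostics. The sketch below is hedged: the top-level import, the `response`/`ground_truth` argument names (suggested by the removed sample caption), and the `response_completeness` key prefix are assumptions based on this diff.

```python
# Hedged sketch: the result shape implied by the ResponseCompletenessEvaluator hunks above.
from azure.ai.evaluation import ResponseCompletenessEvaluator

model_config = {
    # Placeholder AzureOpenAIModelConfiguration-style values.
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<judge-deployment>",
    "api_key": "<api-key>",
}

completeness = ResponseCompletenessEvaluator(model_config=model_config, threshold=3)
result = completeness(
    response="The deployment finished and all health checks passed.",
    ground_truth="The deployment finished, all health checks passed, and traffic was shifted to the new revision.",
)

print(result["response_completeness"])                    # integer score 1-5 per the schema comment above
print(result["response_completeness_result"])             # "pass"/"fail" from _get_binary_result
print(result.get("response_completeness_total_tokens"))   # usage field added in this diff
```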
@@ -33,6 +33,11 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 :param threshold: The threshold for the evaluation. Default is 3.
 :type threshold: float
+ :param credential: The credential for authenticating to Azure AI service.
+ :type credential: ~azure.core.credentials.TokenCredential
+ :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+ This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+ :paramtype is_reasoning_model: bool
 :return: A function that evaluates and generates metrics for "chat" scenario.
 :rtype: Callable
 
@@ -78,7 +83,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
 @override
- def __init__(self, model_config, *, threshold: float = 3, credential=None):
+ def __init__(self, model_config, *, threshold: float = 3, credential=None, **kwargs):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 self._threshold = threshold
@@ -90,6 +95,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 threshold=threshold,
 credential=credential,
 _higher_is_better=self._higher_is_better,
+ **kwargs,
 )
 
 @overload
@@ -30,6 +30,11 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 :param threshold: The threshold for the similarity evaluator. Default is 3.
 :type threshold: int
+ :param credential: The credential for authenticating to Azure AI service.
+ :type credential: ~azure.core.credentials.TokenCredential
+ :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+ This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+ :paramtype is_reasoning_model: bool
 
 .. admonition:: Example:
 
@@ -75,7 +80,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
 @override
- def __init__(self, model_config, *, threshold=3, credential=None):
+ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 self._threshold = threshold
@@ -87,6 +92,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
 threshold=threshold,
 credential=credential,
 _higher_is_better=self._higher_is_better,
+ **kwargs,
 )
 
 # Ignoring a mypy error about having only 1 overload function.
@@ -10,7 +10,11 @@ from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
- from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
+ from ..._common.utils import (
+ reformat_conversation_history,
+ reformat_agent_response,
+ reformat_tool_definitions,
+ )
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -40,6 +44,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 ~azure.ai.evaluation.OpenAIModelConfiguration]
 
 .. admonition:: Example:
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate.py
 :start-after: [START task_adherence_evaluator]
 :end-before: [END task_adherence_evaluator]
@@ -72,12 +77,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
- self.threshold = threshold
+ self.threshold = threshold # to be removed in favor of _threshold
 super().__init__(
 model_config=model_config,
 prompty_file=prompty_path,
 result_key=self._RESULT_KEY,
+ threshold=threshold,
 credential=credential,
+ _higher_is_better=True,
 **kwargs,
 )
 
@@ -153,19 +160,38 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
 if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
 eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
- llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+ prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+ llm_output = prompty_output_dict["llm_output"]
+
+ score = math.nan
 if isinstance(llm_output, dict):
 score = float(llm_output.get("score", math.nan))
- score_result = "pass" if score >= self.threshold else "fail"
+ score_result = "pass" if score >= self._threshold else "fail"
 reason = llm_output.get("explanation", "")
 return {
 f"{self._result_key}": score,
+ f"gpt_{self._result_key}": score,
 f"{self._result_key}_result": score_result,
- f"{self._result_key}_threshold": self.threshold,
+ f"{self._result_key}_threshold": self._threshold,
 f"{self._result_key}_reason": reason,
 # Uncomment the following line in the next iteration after UI contracts are validated.
 # f"{self._result_key}_additional_details": llm_output
+ f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+ f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+ f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+ f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+ f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+ f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+ f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
 }
 if logger:
 logger.warning("LLM output is not a dictionary, returning NaN for the score.")
- return {self._result_key: math.nan}
+
+ binary_result = self._get_binary_result(score)
+ return {
+ self._result_key: float(score),
+ f"gpt_{self._result_key}": float(score),
+ f"{self._result_key}_result": binary_result,
+ f"{self._result_key}_threshold": self._threshold,
+ }
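
One practical consequence of the fallback path above: a task adherence score can now legitimately be NaN while the `*_result` and `*_threshold` keys are still present, so downstream code should not treat a missing key as the only failure signal. A small, self-contained sketch of defensive handling (the `task_adherence` key name is assumed from the evaluator's result-key convention):

```python
# Hedged sketch: handling the NaN fallback introduced in the hunk above.
import math


def summarize(result: dict, key: str = "task_adherence") -> str:
    """Render an evaluator result, treating NaN as 'no usable score'."""
    score = result.get(key, float("nan"))
    if isinstance(score, float) and math.isnan(score):
        return f"{key}: no score (LLM output could not be parsed)"
    verdict = result.get(f"{key}_result", "unknown")
    threshold = result.get(f"{key}_threshold")
    return f"{key}: {score} ({verdict} at threshold {threshold})"


print(summarize({"task_adherence": float("nan"), "task_adherence_result": "fail", "task_adherence_threshold": 3}))
print(summarize({"task_adherence": 4.0, "task_adherence_result": "pass", "task_adherence_threshold": 3}))
```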
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._task_completion import _TaskCompletionEvaluator
+
+ __all__ = ["_TaskCompletionEvaluator"]
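
The new `_task_completion` package exports only an underscore-prefixed class, which signals an internal/experimental surface. If it needs to be exercised before a public alias appears, the import below mirrors the `__init__.py` added above; constructor arguments are not shown in this diff, so anything beyond the import would be an assumption.

```python
# Hedged sketch: importing the experimental evaluator exactly as the new __init__.py exports it.
# Underscore-prefixed, internal API; it may change or disappear without notice.
from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator

print(_TaskCompletionEvaluator.__name__)  # "_TaskCompletionEvaluator"
```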