azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
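
The hunks reproduced below cover only a subset of the files listed above (the prompty evaluator base class and the groundedness evaluator and its prompts). As a quick, assumed sanity check before relying on the 1.13.0 behavior shown in those hunks, the installed distribution can be confirmed with importlib.metadata:

# Minimal sketch (assumption: the wheel is installed under its PyPI distribution name).
from importlib.metadata import version

print(version("azure-ai-evaluation"))  # expected to print "1.13.0" after upgrading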
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -5,7 +5,8 @@
 import math
 import re
 import os
-from
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -132,10 +133,19 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
             )
-
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
-        if
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
@@ -146,6 +156,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                     f"{self._result_key}_reason": reason,
                     f"{self._result_key}_result": binary_result,
                     f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                 }
             match = re.search(r"\d", llm_output)
             if match:
@@ -156,6 +173,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                 f"gpt_{self._result_key}": float(score),
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": input_token_count,
+                f"{self._result_key}_completion_tokens": output_token_count,
+                f"{self._result_key}_total_tokens": total_token_count,
+                f"{self._result_key}_finish_reason": finish_reason,
+                f"{self._result_key}_model": model_id,
+                f"{self._result_key}_sample_input": sample_input,
+                f"{self._result_key}_sample_output": sample_output,
             }
 
         binary_result = self._get_binary_result(score)
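
The two hunks above extend each prompt-based evaluator's result dictionary with token-usage and model metadata pulled from the prompty output. A rough, illustrative sketch of the resulting shape for a groundedness-style result key (the key names come from the diff; the values here are invented placeholders):

# Illustrative only: key names mirror the hunks above, values are placeholders.
result = {
    "groundedness": 4.0,
    "gpt_groundedness": 4.0,
    "groundedness_reason": "...",
    "groundedness_result": "pass",
    "groundedness_threshold": 3,
    "groundedness_prompt_tokens": 512,      # from input_token_count
    "groundedness_completion_tokens": 64,   # from output_token_count
    "groundedness_total_tokens": 576,       # from total_token_count
    "groundedness_finish_reason": "stop",
    "groundedness_model": "gpt-4o",
    "groundedness_sample_input": "...",
    "groundedness_sample_output": "...",
}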
@@ -165,3 +189,157 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
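
The `_extract_needed_tool_definitions` helper above flattens `openapi` tool definitions into their member functions before checking that every converter-format tool call has a matching definition. A standalone sketch of just that flattening step (the sample definitions are invented; only the openapi/functions nesting mirrors the diff):

from itertools import chain

# Invented sample definitions; only the "openapi"/"functions" shape mirrors the hunk above.
tool_definitions = [
    {"type": "function", "name": "get_weather", "parameters": {}},
    {
        "type": "openapi",
        "functions": [
            {"type": "function", "name": "search_flights", "parameters": {}},
            {"type": "function", "name": "book_flight", "parameters": {}},
        ],
    },
]

tool_definitions_expanded = list(
    chain.from_iterable(
        tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
        for tool in tool_definitions
    )
)
print([t["name"] for t in tool_definitions_expanded])  # ['get_weather', 'search_flights', 'book_flight']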
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os, logging
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union, Any, Tuple
 
 from typing_extensions import overload, override
-from azure.ai.evaluation._legacy.
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
@@ -16,6 +16,7 @@ from ..._common.utils import (
     ErrorCategory,
     construct_prompty_model_config,
     validate_model_config,
+    simplify_messages,
 )
 
 try:
@@ -32,8 +33,7 @@ logger = logging.getLogger(__name__)
 
 
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
     including reasoning.
 
     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -65,6 +65,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize and call a GroundednessEvaluator.
 
     .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_groundedness_evaluator]
             :end-before: [END threshold_groundedness_evaluator]
@@ -201,18 +202,60 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
 
         if kwargs.get("query", None):
-
-            prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
-            self._prompty_file = prompty_path
-            prompty_model_config = construct_prompty_model_config(
-                validate_model_config(self._model_config),
-                self._DEFAULT_OPEN_API_VERSION,
-                UserAgentSingleton().value,
-            )
-            self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+            self._ensure_query_prompty_loaded()
 
         return super().__call__(*args, **kwargs)
 
+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
+    def _has_context(self, eval_input: dict) -> bool:
+        """
+        Return True if eval_input contains a non-empty 'context' field.
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        """
+        context = eval_input.get("context", None)
+        if not context:
+            return False
+        if context == "<>":  # Special marker for no context
+            return False
+        if isinstance(context, list):
+            return any(str(c).strip() for c in context)
+        if isinstance(context, str):
+            return bool(context.strip())
+        return True
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        if eval_input.get("query", None) is None:
+            return await super()._do_eval(eval_input)
+
+        contains_context = self._has_context(eval_input)
+
+        simplified_query = simplify_messages(eval_input["query"], drop_tool_calls=contains_context)
+        simplified_response = simplify_messages(eval_input["response"], drop_tool_calls=False)
+
+        # Build simplified input
+        simplified_eval_input = {
+            "query": simplified_query,
+            "response": simplified_response,
+            "context": eval_input["context"],
+        }
+
+        # Replace and call the parent method
+        return await super()._do_eval(simplified_eval_input)
+
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
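
The hunks above and below rework how the groundedness evaluator picks its prompty and extracts context internally. For orientation, a hedged usage sketch of the evaluator they modify (the model configuration values are placeholders; the query/response/context call pattern follows the class docstring referenced above):

from azure.ai.evaluation import GroundednessEvaluator

# Placeholder Azure OpenAI configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

groundedness = GroundednessEvaluator(model_config)
result = groundedness(
    query="When is the company's annual meeting?",
    response="The annual meeting is next Thursday.",
    context="The company's annual meeting will be held next Thursday.",
)
print(result["groundedness"], result.get("groundedness_reason"))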
@@ -236,57 +279,76 @@
         raise ex
 
     def _convert_kwargs_to_eval_input(self, **kwargs):
-        if "context"
+        if kwargs.get("context") or kwargs.get("conversation"):
             return super()._convert_kwargs_to_eval_input(**kwargs)
-
         query = kwargs.get("query")
         response = kwargs.get("response")
         tool_definitions = kwargs.get("tool_definitions")
 
-        if
-
+        if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+            self._ensure_query_prompty_loaded()
+
+        if (not query) or (not response):  # or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
             raise EvaluationException(
                 message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
             )
-
         context = self._get_context_from_agent_response(response, tool_definitions)
-        if not context:
-            raise EvaluationException(
-                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.NOT_APPLICABLE,
-                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
-            )
 
-
+        filtered_response = self._filter_file_search_results(response)
+        return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
+
+    def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out file_search tool results from the messages."""
+        file_search_ids = self._get_file_search_tool_call_ids(messages)
+        return [
+            msg for msg in messages if not (msg.get("role") == "tool" and msg.get("tool_call_id") in file_search_ids)
+        ]
 
     def _get_context_from_agent_response(self, response, tool_definitions):
+        """Extract context text from file_search tool results in the agent response."""
+        NO_CONTEXT = "<>"
         context = ""
         try:
             logger.debug("Extracting context from response")
             tool_calls = self._parse_tools_from_response(response=response)
-            logger.debug(f"Tool Calls parsed successfully
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
+
+            if not tool_calls:
+                return NO_CONTEXT
+
+            context_lines = []
+            for tool_call in tool_calls:
+                if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
+                    continue
+
+                tool_name = tool_call.get("name")
+                if tool_name != "file_search":
+                    continue
+
+                # Extract tool results
+                for result in tool_call.get("tool_result", []):
+                    results = result if isinstance(result, list) else [result]
+                    for r in results:
+                        file_name = r.get("file_name", "Unknown file name")
+                        for content in r.get("content", []):
+                            text = content.get("text")
+                            if text:
+                                context_lines.append(f"{file_name}:\n- {text}---\n\n")
+
+            context = "\n".join(context_lines) if len(context_lines) > 0 else None
+
         except Exception as ex:
             logger.debug(f"Error extracting context from agent response : {str(ex)}")
-            context =
+            context = None
+
+        context = context if context else NO_CONTEXT
+        return context
 
-
+    def _get_file_search_tool_call_ids(self, query_or_response):
+        """Return a list of tool_call_ids for file search tool calls."""
+        tool_calls = self._parse_tools_from_response(query_or_response)
+        return [tc.get("tool_call_id") for tc in tool_calls if tc.get("name") == "file_search"]
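
The new `_get_context_from_agent_response` and `_filter_file_search_results` methods read converter-format tool calls, where a `file_search` call carries its results under `tool_result`. A minimal standalone sketch of that extraction, run against an invented payload shaped like the fields the hunk reads (`type`, `name`, `tool_result`, `file_name`, `content`, `text`):

from typing import Any, Dict, List

# Invented example payload; only the field names mirror what the new evaluator code reads.
tool_calls: List[Dict[str, Any]] = [
    {
        "type": "tool_call",
        "name": "file_search",
        "tool_call_id": "call_1",
        "tool_result": [
            {
                "file_name": "bakery_menu.txt",
                "content": [{"type": "text", "text": "Croissant au Beurre: flaky, buttery croissant"}],
            }
        ],
    }
]

context_lines = []
for tool_call in tool_calls:
    if tool_call.get("type") != "tool_call" or tool_call.get("name") != "file_search":
        continue
    for result in tool_call.get("tool_result", []):
        results = result if isinstance(result, list) else [result]
        for r in results:
            file_name = r.get("file_name", "Unknown file name")
            for content in r.get("content", []):
                if content.get("text"):
                    context_lines.append(f"{file_name}:\n- {content['text']}---\n\n")

context = "\n".join(context_lines) if context_lines else "<>"  # "<>" marks "no context" in the diff
print(context)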
azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty
@@ -32,52 +32,53 @@ system:
 
 user:
 # Definition
-**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information.
+**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information.
+
+> Context is the source of truth for evaluating the response. If it's empty, rely on the tool results in the response and query.
+> Evaluate the groundedness of the response message, not the chat history.
 
 # Ratings
 ## [Groundedness: 1] (Completely Unrelated Response)
-**Definition:** An answer that does not relate to the question or the context in any way.
+**Definition:** An answer that does not relate to the question or the context in any way.
+- Does not relate to the question or context at all.
+- Talks about the general topic but does not respond to the query.
 
 **Examples:**
 **Context:** The company's annual meeting will be held next Thursday.
 **Query:** When is the company's annual meeting?
 **Response:** I enjoy hiking in the mountains during summer.
 
-**Context:** The new policy aims to reduce carbon emissions by 20% over the next five years.
-**Query:** What is the goal of the new policy?
-**Response:** My favorite color is blue.
-
-## [Groundedness: 2] (Related Topic but Does Not Respond to the Query)
-**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response.
-
-**Examples:**
 **Context:** The museum will exhibit modern art pieces from various local artists.
 **Query:** What kind of art will be exhibited at the museum?
 **Response:** Museums are important cultural institutions.
 
-
-
-**Response:** Software updates can sometimes fix bugs.
-
-## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information)
-**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details.
+## [Groundedness: 2] (Attempts to Respond but Contains Incorrect Information)
+**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. Even if some points are correct, the presence of inaccuracies makes the response unreliable.
 
 **Examples:**
-**Context:** The festival starts on June 5th and features international musicians.
+**Context:** - The festival starts on June 5th and features international musicians.
 **Query:** When does the festival start?
 **Response:** The festival starts on July 5th and features local artists.
 
-**Context:**
-**Query:**
-**Response:**
+**Context:** bakery_menu.txt: - Croissant au Beurre — flaky, buttery croissant
+**Query:** [{"role":"user","content":"Are there croissants?"}]
+**Response:** [{"role":"assistant","content":"Yes, Croissant au Beurre is on the menu, served with jam."}]
+
+## [Groundedness: 3] (Nothing to be Grounded)
+Definition: An answer that does not provide any information that can be evaluated against the context. This includes responses that are asking for clarification, providing polite fillers, or follow-up questions.
+
+**Examples:**
+**Context:**
+**Query:** [{"role":"user","content":"How many eggs are needed for the recipe?"}, {"role":"tool","content":"tool_result": [{"file_name": "recipe.txt", "content": "The recipe requires two eggs and one cup of milk."}]}, {"role":"assistant","content":"You need three eggs for the recipe."}, {"role":"user","content":"Thank you."}]
+**Response:** [{"role":"assistant","content":"You're welcome, anything else I can help with?"}]
 
 ## [Groundedness: 4] (Partially Correct Response)
 **Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding.
 
 **Examples:**
-**Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens.
-**Query:** What discount does the bookstore offer to students?
-**Response:**
+**Context:** - store_details.txt: The bookstore offers a 15% discount to students and a 10% discount to senior citizens.
+**Query:** [{"role":"user","content":"What discount does the bookstore offer to students, if any?"}]
+**Response:** [{"role":"assistant","content":"Yes, students get a discount at the bookstore."}]
 
 **Context:** The company's headquarters are located in Berlin, Germany.
 **Query:** Where are the company's headquarters?
@@ -87,13 +88,13 @@ user:
 **Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information.
 
 **Examples:**
-**
-**
-**
+**CONTEXT:** The author released her latest novel, 'The Silent Echo', on September 1st.
+**QUERY:** [{"role":"user","content":"When was 'The Silent Echo' released?"}]
+**RESPONSE:** [{"role":"assistant","content":"The 'Silent Echo' was released on September 1st."}]
 
-**Context:**
+**Context:**
 **Query:** By what date must participants register to receive early bird pricing?
-**Response:** Participants must register by May 31st to receive early bird pricing.
+**Response:** [{"role":"tool","content":"tool_result": [{"file_name": "store_guidelines.txt", "content": "Participants registering before and including May 31st will be eligible for early bird pricing."}]}, {"role":"assistant","content":"Participants must register by May 31st to receive early bird pricing."}]
 
 
 # Data
@@ -103,7 +104,7 @@ RESPONSE: {{response}}
 
 
 # Tasks
-## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and
+## Please provide your assessment Score for the previous RESPONSE message in relation to the CONTEXT, QUERY and RESPONSE tools based on the Definitions above. Your output should include the following information:
 - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
 - **Explanation**: a very short explanation of why you think the input Data should get that Score.
 - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty
@@ -29,11 +29,16 @@ system:
 
 user:
 # Definition
-**Groundedness** refers to how
+**Groundedness** refers to how well a response is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the response directly and fully addresses the information without introducing unrelated or incorrect information.
+
+> Context is the source of truth for evaluating the response.
+> Evaluate the groundedness of the response message based on the provided context.
 
 # Ratings
-## [Groundedness: 1] (Completely
-**Definition:**
+## [Groundedness: 1] (Completely Unrelated Response)
+**Definition:** A response that does not relate to the context in any way.
+- Does not relate to the context at all.
+- Talks about the general topic but does not respond to the context.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -42,8 +47,8 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The history of ancient Egypt is fascinating and full of mysteries.
 
-## [Groundedness: 2] (
-**Definition:**
+## [Groundedness: 2] (Attempts to Respond but Contains Incorrect Information)
+**Definition:** A response that attempts to relate to the context but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. Even if some points are correct, the presence of inaccuracies makes the response unreliable.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -52,18 +57,18 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The new smartphone model has a smaller display and shorter battery life.
 
-## [Groundedness: 3] (Accurate
-**Definition:**
+## [Groundedness: 3] (Accurate but Vague Response)
+**Definition:** A response that provides accurate information from the context but is overly generic or vague, not meaningfully engaging with the specific details in the context. The information is correct but lacks specificity and detail.
 
 **Examples:**
-**Context:** The company's profits increased by 20% in the last quarter.
-**Response:** The company
+**Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
+**Response:** The company is doing well financially.
 
-**Context:** The new smartphone model features a larger display
-**Response:** The
+**Context:** The new smartphone model features a larger display, improved battery life, and an upgraded camera system.
+**Response:** The smartphone has some nice features.
 
-## [Groundedness: 4] (
-**Definition:**
+## [Groundedness: 4] (Partially Correct Response)
+**Definition:** A response that provides correct information from the context but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
@@ -73,7 +78,7 @@ user:
 **Response:** The new smartphone model features a larger display and improved battery life.
 
 ## [Groundedness: 5] (Fully Grounded and Complete Response)
-**Definition:**
+**Definition:** A response that thoroughly and accurately conveys information from the context, including all relevant details. It directly addresses the context with precise information, demonstrating complete understanding without adding extraneous information.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.