azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
+import os, logging
 from typing import Dict, List, Optional, Union
 
 from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+)
 
 try:
     from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
             return "None"
 
 
+logger = logging.getLogger(__name__)
+
+
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
     id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3, **kwargs):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
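The `credential` keyword added to `__init__` above (and to the other prompty-based evaluators further down) is simply forwarded to `PromptyEvaluatorBase`. A minimal usage sketch, assuming an Azure OpenAI model configuration and `azure.identity.DefaultAzureCredential`; the endpoint and deployment values are placeholders, not taken from this release:

```python
# Illustrative only: passing the new `credential` keyword at construction time.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",  # placeholder
}

# `credential` is forwarded to PromptyEvaluatorBase by the new __init__ signature.
groundedness = GroundednessEvaluator(
    model_config,
    threshold=3,
    credential=DefaultAzureCredential(),
)
```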
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """
 
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
         return super().__call__(*args, **kwargs)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if "context" in kwargs or "conversation" in kwargs:
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if not query or not response or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        context = self._get_context_from_agent_response(response, tool_definitions)
+        if not context:
+            raise EvaluationException(
+                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                        tool_name = tool_call.get("name")
+                        for tool in tool_definitions:
+                            if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+                                if tool_name == "file_search":
+                                    tool_result = tool_call.get("tool_result")
+                                    if tool_result:
+                                        for result in tool_result:
+                                            content_list = result.get("content")
+                                            if content_list:
+                                                for content in content_list:
+                                                    text = content.get("text")
+                                                    if text:
+                                                        context = context + "\n" + str(text)
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = ""
+
+        return context if context else None
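To make the new agent-groundedness path above concrete, here is a hand-built sketch of the converter-style inputs it expects, reusing `groundedness` from the earlier sketch. The message contents and file_search output below are invented for illustration; real inputs would normally come from the AI agent converter:

```python
# Hand-built approximation of converter-style agent messages (values invented).
query = "What does the onboarding guide say about laptop setup?"

response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "file_search",
                "arguments": {"queries": ["laptop setup"]},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [
            {
                "type": "tool_result",
                "tool_result": [
                    {"content": [{"type": "text", "text": "Laptops are imaged by IT on day one."}]}
                ],
            }
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "IT images your laptop on your first day."}],
    },
]

tool_definitions = [
    {"name": "file_search", "type": "file_search", "description": "Search the indexed files."}
]

# The new overload routes these through _convert_kwargs_to_eval_input, which pulls the
# file_search results into `context` before scoring groundedness.
result = groundedness(query=query, response=response, tool_definitions=tool_definitions)
```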
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(
+    def __init__(
+        self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold: float = 3):
+    def __init__(self, model_config, *, threshold: float = 3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
        )
 
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
 import re
-from typing import Dict, List, Union, TypeVar,
+from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
@@ -16,12 +17,46 @@ from azure.ai.evaluation._exceptions import (
 )
 from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental
+from ..._converters._models import (
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+)
 
 logger = logging.getLogger(__name__)
 
 T_EvalValue = TypeVar("T_EvalValue")
 
 
+def _get_built_in_definition(tool_name: str):
+    """Get the definition for the built-in tool."""
+    if tool_name in _BUILT_IN_DESCRIPTIONS:
+        return {
+            "type": tool_name,
+            "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+            "name": tool_name,
+            "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+        }
+    return None
+
+
+def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+    """Extract tool definitions needed for the given built-in tool calls."""
+    needed_definitions = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            tool_type = tool_call.get("type")
+
+            # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+            if tool_type == "tool_call":
+                tool_name = tool_call.get("name")
+                if tool_name in _BUILT_IN_DESCRIPTIONS:
+                    built_in_def = _get_built_in_definition(tool_name)
+                    if built_in_def and built_in_def not in needed_definitions:
+                        needed_definitions.append(built_in_def)
+
+    return needed_definitions
+
+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
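The helpers above let built-in tools be scored without user-supplied definitions: a converter-format call whose `name` appears in `_BUILT_IN_DESCRIPTIONS` gets a definition synthesized from `_BUILT_IN_DESCRIPTIONS`/`_BUILT_IN_PARAMS`. A hedged usage sketch, reusing `model_config` and `DefaultAzureCredential` from the first sketch; `bing_custom_search` is taken from the comment in the diff and may or may not be in the built-in set of your installed version:

```python
# Illustrative only: scoring a converter-format built-in tool call without passing
# its definition. The tool name below is an assumption, not confirmed by this diff.
from azure.ai.evaluation import ToolCallAccuracyEvaluator

tool_accuracy = ToolCallAccuracyEvaluator(model_config, credential=DefaultAzureCredential())

result = tool_accuracy(
    query="Find the latest Azure SDK release notes.",
    tool_calls=[
        {
            "type": "tool_call",
            "name": "bing_custom_search",
            "arguments": {"query": "azure sdk release notes"},
        }
    ],
    # tool_definitions may be omitted here: it now defaults to an empty list and a
    # definition for recognized built-ins is synthesized by _get_needed_built_in_definitions.
)
```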
@@ -88,7 +123,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -96,6 +131,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            credential=credential,
             **kwargs,
         )
 
@@ -153,10 +189,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
         tool_calls = kwargs.get("tool_calls")
-        tool_definitions = kwargs.get("tool_definitions")
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
         query = kwargs.get("query")
         response = kwargs.get("response")
-
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
         if response:
             parsed_tool_calls = self._parse_tools_from_response(response)
@@ -165,20 +200,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 
         if not tool_calls:
             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
-        if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
         if not isinstance(tool_definitions, list):
-            tool_definitions = [tool_definitions]
+            tool_definitions = [tool_definitions] if tool_definitions else []
 
         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self.
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         return {
             "query": query,
@@ -268,66 +306,72 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             "details": {},
         }
 
-    def _parse_tools_from_response(self, response):
-        """Parse the response to extract tool calls and results.
-        :param response: The response to parse.
-        :type response: Union[str, List[dict]]
-        :return: List of tool calls extracted from the response.
-        :rtype: List[dict]
-        """
-        tool_calls = []
-        tool_results_map = {}
-        if isinstance(response, list):
-            for message in response:
-                # Extract tool calls from assistant messages
-                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
-                    for content_item in message.get("content"):
-                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
-
-                # Extract tool results from tool messages
-                elif message.get("role") == "tool" and message.get("tool_call_id"):
-                    tool_call_id = message.get("tool_call_id")
-                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
-                        result_content = message.get("content")[0]
-                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
-                            tool_results_map[tool_call_id] = result_content
-
-        # Attach results to their corresponding calls
-        for tool_call in tool_calls:
-            tool_call_id = tool_call.get("tool_call_id")
-            if tool_call_id in tool_results_map:
-                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
-
-        return tool_calls
-
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
-        """Extract the tool definitions that are needed for the provided tool calls.
-        :param tool_calls: List of tool calls to evaluate.
-        :type tool_calls: List[dict]
-        :param tool_definitions: List of tool definitions to use for evaluation.
-        :type tool_definitions: List[dict]
-        :return: List of tool definitions that are needed for the provided tool calls.
-        :rtype: List[dict]
-        """
+        """Extract the tool definitions that are needed for the provided tool calls."""
         needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
-            if isinstance(tool_call, dict)
-
-
-
-
-            if
-
-
-
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        )
                 else:
+                    # Unsupported tool format - only converter format is supported
                     raise EvaluationException(
-                        message=f"
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
                     )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
         return needed_tool_definitions
 
     @override
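The `chain.from_iterable` expansion in the hunk above flattens "openapi" tool bundles before validation. A small standalone illustration of just that step, using hypothetical tool definitions rather than the library's own data:

```python
# Standalone illustration of the openapi-expansion step: an "openapi" tool bundles
# several functions, so the evaluator flattens the bundle before matching tool calls
# against definitions by name and type.
from itertools import chain

tool_definitions = [
    {
        "type": "openapi",
        "name": "weather_api",  # hypothetical bundle
        "functions": [
            {"type": "function", "name": "get_forecast", "parameters": {}},
            {"type": "function", "name": "get_alerts", "parameters": {}},
        ],
    },
    {"type": "function", "name": "send_email", "parameters": {}},  # regular function tool
]

expanded = list(
    chain.from_iterable(
        tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
        for tool in tool_definitions
    )
)

# expanded now holds get_forecast, get_alerts, and send_email as flat function
# definitions, which is what the validation loop matches tool calls against.
print([tool["name"] for tool in expanded])
```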