azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff shows the changes between these two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluate/_utils.py
+++ b/azure/ai/evaluation/_evaluate/_utils.py
@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )
 
@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 
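A hypothetical usage sketch (not part of the diff): the new tags dictionary added above is plumbed through the run-logging helpers, and the public evaluate() entry point (also touched in _evaluate.py in this release) is assumed here to accept and forward it. Endpoint, deployment, and dataset values are placeholders.

    # Assumption: evaluate() accepts and forwards the new `tags` dict shown above.
    from azure.ai.evaluation import evaluate, CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<key>",                                       # placeholder
    }

    result = evaluate(
        data="eval_data.jsonl",  # placeholder JSONL dataset
        evaluators={"coherence": CoherenceEvaluator(model_config)},
        tags={"team": "search", "experiment": "baseline-v2"},  # logged with the evaluation run
    )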
--- a/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
+++ b/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
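A minimal construction sketch (assumption, not shown in the diff): with the new credential keyword, prompty-based evaluators such as CoherenceEvaluator can authenticate with an Entra ID token credential instead of an API key. Endpoint and deployment values are placeholders.

    # Assumption: keyless auth via the new `credential` parameter added above.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
    }

    coherence = CoherenceEvaluator(model_config, threshold=3, credential=DefaultAzureCredential())
    print(coherence(query="What is 2 + 2?", response="2 + 2 equals 4."))

The same credential keyword is added to the other prompty-based evaluators further down in this diff (fluency, groundedness, intent resolution, relevance, response completeness, retrieval, similarity, task adherence).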
--- a/azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-        return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
     def _derive_conversation_converter(
         self,
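To illustrate the overload matching added above, here is a standalone sketch of the same best-subset scoring (the two overload signatures are hypothetical and do not come from the diff):

    # Mirrors _get_matching_overload_inputs: prefer the largest overload whose
    # parameters are all provided; otherwise fall back to the largest overlap.
    from typing import Dict, List

    overload_inputs: List[List[str]] = [
        ["query", "response"],                      # hypothetical text overload
        ["query", "response", "tool_definitions"],  # hypothetical agent overload
    ]

    def match_overload(provided_kwargs: Dict[str, object]) -> List[str]:
        provided = {k for k, v in provided_kwargs.items() if v is not None}
        exact = [inputs for inputs in overload_inputs if set(inputs).issubset(provided)]
        if exact:
            return max(exact, key=len)
        return max(overload_inputs, key=lambda inputs: len(set(inputs) & provided))

    print(match_overload({"query": "q", "response": "r"}))
    # ['query', 'response']
    print(match_overload({"query": "q", "response": [{"role": "assistant"}], "tool_definitions": [{}]}))
    # ['query', 'response', 'tool_definitions']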
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+            # Attach results to their corresponding calls
+            for tool_call in tool_calls:
+                tool_call_id = tool_call.get("tool_call_id")
+                if tool_call_id in tool_results_map:
+                    tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
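For reference, a hypothetical agent response in the message shape that _parse_tools_from_response walks (the field names follow the code above; the values are invented):

    # A tool call in an assistant message, paired by tool_call_id with a tool
    # message carrying a tool_result; the helper attaches the result to the call.
    agent_response = [
        {
            "role": "assistant",
            "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
                 "arguments": {"queries": ["refund policy"]}},
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [
                {"type": "tool_result",
                 "tool_result": [{"content": [{"type": "text", "text": "Refunds are issued within 30 days."}]}]},
            ],
        },
    ]
    # _parse_tools_from_response(agent_response) would return the tool_call dict
    # with its matching result attached under the "tool_result" key.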
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )
 
         self._flow = AsyncPrompty.load(
-            source=self._prompty_file,
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )
 
         # __call__ not overridden here because child classes have such varied signatures that there's no point
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)
 
-        if "context" in self.
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
--- a/azure/ai/evaluation/_evaluators/_fluency/_fluency.py
+++ b/azure/ai/evaluation/_evaluators/_fluency/_fluency.py
@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
+import os, logging
 from typing import Dict, List, Optional, Union
 
 from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+)
 
 try:
     from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
             return "None"
 
 
+logger = logging.getLogger(__name__)
+
+
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
     id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3, **kwargs):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """
 
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
         return super().__call__(*args, **kwargs)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if "context" in kwargs or "conversation" in kwargs:
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if not query or not response or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        context = self._get_context_from_agent_response(response, tool_definitions)
+        if not context:
+            raise EvaluationException(
+                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                        tool_name = tool_call.get("name")
+                        for tool in tool_definitions:
+                            if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+                                if tool_name == "file_search":
+                                    tool_result = tool_call.get("tool_result")
+                                    if tool_result:
+                                        for result in tool_result:
+                                            content_list = result.get("content")
+                                            if content_list:
+                                                for content in content_list:
+                                                    text = content.get("text")
+                                                    if text:
+                                                        context = context + "\n" + str(text)
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = ""
+
+        return context if context else None
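A hypothetical call against the new agent overload above: only file_search output is used as grounding context (per _SUPPORTED_TOOLS), and when no supported tool was called the evaluator returns a "not applicable" result instead of a score (see _real_call above). Configuration values and data are placeholders.

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<key>",                                       # placeholder
    }

    groundedness = GroundednessEvaluator(model_config)
    result = groundedness(
        query="What is the refund window?",
        response=[
            {"role": "assistant", "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
                 "arguments": {"queries": ["refund window"]}},
            ]},
            {"role": "tool", "tool_call_id": "call_1", "content": [
                {"type": "tool_result",
                 "tool_result": [{"content": [{"type": "text", "text": "Refunds are accepted within 30 days."}]}]},
            ]},
            {"role": "assistant", "content": [
                {"type": "text", "text": "You can request a refund within 30 days of purchase."},
            ]},
        ],
        tool_definitions=[{"name": "file_search", "type": "file_search"}],
    )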
--- a/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
+++ b/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
--- a/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
+++ b/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
+++ b/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(
+    def __init__(
+        self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
--- a/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
+++ b/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold: float = 3):
+    def __init__(self, model_config, *, threshold: float = 3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
+++ b/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
+++ b/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(