azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py

@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
  project_url: str,
  evaluation_name: Optional[str],
  name_map: Dict[str, str],
+ tags: Optional[Dict[str, str]] = None,
  **kwargs,
  ) -> Optional[str]:

@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
  evaluation=EvaluationUpload(
  display_name=evaluation_name,
  properties=properties,
+ tags=tags,
  )
  )

@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
  run: Optional[Run],
  evaluation_name: Optional[str],
  name_map: Dict[str, str],
+ tags: Optional[Dict[str, str]] = None,
  **kwargs,
  ) -> Optional[str]:
  from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
  workspace_name=ws_triad.workspace_name,
  management_client=management_client,
  promptflow_run=run,
+ tags=tags,
  ) as ev_run:
  artifact_name = EvalRun.EVALUATION_ARTIFACT

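The hunks above thread a new optional `tags` mapping into both run-logging helpers and onto the EvaluationUpload / EvalRun objects. A minimal usage sketch, assuming the public `evaluate` entry point (whose +13 -4 change in _evaluate.py is not expanded here) forwards a `tags` keyword down to these helpers; the endpoint, project, and tag values are placeholders:

    # Sketch only: the `tags` keyword on evaluate() is assumed to be the public surface
    # for the logging changes patched above; all placeholder values must be replaced.
    from azure.ai.evaluation import evaluate, CoherenceEvaluator, AzureOpenAIModelConfiguration

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        azure_deployment="<your-deployment>",
        api_key="<api-key>",
    )

    result = evaluate(
        data="eval_data.jsonl",  # placeholder JSONL with query/response columns
        evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
        azure_ai_project={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        tags={"experiment": "baseline", "owner": "eval-team"},  # assumed kwarg; see _evaluate.py change
    )
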
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=3):
+ def __init__(self, model_config, *, threshold=3, credential=None):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )

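This release adds an optional `credential` keyword to the prompty-based evaluators (the same pattern repeats below for Fluency, Relevance, Similarity, Retrieval, Groundedness, IntentResolution, ResponseCompleteness, and TaskAdherence); the base class forwards it to `AsyncPrompty.load` as `token_credential` in `_base_prompty_eval.py`. A minimal construction sketch, assuming an Entra ID setup where the model configuration carries no API key; the endpoint and deployment names are placeholders:

    # Sketch: pass a TokenCredential so the prompty flow authenticates with Entra ID
    # instead of an API key. Placeholder endpoint/deployment values must be replaced.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        azure_deployment="<your-deployment>",
    )

    coherence = CoherenceEvaluator(model_config=model_config, credential=DefaultAzureCredential())
    print(coherence(query="What is the capital of France?", response="Paris is the capital of France."))
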
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

  # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

- def _derive_singleton_inputs(self) -> List[str]:
+ def _derive_singleton_inputs(self) -> List[List[str]]:
  """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
  when the evaluator is being used in a non-conversation context.
  By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
  Thankfully this works the way you'd hope, with the call_signature being based on the child
  function's signature, not the parent's.

- :return: A list of strings representing the names of singleton inputs.
- :rtype: List[str]
+ :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+ :rtype: List[List[str]]
  """

  overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  call_signatures = [inspect.signature(self.__call__)]
  else:
  call_signatures = [inspect.signature(overload) for overload in overloads]
- call_signature = inspect.signature(self.__call__)
- singletons = []
+
+ overload_inputs = []
  for call_signature in call_signatures:
  params = call_signature.parameters
  if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
  continue
  # exclude self since it is not a singleton input
- singletons.extend([p for p in params if p != "self"])
- return singletons
+ overload_inputs.append([p for p in params if p != "self"])
+ return overload_inputs
+
+ def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+ """Find the overload that matches the provided kwargs and return its input parameters.
+
+ :keyword kwargs: The keyword arguments to match against overloads.
+ :type kwargs: Dict
+ :return: List of input parameter names for the matching overload.
+ :rtype: List[str]
+ """
+ overload_inputs = self._singleton_inputs
+ provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+ # Find the overload that best matches the provided parameters
+ best_match = None
+ best_score = -1
+
+ for inputs in overload_inputs:
+ input_set = set(inputs)
+
+ # Calculate match score: how many of the overload's params are provided
+ if input_set.issubset(provided_keys):
+ score = len(input_set)
+ if score > best_score:
+ best_score = score
+ best_match = inputs
+
+ # If exact match found, return it
+ if best_match is not None:
+ return best_match
+
+ # If no exact match, find the overload with the most overlap
+ for inputs in overload_inputs:
+ input_set = set(inputs)
+ overlap = len(input_set.intersection(provided_keys))
+ if overlap > best_score:
+ best_score = overlap
+ best_match = inputs
+
+ # Return the best match or the first overload as fallback
+ return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+ def _get_all_singleton_inputs(self) -> List[str]:
+ """Get a flattened list of all possible singleton inputs across all overloads.
+
+ :return: Flattened list of all singleton input names.
+ :rtype: List[str]
+ """
+ all_inputs = set()
+ for inputs in self._singleton_inputs:
+ all_inputs.update(inputs)
+ return list(all_inputs)

  def _derive_conversation_converter(
  self,
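With `_derive_singleton_inputs` now returning one parameter list per `__call__` overload, `_get_matching_overload_inputs` selects the overload whose full parameter set is covered by the provided kwargs (preferring the largest such set) and otherwise falls back to the overload with the greatest overlap. A standalone toy restatement of that selection rule, outside the SDK class and with hypothetical overload signatures:

    # Toy restatement of the selection rule above; the overload parameter lists are invented.
    from typing import Dict, List, Optional

    def pick_overload(overloads: List[List[str]], provided: Dict[str, object]) -> Optional[List[str]]:
        provided_keys = {k for k, v in provided.items() if v is not None}
        # Prefer the largest overload whose parameters are all provided.
        exact = [o for o in overloads if set(o).issubset(provided_keys)]
        if exact:
            return max(exact, key=len)
        # Otherwise fall back to the overload with the greatest overlap with the provided keys.
        return max(overloads, key=lambda o: len(set(o) & provided_keys), default=None)

    overloads = [["query", "response"], ["query", "response", "tool_definitions"]]
    print(pick_overload(overloads, {"query": "q", "response": "r"}))
    # -> ['query', 'response']
    print(pick_overload(overloads, {"query": "q", "response": "r", "tool_definitions": [{"name": "file_search"}]}))
    # -> ['query', 'response', 'tool_definitions']
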
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  :return: The function that will be used to convert conversations to evaluable inputs.
  :rtype: Callable
  """
- include_context = "context" in self._singleton_inputs
- include_query = "query" in self._singleton_inputs
- include_response = "response" in self._singleton_inputs
- include_ground_truth = "ground_truth" in self._singleton_inputs
+ all_singleton_inputs = self._get_all_singleton_inputs()
+ include_context = "context" in all_singleton_inputs
+ include_query = "query" in all_singleton_inputs
+ include_response = "response" in all_singleton_inputs
+ include_ground_truth = "ground_truth" in all_singleton_inputs

  def converter(conversation: Dict) -> List[DerivedEvalInput]:
  messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  (like a query and response), or they receive conversation that iss a list of dictionary
  values.

- The self._singleton_inputs list assigned during initialization is used to find and extract
- singleton keywords, and self._allow_conversation_input is used to determine if a conversation
- is a valid input.
+ The self._singleton_inputs list (containing overload signatures) assigned during initialization
+ is used to find and extract singleton keywords, and determine which overload matches the
+ provided arguments.

  If both conversations and singletons are allowed, the function will raise an exception if both
  are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  conversation = kwargs.get("conversation", None)
  singletons = {}
  if len(self._singleton_inputs) > 0:
- singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+ # Get all possible singleton inputs and check what's provided
+ all_singleton_inputs = self._get_all_singleton_inputs()
+ singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
  # Check that both conversation and other inputs aren't set
  if conversation is not None and any(singletons.values()):
  msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  if self._is_multi_modal_conversation(conversation):
  return self._derive_multi_modal_conversation_converter()(conversation)
  return self._derive_conversation_converter()(conversation)
- # Handle Singletons
- required_singletons = remove_optional_singletons(self, singletons)
- if all(value is not None for value in required_singletons.values()):
- return [singletons]
+
+ # Handle Singletons - find matching overload
+ matching_inputs = self._get_matching_overload_inputs(**kwargs)
+ if matching_inputs:
+ # Check if all required inputs for this overload are provided
+ required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+ required_singletons = remove_optional_singletons(self, required_singletons)
+ if all(value is not None for value in required_singletons.values()):
+ return [singletons]
+
  # Missing input
  msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
  raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  aggregated["evaluation_per_turn"] = evaluation_per_turn
  return aggregated

+ def _parse_tools_from_response(self, response):
+ """Parse the response to extract tool calls and results.
+ :param response: The response to parse.
+ :type response: Union[str, List[dict]]
+ :return: List of tool calls extracted from the response.
+ :rtype: List[dict]
+ """
+ tool_calls = []
+ tool_results_map = {}
+ if isinstance(response, list):
+ for message in response:
+ # Extract tool calls from assistant messages
+ if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+ for content_item in message.get("content"):
+ if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+ tool_calls.append(content_item)
+
+ # Extract tool results from tool messages
+ elif message.get("role") == "tool" and message.get("tool_call_id"):
+ tool_call_id = message.get("tool_call_id")
+ if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+ result_content = message.get("content")[0]
+ if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+ tool_results_map[tool_call_id] = result_content
+
+ # Attach results to their corresponding calls
+ for tool_call in tool_calls:
+ tool_call_id = tool_call.get("tool_call_id")
+ if tool_call_id in tool_results_map:
+ tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+ return tool_calls
+
  async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """The asynchronous call where real end-to-end evaluation logic is performed.

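The new `_parse_tools_from_response` helper walks an agent-style message list, collects `tool_call` content items from assistant messages, and joins each one with the `tool_result` from the tool message that shares its `tool_call_id`. A sketch of the message shape it expects, with field names taken from the checks above and all concrete values invented:

    # Illustrative input, shaped to satisfy the checks in _parse_tools_from_response above.
    # All concrete values (ids, arguments, text) are made up.
    response = [
        {
            "role": "assistant",
            "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
                 "arguments": {"query": "refund policy"}},
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [
                {"type": "tool_result",
                 "tool_result": [{"content": [{"text": "Refunds are issued within 14 days."}]}]},
            ],
        },
    ]
    # evaluator._parse_tools_from_response(response) would return the single tool_call dict,
    # now carrying a "tool_result" key copied from the matching tool message.
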
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -5,7 +5,7 @@
  import math
  import re
  import os
- from typing import Dict, TypeVar, Union
+ from typing import Dict, Optional, TypeVar, Union

  if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
  from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
  from azure.ai.evaluation._legacy.prompty import AsyncPrompty
  from typing_extensions import override

+ from azure.core.credentials import TokenCredential
  from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
  model_config: dict,
  eval_last_turn: bool = False,
  threshold: int = 3,
+ credential: Optional[TokenCredential] = None,
  _higher_is_better: bool = False,
  **kwargs,
  ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
  )

  self._flow = AsyncPrompty.load(
- source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+ source=self._prompty_file,
+ model=prompty_model_config,
+ token_credential=credential,
+ is_reasoning_model=self._is_reasoning_model,
  )

  # __call__ not overridden here because child classes have such varied signatures that there's no point
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
  if query is not None and self._evaluate_query:
  input_data["query"] = str(query)

- if "context" in self._singleton_inputs:
+ if "context" in self._get_all_singleton_inputs():
  context = eval_input.get("context", None)
  if context is None:
  raise EvaluationException(
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=3):
+ def __init__(self, model_config, *, credential=None, threshold=3):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -1,7 +1,7 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- import os
+ import os, logging
  from typing import Dict, List, Optional, Union

  from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
- from ..._common.utils import construct_prompty_model_config, validate_model_config
+ from ..._common.utils import (
+ ErrorBlame,
+ ErrorTarget,
+ EvaluationException,
+ ErrorCategory,
+ construct_prompty_model_config,
+ validate_model_config,
+ )

  try:
  from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
  return "None"


+ logger = logging.getLogger(__name__)
+
+
  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """
  Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
  _RESULT_KEY = "groundedness"
  _OPTIONAL_PARAMS = ["query"]
+ _SUPPORTED_TOOLS = ["file_search"]

  id = "azureai://built-in/evaluators/groundedness"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=3, **kwargs):
+ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY) # Default to no query

@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )
  self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :rtype: Dict[str, float]
  """

+ @overload
+ def __call__(
+ self,
+ *,
+ query: str,
+ response: List[dict],
+ tool_definitions: List[dict],
+ ) -> Dict[str, Union[str, float]]:
+ """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+ :keyword query: The query to be evaluated.
+ :paramtype query: str
+ :keyword response: The response from the agent to be evaluated.
+ :paramtype response: List[dict]
+ :keyword tool_definitions: The tool definitions used by the agent.
+ :paramtype tool_definitions: List[dict]
+ :return: The groundedness score.
+ :rtype: Dict[str, Union[str, float]]
+ """
+
  @overload
  def __call__(
  self,
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

  return super().__call__(*args, **kwargs)
+
+ async def _real_call(self, **kwargs):
+ """The asynchronous call where real end-to-end evaluation logic is performed.
+
+ :keyword kwargs: The inputs to evaluate.
+ :type kwargs: Dict
+ :return: The evaluation result.
+ :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+ """
+ # Convert inputs into list of evaluable inputs.
+ try:
+ return await super()._real_call(**kwargs)
+ except EvaluationException as ex:
+ if ex.category == ErrorCategory.NOT_APPLICABLE:
+ return {
+ self._result_key: self._NOT_APPLICABLE_RESULT,
+ f"{self._result_key}_result": "pass",
+ f"{self._result_key}_threshold": self.threshold,
+ f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+ }
+ else:
+ raise ex
+
+ def _convert_kwargs_to_eval_input(self, **kwargs):
+ if "context" in kwargs or "conversation" in kwargs:
+ return super()._convert_kwargs_to_eval_input(**kwargs)
+
+ query = kwargs.get("query")
+ response = kwargs.get("response")
+ tool_definitions = kwargs.get("tool_definitions")
+
+ if not query or not response or not tool_definitions:
+ msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+ raise EvaluationException(
+ message=msg,
+ blame=ErrorBlame.USER_ERROR,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+ )
+
+ context = self._get_context_from_agent_response(response, tool_definitions)
+ if not context:
+ raise EvaluationException(
+ message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+ blame=ErrorBlame.USER_ERROR,
+ category=ErrorCategory.NOT_APPLICABLE,
+ target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+ )
+
+ return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+ def _get_context_from_agent_response(self, response, tool_definitions):
+ context = ""
+ try:
+ logger.debug("Extracting context from response")
+ tool_calls = self._parse_tools_from_response(response=response)
+ logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+ if tool_calls:
+ for tool_call in tool_calls:
+ if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+ tool_name = tool_call.get("name")
+ for tool in tool_definitions:
+ if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+ if tool_name == "file_search":
+ tool_result = tool_call.get("tool_result")
+ if tool_result:
+ for result in tool_result:
+ content_list = result.get("content")
+ if content_list:
+ for content in content_list:
+ text = content.get("text")
+ if text:
+ context = context + "\n" + str(text)
+ except Exception as ex:
+ logger.debug(f"Error extracting context from agent response : {str(ex)}")
+ context = ""
+
+ return context if context else None
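Taken together with the base-class changes, GroundednessEvaluator can now score an agent response when a supported tool was called (currently only `file_search`): context is pulled from the file_search results, the last message is treated as the response, and a "not applicable" pass result is returned when no supported tool ran. A hedged usage sketch; the endpoint, deployment, and every concrete message value are invented placeholders:

    # Usage sketch for the new agent overload shown above; replace all placeholders.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        azure_deployment="<your-deployment>",
        api_key="<api-key>",
    )
    groundedness = GroundednessEvaluator(model_config=model_config)

    query = "What is the refund window?"
    response = [
        {"role": "assistant", "content": [
            {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
             "arguments": {"query": "refund window"}},
        ]},
        {"role": "tool", "tool_call_id": "call_1", "content": [
            {"type": "tool_result",
             "tool_result": [{"content": [{"text": "Refunds are accepted within 14 days of purchase."}]}]},
        ]},
        {"role": "assistant", "content": [
            {"type": "text", "text": "You can request a refund within 14 days of purchase."},
        ]},
    ]
    tool_definitions = [{"name": "file_search", "type": "file_search",
                         "description": "Search the uploaded files."}]

    result = groundedness(query=query, response=response, tool_definitions=tool_definitions)
    # Returns the usual groundedness keys; if no supported tool was called, the evaluator
    # returns a not-applicable pass result instead of raising (see _real_call above).
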
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+ super().__init__(
+ model_config=model_config,
+ prompty_file=prompty_path,
+ result_key=self._RESULT_KEY,
+ credential=credential,
+ **kwargs,
+ )

  @overload
  def __call__(
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=3):
+ def __init__(self, model_config, *, credential=None, threshold=3):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )

azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
+ def __init__(
+ self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+ ):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+ super().__init__(
+ model_config=model_config,
+ prompty_file=prompty_path,
+ result_key=self._RESULT_KEY,
+ credential=credential,
+ **kwargs,
+ )

  @overload
  def __call__(
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=super-init-not-called
+ def __init__(self, model_config, *, threshold: float = 3, credential=None):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )

azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=3):
+ def __init__(self, model_config, *, threshold=3, credential=None):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
+ credential=credential,
  _higher_is_better=self._higher_is_better,
  )

azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+ super().__init__(
+ model_config=model_config,
+ prompty_file=prompty_path,
+ result_key=self._RESULT_KEY,
+ credential=credential,
+ **kwargs,
+ )

  @overload
  def __call__(