azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,76 @@
+ from abc import ABC
+ from typing import Dict, List, Optional, Any
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class EvaluatorMetric:
+     type: str = "ordinal"
+     desirable_direction: Optional[str] = None
+     min_value: Optional[float] = None
+     max_value: Optional[float] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         result = {"type": self.type}
+         if self.desirable_direction is not None:
+             result["desirable_direction"] = self.desirable_direction
+         if self.min_value is not None:
+             result["min_value"] = self.min_value
+         if self.max_value is not None:
+             result["max_value"] = self.max_value
+         return result
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+         return cls(
+             type=data.get("type", "ordinal"),
+             desirable_direction=data.get("desirable_direction"),
+             min_value=data.get("min_value"),
+             max_value=data.get("max_value"),
+         )
+
+
+ @dataclass
+ class ObjectParameterDescriptorWithRequired:
+     required: List[str] = field(default_factory=list)
+     type: str = "object"
+     properties: Dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {"required": self.required, "type": self.type, "properties": self.properties}
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+         return cls(
+             required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+         )
+
+
+ class EvaluatorDefinition(ABC):
+     """Base class for evaluator definitions"""
+
+     def __init__(self):
+         self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+         self.metrics: Dict[str, EvaluatorMetric] = {}
+         self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+         self.type: str = "unknown"
+
+     def to_dict(self) -> Dict[str, Any]:
+         result = {
+             "type": self.type,
+             "init_parameters": self.init_parameters.to_dict(),
+             "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+             "data_schema": self.data_schema.to_dict(),
+         }
+         return result
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+         # Create a generic instance since specific subclasses are not defined
+         instance = cls.__new__(cls)
+         instance.__init__()
+
+         instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+         instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+         instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+         return instance
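For orientation, a minimal usage sketch of the new `_evaluator_definition` module follows. The subclass name, metric name, and field values are invented for illustration only; the module is private and its API may change.

```python
# Hypothetical sketch: shows how the dataclasses in the new
# azure/ai/evaluation/_evaluator_definition.py compose and round-trip
# through to_dict()/from_dict(). Names and values are made up.
from azure.ai.evaluation._evaluator_definition import (
    EvaluatorDefinition,
    EvaluatorMetric,
    ObjectParameterDescriptorWithRequired,
)


class SketchEvaluatorDefinition(EvaluatorDefinition):
    """Illustrative subclass; real evaluators would declare their own schema."""

    def __init__(self):
        super().__init__()
        self.type = "prompt"  # placeholder type string
        self.metrics = {
            "coherence": EvaluatorMetric(
                type="ordinal", desirable_direction="increase", min_value=1, max_value=5
            )
        }
        self.data_schema = ObjectParameterDescriptorWithRequired(
            required=["query", "response"],
            properties={"query": {"type": "string"}, "response": {"type": "string"}},
        )


definition = SketchEvaluatorDefinition()
payload = definition.to_dict()  # plain-dict form of the definition
restored = SketchEvaluatorDefinition.from_dict(payload)  # rebuilds dataclass-backed objects
assert restored.to_dict() == payload
```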
@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
              https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

      .. admonition:: Example with Threshold:
+
          .. literalinclude:: ../samples/evaluation_samples_threshold.py
              :start-after: [START threshold_bleu_score_evaluator]
              :end-before: [END threshold_bleu_score_evaluator]
@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
      :param kwargs: Additional arguments to pass to the evaluator.
      :type kwargs: Any

-     .. admonition:: Example:
-
-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START code_vulnerability_evaluator]
-             :end-before: [END code_vulnerability_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-             :start-after: [START code_vulnerability_evaluator]
-             :end-before: [END code_vulnerability_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
      .. note::

          If this evaluator is supplied to the `evaluate` function, the metric
@@ -23,6 +23,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the coherence evaluator. Default is 3.
      :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

      .. admonition:: Example:

@@ -66,7 +71,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, threshold=3, credential=None):
+     def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._threshold = threshold
@@ -78,6 +83,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )

      @overload
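As a usage illustration of the widened constructor, the sketch below passes the new `is_reasoning_model` flag through `**kwargs`. The endpoint, key, and deployment values are placeholders, and the exact `model_config` shape should be checked against the installed version's documentation.

```python
# Illustrative only: endpoint/key/deployment are placeholders; passing
# is_reasoning_model assumes the reasoning-model support described in the
# updated docstring above.
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

coherence = CoherenceEvaluator(model_config, threshold=3, is_reasoning_model=True)
result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
# Expected keys include coherence, coherence_result, coherence_threshold, plus the
# token/usage fields added in _base_prompty_eval.py later in this diff.
print(result)
```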
@@ -4,12 +4,15 @@

  import inspect
  from abc import ABC, abstractmethod
+ import json
+ import copy
  from typing import (
      Any,
      Callable,
      Dict,
      Generic,
      List,
+     Tuple,
      TypedDict,
      TypeVar,
      Union,
@@ -111,6 +114,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
      _NOT_APPLICABLE_RESULT = "not applicable"
      _PASS_RESULT = "pass"
      _FAIL_RESULT = "fail"
+     _type = "azure_ai_evaluator"

      # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -498,7 +502,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
              if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                  for content_item in message.get("content"):
                      if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                         tool_calls.append(content_item)
+                         tool_calls.append(copy.deepcopy(content_item))

              # Extract tool results from tool messages
              elif message.get("role") == "tool" and message.get("tool_call_id"):
@@ -516,6 +520,67 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

          return tool_calls

+     def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+         """Extract tool names and parameters from the response.
+
+         :param response: The response to parse.
+         :type response: Union[str, List[dict]]
+         :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+         :rtype: List[Tuple[str, Dict[str, str]]]
+         """
+         tool_calls = self._parse_tools_from_response(response)
+         tool_name_param_pairs = []
+         for tool_call in tool_calls:
+             if not isinstance(tool_call, dict):
+                 raise EvaluationException(
+                     "Tool call must be a dictionary.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.UNKNOWN,
+                 )
+             if tool_call.get("type") != "tool_call":
+                 raise EvaluationException(
+                     "Tool call must have 'type' set to 'tool_call'.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.INVALID_VALUE,
+                 )
+
+             if "name" not in tool_call:
+                 raise EvaluationException(
+                     "Tool call missing 'name' field.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.MISSING_FIELD,
+                 )
+
+             tool_name = str(tool_call["name"]).strip()
+
+             # Extract parameters/arguments
+             parameters = {}
+             if "arguments" in tool_call:
+                 args = tool_call["arguments"]
+                 if isinstance(args, dict):
+                     # Convert all values to strings for consistent comparison
+                     parameters = {str(k): str(v) for k, v in args.items()}
+                 elif isinstance(args, str):
+                     # If arguments is a string, try to parse it as JSON
+                     try:
+                         parsed_args = json.loads(args)
+                         if isinstance(parsed_args, dict):
+                             parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                     except json.JSONDecodeError:
+                         raise EvaluationException(
+                             "Failed to parse tool call arguments as JSON.",
+                             internal_message=str(tool_call),
+                             target=ErrorTarget.EVALUATE,
+                             category=ErrorCategory.INVALID_VALUE,
+                         )
+
+             tool_name_param_pairs.append((tool_name, parameters))
+
+         return tool_name_param_pairs
+
      async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
          """The asynchronous call where real end-to-end evaluation logic is performed.

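The helper added above only accepts the converter-style tool-call format. The standalone sketch below (not SDK code; the tool names and arguments are invented) mirrors its parsing to show the input shape it expects and the (name, parameters) pairs it produces.

```python
# Standalone sketch of the converter-format input parsed by the new helper.
import json
from typing import Dict, List, Tuple

response = [
    {
        "role": "assistant",
        "content": [
            {"type": "tool_call", "name": "fetch_weather", "arguments": {"location": "Seattle"}},
            {"type": "tool_call", "name": "search_docs", "arguments": '{"query": "evaluation sdk"}'},
        ],
    }
]


def extract_pairs(messages: List[dict]) -> List[Tuple[str, Dict[str, str]]]:
    pairs = []
    for message in messages:
        if message.get("role") != "assistant":
            continue
        for item in message.get("content", []):
            if not (isinstance(item, dict) and item.get("type") == "tool_call"):
                continue
            args = item.get("arguments", {})
            if isinstance(args, str):  # string arguments are parsed as JSON, as in the SDK change
                args = json.loads(args)
            pairs.append((str(item["name"]).strip(), {str(k): str(v) for k, v in args.items()}))
    return pairs


print(extract_pairs(response))
# [('fetch_weather', {'location': 'Seattle'}), ('search_docs', {'query': 'evaluation sdk'})]
```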
@@ -542,14 +607,25 @@
                      base_key = key[:-6]  # Remove "_score" suffix
                      result_key = f"{base_key}_result"
                      threshold_key = f"{base_key}_threshold"
-                     result[threshold_key] = self._threshold
+                     threshold_value = (
+                         self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                     )
+                     if not isinstance(threshold_value, (int, float)):
+                         raise EvaluationException(
+                             "Threshold value must be a number.",
+                             internal_message=str(threshold_value),
+                             target=ErrorTarget.EVALUATE,
+                             category=ErrorCategory.INVALID_VALUE,
+                         )
+
+                     result[threshold_key] = threshold_value
                      if self._higher_is_better:
-                         if float(score_value) >= self._threshold:
+                         if float(score_value) >= threshold_value:
                              result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                          else:
                              result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                      else:
-                         if float(score_value) <= self._threshold:
+                         if float(score_value) <= threshold_value:
                              result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                          else:
                              result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
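The change above allows `self._threshold` to be either a single number or a per-metric mapping keyed by the score's base name. A standalone sketch of that lookup behaviour, with invented metric names:

```python
# Standalone sketch (not SDK code) of the per-metric threshold resolution added above.
from typing import Dict, Union


def resolve_threshold(threshold: Union[int, float, Dict[str, float]], base_key: str) -> float:
    value = threshold.get(base_key) if isinstance(threshold, dict) else threshold
    if not isinstance(value, (int, float)):
        raise ValueError(f"Threshold value must be a number, got {value!r} for {base_key}")
    return value


print(resolve_threshold(3, "relevance"))                                    # 3: scalar applies to every metric
print(resolve_threshold({"relevance": 4, "groundedness": 3}, "relevance"))  # 4: per-metric mapping
```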
@@ -5,7 +5,8 @@
  import math
  import re
  import os
- from typing import Dict, Optional, TypeVar, Union
+ from itertools import chain
+ from typing import Dict, Optional, TypeVar, Union, List

  if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
      from promptflow.core._flow import AsyncPrompty
@@ -132,10 +133,19 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                  category=ErrorCategory.INVALID_VALUE,
                  target=ErrorTarget.CONVERSATION,
              )
-         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         # Call the prompty flow to get the evaluation result.
+         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

          score = math.nan
-         if llm_output:
+         if prompty_output_dict:
+             llm_output = prompty_output_dict.get("llm_output", "")
+             input_token_count = prompty_output_dict.get("input_token_count", 0)
+             output_token_count = prompty_output_dict.get("output_token_count", 0)
+             total_token_count = prompty_output_dict.get("total_token_count", 0)
+             finish_reason = prompty_output_dict.get("finish_reason", "")
+             model_id = prompty_output_dict.get("model_id", "")
+             sample_input = prompty_output_dict.get("sample_input", "")
+             sample_output = prompty_output_dict.get("sample_output", "")
              # Parse out score and reason from evaluators known to possess them.
              if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                  score, reason = parse_quality_evaluator_reason_score(llm_output)
@@ -146,6 +156,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                      f"{self._result_key}_reason": reason,
                      f"{self._result_key}_result": binary_result,
                      f"{self._result_key}_threshold": self._threshold,
+                     f"{self._result_key}_prompt_tokens": input_token_count,
+                     f"{self._result_key}_completion_tokens": output_token_count,
+                     f"{self._result_key}_total_tokens": total_token_count,
+                     f"{self._result_key}_finish_reason": finish_reason,
+                     f"{self._result_key}_model": model_id,
+                     f"{self._result_key}_sample_input": sample_input,
+                     f"{self._result_key}_sample_output": sample_output,
                  }
                  match = re.search(r"\d", llm_output)
                  if match:
@@ -156,6 +173,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                      f"gpt_{self._result_key}": float(score),
                      f"{self._result_key}_result": binary_result,
                      f"{self._result_key}_threshold": self._threshold,
+                     f"{self._result_key}_prompt_tokens": input_token_count,
+                     f"{self._result_key}_completion_tokens": output_token_count,
+                     f"{self._result_key}_total_tokens": total_token_count,
+                     f"{self._result_key}_finish_reason": finish_reason,
+                     f"{self._result_key}_model": model_id,
+                     f"{self._result_key}_sample_input": sample_input,
+                     f"{self._result_key}_sample_output": sample_output,
                  }

          binary_result = self._get_binary_result(score)
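With the prompty flow now returning a dictionary instead of raw LLM text, every prompt-based evaluator also reports token usage and sampling metadata. An illustrative per-row result for a relevance-style evaluator follows; all values are invented, and the exact key prefix depends on the evaluator's result key.

```python
# Illustrative shape of a per-row result after this change (values are made up).
example_result = {
    "relevance": 4.0,
    "relevance_reason": "The response addresses the question directly...",
    "relevance_result": "pass",
    "relevance_threshold": 3,
    "relevance_prompt_tokens": 412,       # input_token_count from the prompty call
    "relevance_completion_tokens": 57,    # output_token_count
    "relevance_total_tokens": 469,
    "relevance_finish_reason": "stop",
    "relevance_model": "gpt-4o-2024-08-06",
    "relevance_sample_input": "...",      # sample_input as returned by the flow, truncated here
    "relevance_sample_output": "...",
}
```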
@@ -165,3 +189,157 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
              f"{self._result_key}_result": binary_result,
              f"{self._result_key}_threshold": self._threshold,
          }
+
+     @staticmethod
+     def _get_built_in_tool_definition(tool_name: str):
+         """Get the definition for the built-in tool."""
+         try:
+             from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+             if tool_name in _BUILT_IN_DESCRIPTIONS:
+                 return {
+                     "type": tool_name,
+                     "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                     "name": tool_name,
+                     "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                 }
+         except ImportError:
+             pass
+         return None
+
+     def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+         """Extract tool definitions needed for the given built-in tool calls."""
+         needed_definitions = []
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name:
+                         definition = self._get_built_in_tool_definition(tool_name)
+                         if definition and definition not in needed_definitions:
+                             needed_definitions.append(definition)
+
+         return needed_definitions
+
+     def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+         """Extract just the tool names from tool calls, removing parameters."""
+         tool_names = []
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name:
+                         tool_names.append(tool_name)
+                 elif tool_call.get("function", {}).get("name"):
+                     # Handle function call format
+                     tool_names.append(tool_call["function"]["name"])
+                 elif tool_call.get("name"):
+                     # Handle direct name format
+                     tool_names.append(tool_call["name"])
+         return tool_names
+
+     def _extract_needed_tool_definitions(
+         self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+     ) -> List[Dict]:
+         """Extract the tool definitions that are needed for the provided tool calls.
+
+         :param tool_calls: The tool calls that need definitions
+         :type tool_calls: List[Dict]
+         :param tool_definitions: User-provided tool definitions
+         :type tool_definitions: List[Dict]
+         :param error_target: The evaluator-specific error target for exceptions
+         :type error_target: ErrorTarget
+         :return: List of needed tool definitions
+         :rtype: List[Dict]
+         :raises EvaluationException: If validation fails
+         """
+         needed_tool_definitions = []
+
+         # Add all user-provided tool definitions
+         needed_tool_definitions.extend(tool_definitions)
+
+         # Add the needed built-in tool definitions (if they are called)
+         built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+         needed_tool_definitions.extend(built_in_definitions)
+
+         # OpenAPI tool is a collection of functions, so we need to expand it
+         tool_definitions_expanded = list(
+             chain.from_iterable(
+                 tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                 for tool in needed_tool_definitions
+             )
+         )
+
+         # Validate that all tool calls have corresponding definitions
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name and self._get_built_in_tool_definition(tool_name):
+                         # This is a built-in tool from converter, already handled above
+                         continue
+                     elif tool_name:
+                         # This is a regular function tool from converter
+                         tool_definition_exists = any(
+                             tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                             for tool in tool_definitions_expanded
+                         )
+                         if not tool_definition_exists:
+                             raise EvaluationException(
+                                 message=f"Tool definition for {tool_name} not found",
+                                 blame=ErrorBlame.USER_ERROR,
+                                 category=ErrorCategory.INVALID_VALUE,
+                                 target=error_target,
+                             )
+                     else:
+                         raise EvaluationException(
+                             message=f"Tool call missing name: {tool_call}",
+                             blame=ErrorBlame.USER_ERROR,
+                             category=ErrorCategory.INVALID_VALUE,
+                             target=error_target,
+                         )
+                 else:
+                     # Unsupported tool format - only converter format is supported
+                     raise EvaluationException(
+                         message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                         blame=ErrorBlame.USER_ERROR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         target=error_target,
+                     )
+             else:
+                 # Tool call is not a dictionary
+                 raise EvaluationException(
+                     message=f"Tool call is not a dictionary: {tool_call}",
+                     blame=ErrorBlame.USER_ERROR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     target=error_target,
+                 )
+
+         return needed_tool_definitions
+
+     def _not_applicable_result(
+         self, error_message: str, threshold: Union[int, float]
+     ) -> Dict[str, Union[str, float, Dict]]:
+         """Return a result indicating that the evaluation is not applicable.
+
+         :param error_message: The error message explaining why evaluation is not applicable.
+         :type error_message: str
+         :param threshold: The threshold value for the evaluator.
+         :type threshold: Union[int, float]
+         :return: A dictionary containing the result of the evaluation.
+         :rtype: Dict[str, Union[str, float, Dict]]
+         """
+         # If no tool calls were made or tool call type is not supported, return not applicable result
+         return {
+             self._result_key: self._NOT_APPLICABLE_RESULT,
+             f"{self._result_key}_result": "pass",
+             f"{self._result_key}_threshold": threshold,
+             f"{self._result_key}_reason": error_message,
+             f"{self._result_key}_details": {},
+         }
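One detail worth noting in the block above is the OpenAPI handling: an `openapi` tool definition is treated as a bundle of functions and flattened before call validation. A standalone sketch of that expansion, with invented tool names:

```python
# Standalone sketch (not SDK code) of the openapi flattening used above.
from itertools import chain

tool_definitions = [
    {"type": "function", "name": "fetch_weather", "parameters": {}},
    {
        "type": "openapi",
        "functions": [
            {"type": "function", "name": "list_orders", "parameters": {}},
            {"type": "function", "name": "get_order", "parameters": {}},
        ],
    },
]

expanded = list(
    chain.from_iterable(
        tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
        for tool in tool_definitions
    )
)

print([tool["name"] for tool in expanded])
# ['fetch_weather', 'list_orders', 'get_order'] - every tool call can now be
# matched against a flat list of function definitions.
```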
@@ -25,6 +25,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the fluency evaluator. Default is 3.
      :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

      .. admonition:: Example:

@@ -68,7 +73,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, credential=None, threshold=3):
+     def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._threshold = threshold
@@ -80,6 +85,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )

      @overload
@@ -5,7 +5,7 @@ import os, logging
  from typing import Dict, List, Optional, Union, Any, Tuple

  from typing_extensions import overload, override
- from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
+ from azure.ai.evaluation._legacy.prompty import AsyncPrompty

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
@@ -33,8 +33,7 @@ logger = logging.getLogger(__name__)


  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-     """
-     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+     """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
      including reasoning.

      The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -50,6 +49,11 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the groundedness evaluator. Default is 3.
      :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

      .. admonition:: Example:

@@ -61,6 +65,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              :caption: Initialize and call a GroundednessEvaluator.

      .. admonition:: Example with Threshold:
+
          .. literalinclude:: ../samples/evaluation_samples_threshold.py
              :start-after: [START threshold_groundedness_evaluator]
              :end-before: [END threshold_groundedness_evaluator]
@@ -107,6 +112,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )
          self._model_config = model_config
          self.threshold = threshold
@@ -196,18 +202,24 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          """

          if kwargs.get("query", None):
-             current_dir = os.path.dirname(__file__)
-             prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
-             self._prompty_file = prompty_path
-             prompty_model_config = construct_prompty_model_config(
-                 validate_model_config(self._model_config),
-                 self._DEFAULT_OPEN_API_VERSION,
-                 UserAgentSingleton().value,
-             )
-             self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+             self._ensure_query_prompty_loaded()

          return super().__call__(*args, **kwargs)

+     def _ensure_query_prompty_loaded(self):
+         """Switch to the query prompty file if not already loaded."""
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+         self._prompty_file = prompty_path
+         prompty_model_config = construct_prompty_model_config(
+             validate_model_config(self._model_config),
+             self._DEFAULT_OPEN_API_VERSION,
+             UserAgentSingleton().value,
+         )
+         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
      def _has_context(self, eval_input: dict) -> bool:
          """
          Return True if eval_input contains a non-empty 'context' field.
@@ -226,7 +238,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
-         if "query" not in eval_input:
+         if eval_input.get("query", None) is None:
              return await super()._do_eval(eval_input)

          contains_context = self._has_context(eval_input)
@@ -273,6 +285,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          response = kwargs.get("response")
          tool_definitions = kwargs.get("tool_definitions")

+         if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+             self._ensure_query_prompty_loaded()
+
          if (not query) or (not response):  # or not tool_definitions:
              msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
              raise EvaluationException(