azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (134):
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
@@ -28,15 +28,26 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the similarity evaluator. Default is 5.
+    :type threshold: int

     .. admonition:: Example:

         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START rouge_score_evaluator]
-            :end-before: [END rouge_score_evaluator]
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+            :caption: Initialize and call a SimilarityEvaluator.
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_similarity_evaluator]
+            :end-before: [END threshold_similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a threshold and call a SimilarityEvaluator.

     .. note::
@@ -54,10 +65,18 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better
+        )

     # Ignoring a mypy error about having only 1 overload function.
     # We want to use the overload style for all evals, even single-inputs. This is both to make
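
Editor's note: a minimal usage sketch of the new keyword-only threshold parameter follows. It is illustrative only; the model configuration values are placeholders, and the call inputs simply reuse the evaluator's existing query/response/ground_truth keywords.

# Illustrative sketch (not from this diff): constructing SimilarityEvaluator with the
# new keyword-only threshold. Endpoint, key, and deployment values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

# Keyword-only threshold; the code above defaults it to 3 (the new docstring says 5).
evaluator = SimilarityEvaluator(model_config, threshold=4)
result = evaluator(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is the capital of Japan.",
)
print(result)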
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_adherence import TaskAdherenceEvaluator
+
+__all__ = ["TaskAdherenceEvaluator"]
@@ -0,0 +1,148 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+@experimental
+class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
+
+    - Alignment with instructions and definitions
+    - Accuracy and clarity of the response
+    - Proper use of provided tool definitions
+
+    Scoring is based on five levels:
+    1. Fully Inadherent - Response completely ignores instructions.
+    2. Barely Adherent - Partial alignment with critical gaps.
+    3. Moderately Adherent - Meets core requirements but lacks precision.
+    4. Mostly Adherent - Clear and accurate with minor issues.
+    5. Fully Adherent - Flawless adherence to instructions.
+
+    The evaluation includes a step-by-step reasoning process, a brief explanation, and a final integer score.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a TaskAdherenceEvaluator with a query and response.
+    """
+
+    _PROMPTY_FILE = "task_adherence.prompty"
+    _RESULT_KEY = "task_adherence"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TASK_ADHERENCE_SCORE = 3
+
+    id = None
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate task adherence for a given query, response, and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+
+        Example with string inputs and no tools:
+            evaluator = TaskAdherenceEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = TaskAdherenceEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls).
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+        :return: A dictionary with the task adherence evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Task Adherence evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # We override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py.
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Both query and response must be provided as input to the Task Adherence evaluator.",
+                internal_message="Both query and response must be provided as input to the Task Adherence evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
+            )
+
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+            score_result = 'pass' if score >= self.threshold else 'fail'
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+            }
+
+        return {self._result_key: math.nan}
+
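
Editor's note: a brief usage sketch for the new TaskAdherenceEvaluator. The import path and result keys come from the diff above; the model configuration values are placeholders, so treat this as illustrative rather than authoritative.

# Sketch only: exercising TaskAdherenceEvaluator and the result keys emitted by _do_eval.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._task_adherence import TaskAdherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

evaluator = TaskAdherenceEvaluator(model_config, threshold=3)
result = evaluator(
    query="What is a recommended weekend itinerary in Paris?",
    response="Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday.",
)

# Keys produced by _do_eval: the 1-5 score, a pass/fail verdict against the threshold,
# the threshold itself, and the model's explanation.
print(result["task_adherence"], result["task_adherence_result"])
print(result["task_adherence_reason"])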
@@ -0,0 +1,117 @@
+---
+name: TaskAdherence
+description: Evaluates Task Adherence score for QA scenario
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+  tool_definitions:
+    type: string
+    optional: true
+    default: "[]"
+
+---
+system:
+# Instruction
+## Context
+### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task.
+- **Data**: Your input data includes query, response, and tool definitions.
+- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+# Definition
+
+**Level 1: Fully Inadherent**
+
+**Definition:**
+Response completely ignores instructions or deviates significantly
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Paris is a lovely city with a rich history.
+
+Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan.
+
+
+**Level 2: Barely Adherent**
+
+**Definition:**
+Response partially aligns with instructions but has critical gaps.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Spend your weekend visiting famous places in Paris.
+
+Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions.
+
+
+**Level 3: Moderately Adherent**
+
+**Definition:**
+Response meets the core requirements but lacks precision or clarity.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday.
+
+Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear.
+
+
+**Level 4: Mostly Adherent**
+
+**Definition:**
+Response is clear, accurate, and aligns with instructions with minor issues.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences.
+
+Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect.
+
+
+**Level 5: Fully Adherent**
+
+**Definition:**
+Response is flawless, accurate, and follows instructions to the letter.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Here is a detailed weekend itinerary in Paris:
+Saturday:
+Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top.
+Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits.
+Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques.
+Evening: Enjoy dinner at a classic Parisian bistro near the river.
+Sunday:
+Morning: Visit the Notre-Dame Cathedral to explore its architecture and history.
+Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés.
+Afternoon: Finish your trip with a scenic boat tour on the Seine.
+This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit.
+
+Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps.
+
+# Data
+Query: {{query}}
+Response: {{response}}
+Tool Definitions: {{tool_definitions}}
+
+# Tasks
+## Please provide your assessment Score for the previous answer. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions.
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+# Output
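
Editor's note: the prompty asks the model to wrap its chain of thought, explanation, and integer score in <S0>/<S1>/<S2> tags, which _task_adherence.py then hands to parse_quality_evaluator_reason_score. As a rough illustration of that output contract only (the regex below is an assumption, not the package's parser):

# Illustration of the <S0>/<S1>/<S2> output contract defined by the prompty above.
# parse_quality_evaluator_reason_score is the real parser; this regex is just a stand-in.
import re

sample_output = (
    "<S0>Let's think step by step: the response lists attractions per day but omits timings.</S0>"
    "<S1>Meets the core request with limited detail.</S1>"
    "<S2>3</S2>"
)

reason_match = re.search(r"<S1>(.*?)</S1>", sample_output, re.DOTALL)
score_match = re.search(r"<S2>\s*(\d)\s*</S2>", sample_output)

reason = reason_match.group(1) if reason_match else ""
score = int(score_match.group(1)) if score_match else None
print(score, reason)  # 3 Meets the core request with limited detail.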
@@ -0,0 +1,9 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._tool_call_accuracy import ToolCallAccuracyEvaluator
+
+__all__ = [
+    "ToolCallAccuracyEvaluator",
+]
@@ -0,0 +1,292 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import math
+import os
+import logging
+import re
+from typing import Dict, List, Union, TypeVar, cast
+from typing_extensions import overload, override
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import remove_optional_singletons, parse_quality_evaluator_reason_score
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+T_EvalValue = TypeVar("T_EvalValue")
+
+@experimental
+class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
+    - Relevance to the conversation
+    - Parameter correctness according to tool definitions
+    - Parameter value extraction from the conversation
+
+    The evaluator uses a binary scoring system (0 or 1):
+    - Score 0: The tool call is irrelevant or contains information not in the conversation/definition
+    - Score 1: The tool call is relevant with properly extracted parameters from the conversation
+
+    This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
+    user needs while properly following tool definitions and using information present in the
+    conversation history.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_call_accuracy_evaluator]
+            :end-before: [END tool_call_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ToolCallAccuracyEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
+
+    _PROMPTY_FILE = "tool_call_accuracy.prompty"
+    _RESULT_KEY = "tool_call_accurate"
+    _AGGREGATE_RESULT_KEY = "tool_call_accuracy"
+
+    _MAX_TOOL_CALL_ACCURACY_SCORE = 1.0
+    _MIN_TOOL_CALL_ACCURACY_SCORE = 0.0
+    _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 0.8
+
+    id = "id"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+        tool_calls: Union[dict, List[dict]] = None,
+        response: Union[str, List[dict]] = None
+    ) -> Dict[str, Union[str, float]]:
+        """
+        Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+        :keyword query: Query or chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, response should be provided and should have
+            tool call(s) in it.
+        :paramtype tool_calls: Union[dict, List[dict]]
+        :keyword response: Optional response to be evaluated alongside the tool calls.
+            If provided, all tool calls in the response will be evaluated when the tool_calls parameter is not provided.
+            If both are provided, only the tool calls in the tool_calls parameter will be evaluated.
+            If the response has extra tool calls they will not be evaluated; the response will be used to extract any tool calls that are needed for evaluating a certain tool call.
+            Recommended to provide it when there are tool calls that depend on the output of a previous tool call.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool selection evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        """Convert an arbitrary input into a list of inputs for evaluators.
+        It is assumed that evaluators generally make use of their inputs in one of two ways.
+        Either they receive a collection of keyname inputs that are all single values
+        (like a query and response), or they receive a conversation that is a list of dictionary
+        values.
+
+        The self._singleton_inputs list assigned during initialization is used to find and extract
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
+        is a valid input.
+
+        If both conversations and singletons are allowed, the function will raise an exception if both
+        are inputted.
+
+        This function must be overridden by child classes IF they need both a conversation and
+        other inputs to be passed in.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+        :rtype: List
+        """
+        # TODO add warning that only tool calls of type function are supported
+        # Collect inputs
+        tool_calls = kwargs.get("tool_calls", None)
+        tool_definitions = kwargs.get("tool_definitions")
+        query = kwargs.get("query", None)
+        response = kwargs.get("response", None)
+
+        if response is None and tool_calls is None:
+            raise EvaluationException(
+                message="Either response or tool_calls must be provided.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
+        if tool_definitions is None:
+            raise EvaluationException(
+                message="Tool definitions must be provided.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
+        # TODO : Support classes that represent tool calls, messages etc. once client side definitions are available
+        if tool_calls is None:
+            # Extract tool calls from response if not provided
+            tool_calls = []
+            if isinstance(response, list):
+                for message in response:
+                    if message.get("role") == "assistant":
+                        tool_calls.extend([content for content in message.get("content")
+                                           if content.get("type") == "tool_call"])
+            if len(tool_calls) == 0:
+                raise EvaluationException(
+                    message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.MISSING_FIELD,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
+        if not isinstance(tool_calls, list):
+            tool_calls = [tool_calls]
+
+        if not isinstance(tool_definitions, list):
+            tool_definitions = [tool_definitions]
+
+        eval_inputs = []
+        # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
+        # we need to provide the output of the previous tool call as part of messages.
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":  # TODO assuming dict here but it can be a class
+                function_name = tool_call.get("name")
+                tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
+                if len(tool_definition) > 0:
+                    tool_definition = tool_definition
+                else:
+                    raise EvaluationException(
+                        message="Tool definition not found",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    )
+                eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
+            else:
+                raise EvaluationException(
+                    message="Tool definition not found",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
+        return eval_inputs
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a tool call accuracy evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
+            return {
+                self._result_key: bool(float(score)),
+                f"{self._result_key}_reason": reason,
+                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+            }
+        return {self._result_key: float(score)}
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        per_turn_results = []
+        # Evaluate all inputs.
+        for eval_input in eval_input_list:
+            per_turn_results.append(await self._do_eval(eval_input))
+
+        return self._aggregate_results(per_turn_results=per_turn_results)
+
+    def _aggregate_results(self, per_turn_results):
+        """Aggregate the evaluation results of each conversation turn into a single result.
+
+        Exact implementation might need to vary slightly depending on the results produced.
+        Default behavior is to average all number-based outputs.
+
+        :param per_turn_results: List of evaluation results for each turn in the conversation.
+        :type per_turn_results: List[Dict]
+        :return: A dictionary containing aggregated results, with numeric metrics having their
+            means as top-level values in the dictionary, and all original
+            values (including non-numerics) located under the "evaluation_per_turn" key,
+            with each sub-key being a metric and each sub-value being the list of that metric's
+            per-turn values.
+        :rtype: AggregateResult[T_EvalValue]
+        """
+
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+        # Go over each turn, and rotate the results into a
+        # metric: List[values] format for the evals_per_turn dictionary.
+
+        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results]) / len(per_turn_results)
+        aggregated[self._AGGREGATE_RESULT_KEY] = score
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+
+        aggregated["per_tool_call_details"] = per_turn_results
+        return aggregated
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+        :keyword query: Query or chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, response should be provided and should have
+            tool call(s) in it.
+        :paramtype tool_calls: Union[dict, List[dict]]
+        :keyword response: Optional response to be evaluated alongside the tool calls.
+            If provided, all tool calls in the response will be evaluated when the tool_calls parameter is not provided.
+            If both are provided, only the tool calls in the tool_calls parameter will be evaluated.
+            If the response has extra tool calls they will not be evaluated; the response will be used to extract any tool calls that are needed for evaluating a certain tool call.
+            Recommended to provide it when there are tool calls that depend on the output of a previous tool call.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool selection evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return super().__call__(*args, **kwargs)
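
Editor's note: a hedged usage sketch for ToolCallAccuracyEvaluator to close the section. The flat tool-call shape is inferred from the lookups in _convert_kwargs_to_eval_input and _do_eval above, the result keys come from _aggregate_results, and the model configuration values are placeholders; treat everything here as illustrative, not as the package's documented contract.

# Sketch only: evaluating a single function-style tool call against its definition.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

evaluator = ToolCallAccuracyEvaluator(model_config, threshold=0.8)
result = evaluator(
    query="Could you check the status of the orders on account 888?",
    tool_definitions=[{
        "name": "get_orders",
        "description": "Get the list of orders for a given account number.",
        "parameters": {"type": "object", "properties": {
            "account_number": {"type": "string", "description": "The account number to get the orders for."}}},
    }],
    tool_calls=[{
        "type": "tool_call",                       # matched by _convert_kwargs_to_eval_input
        "tool_call_id": "tool_call_20250310_001",  # echoed back by _do_eval
        "name": "get_orders",
        "arguments": {"account_number": "888"},
    }],
)

# Aggregated keys from _aggregate_results: the 0-1 accuracy, a pass/fail verdict against
# the threshold, the threshold itself, and per-tool-call details.
print(result["tool_call_accuracy"], result["tool_call_accuracy_result"])
print(result["per_tool_call_details"])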