azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (142):
  1. azure/ai/evaluation/__init__.py +27 -1
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +39 -5
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +23 -3
  28. azure/ai/evaluation/_constants.py +7 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  36. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
  37. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
  38. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
  39. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  40. azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
  41. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  42. azure/ai/evaluation/_evaluate/_utils.py +3 -3
  43. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  44. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  45. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  46. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  47. azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
  48. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
  49. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
  50. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  51. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  52. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  53. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  54. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  55. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  56. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  57. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  58. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  59. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
  60. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  62. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  63. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  64. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  65. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  66. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  67. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
  68. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  69. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  70. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  72. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  73. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  74. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  75. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  76. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  77. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  78. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  79. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  80. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  81. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  82. azure/ai/evaluation/_exceptions.py +5 -0
  83. azure/ai/evaluation/_legacy/__init__.py +3 -0
  84. azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
  85. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  86. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  87. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  88. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  89. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  90. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  91. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  92. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  94. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  95. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  96. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  97. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  98. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  104. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  105. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  106. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  107. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  109. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  114. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  115. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  116. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  117. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
  118. azure/ai/evaluation/_version.py +1 -1
  119. azure/ai/evaluation/red_team/__init__.py +19 -0
  120. azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
  121. azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
  122. azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
  123. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  124. azure/ai/evaluation/red_team/_red_team.py +1887 -0
  125. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  126. azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
  127. azure/ai/evaluation/red_team/_utils/constants.py +65 -0
  128. azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
  129. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  130. azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
  131. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  132. azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
  133. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  134. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  135. azure/ai/evaluation/simulator/_simulator.py +1 -1
  136. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
  137. azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
  138. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
  139. azure/ai/evaluation/simulator/_tracing.py +0 -89
  140. azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
  141. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
  142. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
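The listing above shows that 1.5.0 adds several new evaluator packages (code vulnerability, intent resolution, response completeness, task adherence, tool call accuracy, ungrounded attributes), a red_team module, and a _legacy layer that decouples the package from promptflow. A minimal upgrade sketch to pick these up; the import check assumes IntentResolutionEvaluator is re-exported from the top-level namespace (consistent with the __init__.py change above, but not shown in this section):

    # Shell: upgrade to the new wheel
    #   pip install --upgrade azure-ai-evaluation==1.5.0
    # Python: quick sanity check of the new surface (export location assumed)
    from azure.ai.evaluation import IntentResolutionEvaluator  # new in 1.5.0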
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -7,6 +7,7 @@ from typing import List, Dict
  from typing_extensions import overload, override

  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class F1ScoreEvaluator(EvaluatorBase):
@@ -25,6 +26,8 @@ class F1ScoreEvaluator(EvaluatorBase):
      model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
      information in the response.

+     :param threshold: The threshold for the F1 score evaluator. Default is 0.5.
+     :type threshold: float

      .. admonition:: Example:

@@ -34,13 +37,24 @@ class F1ScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call an F1ScoreEvaluator.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_f1_score_evaluator]
+             :end-before: [END threshold_f1_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call an F1ScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-     def __init__(self):
-         super().__init__()
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)

      @classmethod
      def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -115,8 +129,18 @@ class F1ScoreEvaluator(EvaluatorBase):
          response = eval_input["response"]
          # Run f1 score computation.
          f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
-
-         return {"f1_score": f1_result}
+         binary_result = False
+         if self._higher_is_better:
+             if f1_result >= self._threshold:
+                 binary_result = True
+         else:
+             if f1_result <= self._threshold:
+                 binary_result = True
+         return {
+             "f1_score": f1_result,
+             "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "f1_threshold": self._threshold,
+         }

      @overload  # type: ignore
      def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
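Per the hunks above, F1ScoreEvaluator now accepts a keyword-only threshold (default 0.5) and returns f1_result and f1_threshold alongside f1_score, with the pass/fail label taken from EVALUATION_PASS_FAIL_MAPPING. A minimal usage sketch under that reading; the input strings and printed values are illustrative only, not from a real run:

    from azure.ai.evaluation import F1ScoreEvaluator

    # Keyword-only threshold added in 1.5.0; scores at or above it map to a passing result.
    f1 = F1ScoreEvaluator(threshold=0.6)
    result = f1(
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Output keys per the diff: f1_score, f1_result, f1_threshold.
    print(result["f1_score"], result["f1_result"], result["f1_threshold"])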
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -23,6 +23,8 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the fluency evaluator. Default is 5.
+     :type threshold: int

      .. admonition:: Example:

@@ -33,6 +35,15 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              :dedent: 8
              :caption: Initialize and call a FluencyEvaluator.

+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_fluency_evaluator]
+             :end-before: [END threshold_fluency_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a FluencyEvaluator.
+
      .. note::

          To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -47,10 +58,18 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config):
+     def __init__(self, model_config, *, threshold=3):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             _higher_is_better=self._higher_is_better
+         )

      @overload
      def __call__(
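Note that the added docstring advertises a default threshold of 5 while the new signature defaults to threshold=3; the code is what ships. A usage sketch assuming the existing 1.x call shape (model_config as an Azure OpenAI configuration dict, fluency scored from the response alone); endpoint and deployment values are placeholders:

    import os
    from azure.ai.evaluation import FluencyEvaluator

    # Judge-model configuration; values below are placeholders read from the environment.
    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
    }

    # threshold is keyword-only; the signature default is 3 despite the docstring's "Default is 5".
    fluency = FluencyEvaluator(model_config, threshold=4)
    result = fluency(response="The weather in Seattle is mild and rainy in winter.")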
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -8,6 +8,7 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._common.utils import nltk_tokenize

  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class GleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,9 @@ class GleuScoreEvaluator(EvaluatorBase):
      GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
      the ground truth and a value of 0 indicates no overlap.

+     :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+     :type threshold: float
+
      .. admonition:: Example:

          .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -30,14 +34,25 @@ class GleuScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call a GleuScoreEvaluator.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_gleu_score_evaluator]
+             :end-before: [END threshold_gleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GleuScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self):
-         super().__init__()
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
@@ -54,9 +69,17 @@ class GleuScoreEvaluator(EvaluatorBase):
          hypothesis_tokens = nltk_tokenize(response)

          score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
+         binary_result = False
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 binary_result = True
+         else:
+             if score <= self._threshold:
+                 binary_result = True
          return {
              "gleu_score": score,
+             "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "gleu_threshold": self._threshold,
          }

      @overload  # type: ignore
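The same threshold-to-result pattern recurs in every math-based evaluator in this release (F1, GLEU, METEOR, BLEU, ROUGE per the file list). A compact standalone restatement of that logic, as I read it from the hunks; the "pass"/"fail" strings are an assumption about what EVALUATION_PASS_FAIL_MAPPING resolves to, since the constant's definition is not shown in this section:

    # Standalone sketch of the pass/fail logic added to the math-based evaluators.
    def to_result(score: float, threshold: float, higher_is_better: bool = True) -> str:
        # Higher-is-better metrics pass at or above the threshold; the inverted
        # branch exists for metrics where lower values are better.
        passed = score >= threshold if higher_is_better else score <= threshold
        return "pass" if passed else "fail"

    print(to_result(0.62, 0.5))  # "pass"
    print(to_result(0.31, 0.5))  # "fail"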
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -5,7 +5,7 @@ import os
  from typing import Dict, List, Optional, Union

  from typing_extensions import overload, override
- from promptflow.core import AsyncPrompty
+ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
@@ -33,7 +33,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
-
+     :param threshold: The threshold for the groundedness evaluator. Default is 5.
+     :type threshold: int
+
      .. admonition:: Example:

          .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -43,6 +45,14 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              :dedent: 8
              :caption: Initialize and call a GroundednessEvaluator.

+     .. admonition:: Example with Threshold:
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_groundedness_evaluator]
+             :end-before: [END threshold_groundedness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GroundednessEvaluator.
+
      .. note::

          To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -59,12 +69,20 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config):
+     def __init__(self, model_config, *, threshold=3, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query

-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             _higher_is_better=self._higher_is_better
+         )
          self._model_config = model_config
+         self.threshold = threshold
          # Needs to be set because it's used in call method to re-validate prompt if `query` is provided

      @overload
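Two changes stand out here: AsyncPrompty is now imported from the package's own _legacy adapter instead of promptflow.core (part of the promptflow decoupling visible in the new _legacy files), and a threshold was added (again, docstring says 5, signature says 3). A usage sketch assuming the existing context/response call shape and the same model_config dict as in the fluency sketch above; whether the prompty-based evaluators also emit *_result and *_threshold keys depends on the _base_prompty_eval.py changes, which are not shown in this section:

    from azure.ai.evaluation import GroundednessEvaluator

    # Signature default for threshold is 3 in 1.5.0.
    groundedness = GroundednessEvaluator(model_config, threshold=4)
    result = groundedness(
        context="The Eiffel Tower is 330 metres tall and located in Paris.",
        response="The Eiffel Tower is located in Paris and is about 330 metres tall.",
    )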
azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py (new file)

@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._intent_resolution import IntentResolutionEvaluator
+
+ __all__ = ["IntentResolutionEvaluator"]
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py (new file)

@@ -0,0 +1,152 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import math
+ from typing import Dict, Union, List, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation, Message
+ from ..._common.utils import check_score_is_valid
+ from azure.ai.evaluation._common._experimental import experimental
+
+ @experimental
+ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """
+     Evaluates intent resolution for a given query and response or a multi-turn conversation, including reasoning.
+
+     The intent resolution evaluator assesses whether the user intent was correctly identified and resolved.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START intent_resolution_evaluator]
+             :end-before: [END intent_resolution_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
+
+     """
+
+     _PROMPTY_FILE = "intent_resolution.prompty"
+     _RESULT_KEY = "intent_resolution"
+     _OPTIONAL_PARAMS = ["tool_definitions"]
+
+     _MIN_INTENT_RESOLUTION_SCORE = 1
+     _MAX_INTENT_RESOLUTION_SCORE = 5
+     _DEFAULT_INTENT_RESOLUTION_THRESHOLD = 3
+
+     id = None
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query : Union[str, List[dict]],
+         response : Union[str, List[dict]],
+         tool_definitions : Optional[Union[dict, List[dict]]] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate intent resolution for a given query, response and optional tool definitions.
+         The query and response can be either a string or a list of messages.
+
+         Example with string inputs and no tools:
+             evaluator = IntentResolutionEvaluator(model_config)
+             query = "What is the weather today?"
+             response = "The weather is sunny."
+
+             result = evaluator(query=query, response=response)
+
+         Example with list of messages:
+             evaluator = IntentResolutionEvaluator(model_config)
+             query: [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+             response: [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+             tool_definitions: [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+             result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+         :keyword query: The query to be evaluated which is either a string or a list of messages.
+             The list of messages is the previous conversation history of the user and agent, including system messages and tool calls.
+         :paramtype query: Union[str, List[dict]]
+         :keyword response: The response to be evaluated, which is either a string or a list of messages (full agent response potentially including tool calls)
+         :paramtype response: Union[str, List[dict]]
+         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+         :return: A dictionary with the intent resolution evaluation
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @override
+     def __call__( # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
+         """Do intent resolution evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
+         if "query" not in eval_input and "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                 internal_message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
+             )
+         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+         if isinstance(llm_output, dict):
+             score = llm_output.get("resolution_score", math.nan)
+             if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+                 raise EvaluationException(
+                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
+                     internal_message="Invalid score value.",
+                     category=ErrorCategory.FAILED_EXECUTION,
+                     blame=ErrorBlame.SYSTEM_ERROR,
+                 )
+             reason = llm_output.get("explanation", "")
+             score = float(score)
+             score_result = 'pass' if score >= self.threshold else 'fail'
+
+             #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
+             if 'explanation' in llm_output: llm_output.pop("explanation")
+             if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+
+             response_dict = {
+                 f"{self._result_key}" : score,
+                 f"{self._result_key}_result" : score_result,
+                 f"{self._result_key}_threshold" : self.threshold,
+                 f"{self._result_key}_reason" : reason,
+                 f"additional_details" : llm_output
+             }
+             return response_dict
+         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+         return {self._result_key: math.nan}
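A usage sketch for the new evaluator, grounded in the docstring examples above; the import path assumes the top-level re-export added in __init__.py, and model_config is the same judge configuration used in the fluency sketch:

    from azure.ai.evaluation import IntentResolutionEvaluator

    # Experimental evaluator; threshold defaults to 3 on a 1-5 scale.
    intent = IntentResolutionEvaluator(model_config, threshold=3)
    result = intent(
        query="What is the weather today?",
        response="The weather is sunny.",
    )
    # Per _do_eval above, the result carries intent_resolution, intent_resolution_result,
    # intent_resolution_threshold, intent_resolution_reason, and additional_details.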
azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty (new file)

@@ -0,0 +1,161 @@
+ ---
+ name: Intent Resolution Evaluator
+ description: Evaluates whether user intent was identified and correctly resolved
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 800
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_object
+
+ inputs:
+   query:
+     type: string
+   response:
+     type: string
+   tool_definitions:
+     type: string
+     optional: true
+     default: "[]"
+ ---
+ system:
+ You are an expert in evaluating the quality of a RESPONSE from an intelligent assistant based on provided definition and Data.
+
+ user:
+ # Goal
+ Your goal is to assess the quality of the RESPONSE of an assistant in relation to a QUERY from a user, specifically focusing on
+ the assistant's ability to understand and resolve the user intent expressed in the QUERY. There is also a field for tool definitions
+ describing the functions, if any, that are accessible to the agent and that the agent may invoke in the RESPONSE if necessary.
+
+ There are two components to intent resolution:
+     - Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry.
+     - Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request.
+
+ Note that the QUERY can either be a string with a user request or an entire conversation history including previous requests and responses from the assistant.
+ In this case, the assistant's response should be evaluated in the context of the entire conversation but the focus should be on the last intent.
+
+ # Data
+ QUERY: {{query}}
+ RESPONSE: {{response}}
+ TOOL_DEFINITIONS: {{tool_definitions}}
+
+
+ # Ratings
+ ## [Score: 1] (Response completely unrelated to user intent)
+ **Definition:** The agent's response does not address the query at all.
+
+ **Example:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** The latest smartphone models have incredible features and performance.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The agent's response is entirely off-topic, discussing smartphones instead of providing any information about baking a chocolate cake."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "discussion about smartphone features",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": false,
+   "intent_resolved": false,
+   "resolution_score": 1,
+ }
+
+
+ ## [Score: 2] (Response minimally relates to user intent)
+ **Definition:** The response shows a token attempt to address the query by mentioning a relevant keyword or concept, but it provides almost no useful or actionable information.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Chocolate cake involves some ingredients.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "While the response mentions 'ingredients' related to a chocolate cake, it barely addresses the process or any detailed steps, leaving the query unresolved."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "mention of ingredients",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": false,
+   "intent_resolved": false,
+   "resolution_score": 2,
+ }
+
+
+ ## [Score: 3] (Response partially addresses the user intent but lacks complete details)
+ **Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven and mix the ingredients before baking the cake.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response outlines a minimal process (preheating and mixing) but omits critical details like ingredient measurements, baking time, and temperature specifics, resulting in only a partial resolution of the query."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "basic baking process",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": false,
+   "resolution_score": 3,
+ }
+
+
+ ## [Score: 4] (Response addresses the user intent with moderate accuracy but has minor inaccuracies or omissions)
+ **Definition:** The response offers a moderately detailed answer that includes several specific elements relevant to the query, yet it still lacks some finer details or complete information.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response includes specific steps and ingredients, indicating a clear intent to provide a recipe. However, it lacks detailed measurements and finer instructions, which makes the resolution only moderately complete."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "provide a basic chocolate cake recipe",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": false,
+   "resolution_score": 4,
+ }
+
+
+ ## [Score: 5] (Response directly addresses the user intent and fully resolves it)
+ **Definition:** The response provides a complete, detailed, and accurate answer that fully resolves the user's query with all necessary information and precision.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response delivers a complete and precise recipe with detailed instructions and measurements, fully addressing the user's query about baking a chocolate cake."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "provide a comprehensive chocolate cake recipe",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": true,
+   "resolution_score": 5,
+ }
+
+
+ # Task
+
+ Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
+ Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
+   - explanation: a string that explains why you think the input Data should get this resolution_score.
+   - conversation_has_intent: true or false
+   - agent_perceived_intent: a string that describes the intent the agent perceived from the user query
+   - actual_user_intent: a string that describes the actual user intent
+   - correct_intent_detected: true or false
+   - intent_resolved: true or false
+   - resolution_score: an integer between 1 and 5 that represents the resolution score
+
+
+ # Output
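To show how the prompty's JSON output becomes the evaluator's result, here is a small sketch that replays the reshaping done in _do_eval above on a hand-written sample (the llm_output values are invented for illustration; the threshold of 3 is the evaluator's default):

    # Illustration of the post-processing in IntentResolutionEvaluator._do_eval; sample values only.
    llm_output = {
        "explanation": "The assistant correctly identified and answered the weather question.",
        "conversation_has_intent": True,
        "agent_perceived_intent": "get today's weather",
        "actual_user_intent": "get today's weather",
        "correct_intent_detected": True,
        "intent_resolved": True,
        "resolution_score": 5,
    }
    threshold = 3
    score = float(llm_output.pop("resolution_score"))
    reason = llm_output.pop("explanation")
    result = {
        "intent_resolution": score,
        "intent_resolution_result": "pass" if score >= threshold else "fail",
        "intent_resolution_threshold": threshold,
        "intent_resolution_reason": reason,
        "additional_details": llm_output,  # remaining keys from the judge's JSON
    }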
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -8,6 +8,7 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class MeteorScoreEvaluator(EvaluatorBase):
@@ -32,6 +33,8 @@ class MeteorScoreEvaluator(EvaluatorBase):
      :type beta: float
      :param gamma: The METEOR score gamma parameter. Default is 0.5.
      :type gamma: float
+     :param threshold: The threshold for the METEOR score evaluator. Default is 0.5.
+     :type threshold: float

      .. admonition:: Example:

@@ -41,18 +44,30 @@ class MeteorScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_meteor_score_evaluator]
+             :end-before: [END threshold_meteor_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a MeteorScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
+     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, *, threshold: float = 0.5):
          self._alpha = alpha
          self._beta = beta
          self._gamma = gamma
          ensure_nltk_data_downloaded()
-         super().__init__()
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
@@ -74,9 +89,17 @@ class MeteorScoreEvaluator(EvaluatorBase):
              beta=self._beta,
              gamma=self._gamma,
          )
-
+         binary_result = False
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 binary_result = True
+         else:
+             if score <= self._threshold:
+                 binary_result = True
          return {
              "meteor_score": score,
+             "meteor_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "meteor_threshold": self._threshold,
          }

      @overload  # type: ignore
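As with F1 and GLEU, the METEOR evaluator keeps its existing alpha/beta/gamma parameters and adds a keyword-only threshold plus meteor_result and meteor_threshold output keys. A closing usage sketch; inputs are illustrative only:

    from azure.ai.evaluation import MeteorScoreEvaluator

    # alpha/beta/gamma retain their previous defaults; threshold is the new keyword-only parameter.
    meteor = MeteorScoreEvaluator(alpha=0.8, threshold=0.4)
    result = meteor(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result["meteor_score"], result["meteor_result"], result["meteor_threshold"])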