azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (134)
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
@@ -23,6 +23,18 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -36,6 +48,15 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
     .. note::
 
         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
@@ -46,14 +67,37 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, model_config, **kwargs):
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
         evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-            SimilarityEvaluator(model_config),
-            F1ScoreEvaluator(),
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
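The three QAEvaluator hunks above (azure/ai/evaluation/_evaluators/_qa/_qa.py in the file list) add keyword-only per-metric thresholds that are validated and then forwarded to each sub-evaluator. A minimal usage sketch, assuming an Azure OpenAI deployment; the endpoint, key, and deployment values are placeholders, and the exact output keys come from the individual sub-evaluators:

```python
# Sketch only: constructor keywords taken from the diff above; config values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

qa_eval = QAEvaluator(
    model_config,
    groundedness_threshold=3,
    relevance_threshold=3,
    coherence_threshold=3,
    fluency_threshold=3,
    similarity_threshold=3,
    f1_score_threshold=0.5,
)

result = qa_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital is Paris.",
    ground_truth="Paris",
)
print(result)
```

Passing a non-numeric threshold raises the `TypeError` added in the `__init__` hunk.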
@@ -27,6 +27,8 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 5.
+    :type threshold: int
 
     .. admonition:: Example:
 
@@ -37,6 +39,15 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -52,10 +63,23 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(
+        self,
+        model_config,
+        *,
+        threshold=3
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better
+        )
 
     @overload
     def __call__(
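The RelevanceEvaluator hunks above (azure/ai/evaluation/_evaluators/_relevance/_relevance.py) add a keyword-only `threshold`; note the docstring states a default of 5 while the code defaults to 3. A brief usage sketch, reusing the `model_config` placeholder from the QAEvaluator sketch:

```python
# Sketch only: the keyword-only threshold comes from the __init__ hunk above.
from azure.ai.evaluation import RelevanceEvaluator

relevance_eval = RelevanceEvaluator(model_config, threshold=4)
result = relevance_eval(
    query="What is the capital of France?",
    response="Paris.",
    context="France's capital is Paris.",
)
# Alongside the numeric relevance score and reason, the base prompty evaluator can
# now emit pass/fail and threshold fields (exact key names are determined by
# _base_prompty_eval.py, which also changed in this release: +43 -4).
print(result)
```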
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._response_completeness import ResponseCompletenessEvaluator
+
+__all__ = ["ResponseCompletenessEvaluator"]
@@ -0,0 +1,157 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import os
+import math
+from typing import Dict, List, Union, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from azure.ai.evaluation._model_configurations import Conversation, Message
+from azure.ai.evaluation._common._experimental import experimental
+
+
+class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """
+    Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+    provided ground truth.
+    The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
+    claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
+    and relevance of the content provided.
+    The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
+    evaluation of the response's content quality.
+    Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
+    particularly in text generation tasks where conveying all essential details is crucial for clarity,
+    context, and correctness.
+    Completeness scores range from 1 to 5:
+    1: Fully incomplete — Contains none of the necessary information.
+    2: Barely complete — Contains only a small portion of the required information.
+    3: Moderately complete — Covers about half of the required content.
+    4: Mostly complete — Includes most of the necessary details with minimal omissions.
+    5: Fully complete — Contains all key information without any omissions.
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START completeness_evaluator]
+            :end-before: [END completeness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
+    """
+
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "response_completeness.prompty"
+    _RESULT_KEY = "response_completeness"
+
+    id = "completeness"
+
+    _MIN_COMPLETENESS_SCORE = 1
+    _MAX_COMPLETENESS_SCORE = 5
+    _DEFAULT_COMPLETENESS_THRESHOLD = 3
+
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        ground_truth: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
+        Example usage:
+        Evaluating completeness for a response string
+        ```python
+        from azure.ai.evaluation import CompletenessEvaluator
+        completeness_evaluator = CompletenessEvaluator(model_config)
+        ground_truth = "The ground truth to be evaluated."
+        response = "The response to be evaluated."
+        completeness_results = completeness_evaluator(ground_truth=ground_truth, response=response)
+        ```
+        :keword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: Union[str, List[Message]]
+        :return: The response completeness score results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate completeness for a conversation
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The fluency score
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do completeness evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if "ground_truth" not in eval_input or "response" not in eval_input:
+            raise EvaluationException(
+                message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
+                internal_message=f"Both ground_truth and response must be provided as input to the completeness"
+                f" evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.COMPLETENESS_EVALUATOR,
+            )
+
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+            score_result = 'pass' if score >= self.threshold else 'fail'
+
+            # updating the result key and threshold to int based on the schema
+            return {
+                f"{self._result_key}": int(score),
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": int(self.threshold),
+                f"{self._result_key}_reason": reason,
+            }
+
+        return {self._result_key: math.nan}
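The new evaluator above is exported as `ResponseCompletenessEvaluator` by its package `__init__.py`, even though its in-file docstring example still refers to a `CompletenessEvaluator`. A minimal usage sketch, assuming the top-level `azure.ai.evaluation` namespace re-exports it (the +42 -14 change to the top-level `__init__.py` in the file list suggests new exports; otherwise import from `azure.ai.evaluation._evaluators._response_completeness`) and reusing the earlier `model_config` placeholder:

```python
# Sketch only: constructor and output keys follow the _response_completeness.py diff above.
from azure.ai.evaluation import ResponseCompletenessEvaluator

completeness = ResponseCompletenessEvaluator(model_config, threshold=3)
result = completeness(
    ground_truth="Paris is the capital of France and has a population of about 2.1 million.",
    response="Paris is the capital of France.",
)
# Per _do_eval: response_completeness (int score), response_completeness_result
# ('pass'/'fail'), response_completeness_threshold, and response_completeness_reason.
print(result)
```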
@@ -0,0 +1,99 @@
+---
+name: Completeness
+description: Evaluates Completeness score for QA scenario
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    seed: 123
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  response:
+    type: string
+  ground_truth:
+    type: string
+
+---
+system:
+# Instruction
+## Context
+### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: You are given a definition of the response quality that is being evaluated to help guide your Score.
+- **Data**: Your input data include a response and its ground truth.
+- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+
+# Definition
+
+**Level 1: Fully incomplete**
+
+**Definition:**
+A response is considered fully incomplete if it does not contain any the necessary and relevant information with respect to the ground truth. In other words, it completely misses all the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 2: Barely complete**
+
+**Definition:**
+A response is considered barely complete if it only contains a small percentage of all the necessary and relevant information with respect to the ground truth. In other words, it misses almost all the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes not difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 3: Moderately complete**
+
+**Definition:**
+A response is considered moderately complete if it contains half of the necessary and relevant information with respect to the ground truth. In other words, it miss half of the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollar of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 4: Mostly complete**
+
+**Definition:**
+A response is considered mostly complete if it contains most of the necessary and relevant information with respect to the ground truth. In other words, it misses some minor information - especially claims and statements - established in the ground truth.
+
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 5: Fully complete**
+
+**Definition:**
+A response is considered complete if it perfectly contains all the necessary and relevant information with respect to the ground truth. In other words, it does not miss any information from statements and claims in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+
+# Data
+Response: {{response}}
+Ground Truth: {{ground_truth}}
+
+
+# Tasks
+## Please provide your assessment Score for the previous answer. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be a integer score ("1", "2", ...) based on the categories of the definitions.
+
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+# Output
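The prompty asks the model to wrap its chain of thought, explanation, and score in <S0>/<S1>/<S2> tags, which the evaluator above turns into a score and reason via `parse_quality_evaluator_reason_score`. As a purely illustrative sketch of how such tagged output can be parsed (this is not the package's implementation, whose actual behavior lives in azure/ai/evaluation/_common/utils.py, also changed in this release):

```python
import math
import re
from typing import Tuple


def parse_tagged_output(llm_output: str) -> Tuple[float, str]:
    """Illustrative only: extract the <S2> score and <S1> explanation from the
    tagged completion format requested by response_completeness.prompty."""
    score_match = re.search(r"<S2>\s*(\d+)\s*</S2>", llm_output)
    reason_match = re.search(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)
    score = float(score_match.group(1)) if score_match else math.nan
    reason = reason_match.group(1).strip() if reason_match else ""
    return score, reason


sample = "<S0>Let's think step by step: ...</S0><S1>Covers most claims.</S1><S2>4</S2>"
print(parse_tagged_output(sample))  # (4.0, 'Covers most claims.')
```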
@@ -31,6 +31,8 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: float
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
 
@@ -43,6 +45,15 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a RetrievalEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_retrieval_evaluator]
+            :end-before: [END threshold_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RetrievalEvaluator.
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -57,10 +68,18 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):  # pylint: disable=super-init-not-called
+    def __init__(self, model_config, *, threshold: float=3):  # pylint: disable=super-init-not-called
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better,
+        )
 
     @overload
     def __call__(
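The RetrievalEvaluator hunks above follow the same pattern. A short sketch, again reusing the `model_config` placeholder; the per-turn query/context call shape is an assumption based on the evaluator's documented retrieval scenario, and conversation input is also supported via the `conversation` keyword:

```python
# Sketch only: keyword-only threshold from the __init__ hunk above.
from azure.ai.evaluation import RetrievalEvaluator

retrieval_eval = RetrievalEvaluator(model_config, threshold=3)
result = retrieval_eval(
    query="What is the capital of France?",
    context="Paris is the capital and most populous city of France.",
)
print(result)  # retrieval score plus the pass/fail and threshold fields added by the base class
```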
@@ -8,6 +8,8 @@ from typing_extensions import overload, override
 
 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+import math
 
 
 class RougeType(Enum):
@@ -50,6 +52,14 @@ class RougeScoreEvaluator(EvaluatorBase):
        information from the reference text.
 
    ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+    :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
+    :type rouge_type: str
+    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+    :type precision_threshold: float
+    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+    :type recall_threshold: float
+    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+    :type f1_score_threshold: float
 
    .. admonition:: Example:
 
@@ -59,15 +69,94 @@ class RougeScoreEvaluator(EvaluatorBase):
            :language: python
            :dedent: 8
            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_rouge_score_evaluator]
+            :end-before: [END threshold_rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
    """
 
    id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
    @override
-    def __init__(self, rouge_type: RougeType):
+    def __init__(
+        self,
+        rouge_type: RougeType,
+        *,
+        precision_threshold: float = 0.5,
+        recall_threshold: float = 0.5,
+        f1_score_threshold: float = 0.5
+    ):
        self._rouge_type = rouge_type
+        self._higher_is_better = True
        super().__init__()
+
+        # Type checking for threshold parameters
+        for name, value in [
+            ("precision_threshold", precision_threshold),
+            ("recall_threshold", recall_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, float):
+                raise TypeError(f"{name} must be a float, got {type(value)}")
+
+        self._threshold = {
+            "precision": precision_threshold,
+            "recall": recall_threshold,
+            "f1_score": f1_score_threshold,
+        }
+
+    def _get_binary_result(
+        self,
+        rouge_precision: float,
+        rouge_recall: float,
+        rouge_f1_score: float,
+    ) -> Dict[str, bool]:
+        """
+        Get binary result based on the threshold.
+
+        :param rouge_precision: The precision score.
+        :type rouge_precision: float
+        :param rouge_recall: The recall score.
+        :type rouge_recall: float
+        :param rouge_f1_score: The F1 score.
+        :type rouge_f1_score: float
+        :return: A dictionary with binary results for precision, recall, and F1 score.
+
+        """
+        # Initialize results with False for NaN values
+        results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+
+        # Check if values are valid (not NaN) before comparison
+        precision_valid = not math.isnan(rouge_precision)
+        recall_valid = not math.isnan(rouge_recall)
+        f1_valid = not math.isnan(rouge_f1_score)
+
+        if self._higher_is_better:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
+        else:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
+        return results
 
    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
@@ -82,10 +171,30 @@ class RougeScoreEvaluator(EvaluatorBase):
        response = eval_input["response"]
        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+        binary_results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+        # Convert metrics to floats, using nan for None or non-convertible values
+        rouge_precision = float(metrics.precision) if metrics.precision is not None else float('nan')
+        rouge_recall = float(metrics.recall) if metrics.recall is not None else float('nan')
+        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float('nan')
+        binary_results = self._get_binary_result(
+            rouge_precision=rouge_precision,
+            rouge_recall=rouge_recall,
+            rouge_f1_score=rouge_f1_score,
+        )
        return {
-            "rouge_precision": metrics.precision,
-            "rouge_recall": metrics.recall,
-            "rouge_f1_score": metrics.fmeasure,
+            "rouge_precision": rouge_precision,
+            "rouge_recall": rouge_recall,
+            "rouge_f1_score": rouge_f1_score,
+            "rouge_precision_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_precision_result"]],
+            "rouge_recall_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_recall_result"]],
+            "rouge_f1_score_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_f1_score_result"]],
+            "rouge_precision_threshold": self._threshold["precision"],
+            "rouge_recall_threshold": self._threshold["recall"],
+            "rouge_f1_score_threshold": self._threshold["f1_score"],
        }
 
    @overload  # type: ignore
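The RougeScoreEvaluator hunks above add per-metric thresholds and pass/fail outputs mapped through `EVALUATION_PASS_FAIL_MAPPING`. A brief usage sketch; this evaluator runs locally and does not need a model configuration, and the thresholds must be floats or the added type check raises:

```python
# Sketch only: constructor keywords and output keys follow the _rouge.py diff above.
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge = RougeScoreEvaluator(
    RougeType.ROUGE_L,
    precision_threshold=0.6,
    recall_threshold=0.5,
    f1_score_threshold=0.55,
)
result = rouge(
    ground_truth="Paris is the capital of France.",
    response="The capital of France is Paris.",
)
# Per the _do_eval hunk: rouge_precision/recall/f1_score plus matching
# *_result ("pass"/"fail") and *_threshold keys.
print(result)
```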
@@ -27,6 +27,8 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
+    :type threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
@@ -39,6 +41,15 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
 
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_groundedness_pro_evaluator]
+            :end-before: [END threshold_groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call GroundednessProEvaluator with a query, response, and context.
+
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the aggregated metric
@@ -53,14 +64,18 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
        self,
        credential,
        azure_ai_project,
+        *,
+        threshold: int = 5,
        **kwargs,
    ):
-        self._passing_score = 5  # TODO update once the binarization PR is merged
+        self.threshold = threshold
+        self._higher_is_better = True
        self._output_prefix = "groundedness_pro"
        super().__init__(
            eval_metric=EvaluationMetrics.GROUNDEDNESS,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            threshold=self.threshold,
            **kwargs,
        )
 
@@ -141,8 +156,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
        """
        result = await super()._do_eval(eval_input)
        real_result = {}
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
        real_result[self._output_prefix + "_label"] = (
-            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.threshold
        )
-        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        if self._higher_is_better:
+            real_result[self._output_prefix + "_score"] = max(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 0)
+        else:
+            real_result[self._output_prefix + "_score"] = min(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 1)
+
        return real_result
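The GroundednessProEvaluator hunks above replace the hard-coded `_passing_score` with a configurable `threshold` and add a `groundedness_pro_score` output. A usage sketch under the assumption of an Azure AI project and Entra ID authentication; the project values are placeholders, and this evaluator calls the Azure AI safety evaluation service rather than an OpenAI deployment:

```python
# Sketch only: constructor keywords follow the __init__ hunk above; project values are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<project-name>",            # placeholder
}

groundedness_pro = GroundednessProEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
    threshold=5,
)
result = groundedness_pro(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital is Paris.",
)
# Per the _do_eval hunk: groundedness_pro_label (score >= threshold),
# groundedness_pro_reason, and groundedness_pro_score.
print(result)
```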