azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged; see the package's registry listing for details.
- azure/ai/evaluation/__init__.py +27 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +39 -5
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +23 -3
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_red_team.py +1887 -0
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/red_team/_utils/constants.py +65 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
- azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -23,6 +23,18 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -36,6 +48,15 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
     .. note::
 
         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
@@ -46,14 +67,37 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     id = "qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
         evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-            SimilarityEvaluator(model_config),
-            F1ScoreEvaluator(),
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
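For readers tracking the API change, here is a minimal usage sketch of the new keyword-only thresholds. The model configuration values are placeholders, and the call keywords (query, response, context, ground_truth) follow the existing QAEvaluator interface, which this hunk does not change.

```python
from azure.ai.evaluation import QAEvaluator

# Placeholder Azure OpenAI model configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# Each threshold is keyword-only; quality metrics default to 3, f1_score_threshold to 0.5.
qa_eval = QAEvaluator(
    model_config,
    groundedness_threshold=3,
    relevance_threshold=4,
    f1_score_threshold=0.6,
)

result = qa_eval(
    query="Where is Paris?",
    response="Paris is the capital of France.",
    context="Paris is the capital and largest city of France.",
    ground_truth="Paris is in France.",
)
```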
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -27,6 +27,8 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 5.
+    :type threshold: int
 
     .. admonition:: Example:
 
@@ -37,6 +39,15 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -52,10 +63,23 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(
+    def __init__(
+        self,
+        model_config,
+        *,
+        threshold=3
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better
+        )
 
     @overload
     def __call__(
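A minimal sketch of the new constructor, assuming the same placeholder model configuration as above and the call keywords named in the docstring caption (query, response, context). Note that the signature defaults threshold to 3 while the new docstring line states a default of 5.

```python
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

relevance_eval = RelevanceEvaluator(model_config, threshold=3)
result = relevance_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="Paris is the capital and largest city of France.",
)
```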
azure/ai/evaluation/_evaluators/_response_completeness/__init__.py (new file)

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._response_completeness import ResponseCompletenessEvaluator
+
+__all__ = ["ResponseCompletenessEvaluator"]
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py (new file)

@@ -0,0 +1,158 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import os
+import math
+from typing import Dict, List, Union, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from azure.ai.evaluation._model_configurations import Conversation, Message
+from azure.ai.evaluation._common._experimental import experimental
+
+
+@experimental
+class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """
+    Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+    provided ground truth.
+    The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
+    claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
+    and relevance of the content provided.
+    The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
+    evaluation of the response's content quality.
+    Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
+    particularly in text generation tasks where conveying all essential details is crucial for clarity,
+    context, and correctness.
+    Completeness scores range from 1 to 5:
+    1: Fully incomplete — Contains none of the necessary information.
+    2: Barely complete — Contains only a small portion of the required information.
+    3: Moderately complete — Covers about half of the required content.
+    4: Mostly complete — Includes most of the necessary details with minimal omissions.
+    5: Fully complete — Contains all key information without any omissions.
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START completeness_evaluator]
+            :end-before: [END completeness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
+    """
+
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "response_completeness.prompty"
+    _RESULT_KEY = "response_completeness"
+
+    id = "completeness"
+
+    _MIN_COMPLETENESS_SCORE = 1
+    _MAX_COMPLETENESS_SCORE = 5
+    _DEFAULT_COMPLETENESS_THRESHOLD = 3
+
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        ground_truth: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
+        Example usage:
+        Evaluating completeness for a response string
+        ```python
+        from azure.ai.evaluation import CompletenessEvaluator
+        completeness_evaluator = CompletenessEvaluator(model_config)
+        ground_truth = "The ground truth to be evaluated."
+        response = "The response to be evaluated."
+        completeness_results = completeness_evaluator(ground_truth=ground_truth, response=response)
+        ```
+        :keword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: Union[str, List[Message]]
+        :return: The response completeness score results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate completeness for a conversation
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The fluency score
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do completeness evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if "ground_truth" not in eval_input or "response" not in eval_input:
+            raise EvaluationException(
+                message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
+                internal_message=f"Both ground_truth and response must be provided as input to the completeness"
+                f" evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.COMPLETENESS_EVALUATOR,
+            )
+
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+            score_result = 'pass' if score >= self.threshold else 'fail'
+
+            # updating the result key and threshold to int based on the schema
+            return {
+                f"{self._result_key}": int(score),
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": int(self.threshold),
+                f"{self._result_key}_reason": reason,
+            }
+
+        return {self._result_key: math.nan}
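A usage sketch for the new evaluator, based on the constructor and the _do_eval output keys shown above. The import from the package root assumes the evaluator is re-exported there (the 27-line addition to azure/ai/evaluation/__init__.py suggests so); otherwise import it from azure.ai.evaluation._evaluators._response_completeness. Model configuration values are placeholders.

```python
from azure.ai.evaluation import ResponseCompletenessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# threshold defaults to 3; scores below it yield a "fail" result.
completeness = ResponseCompletenessEvaluator(model_config, threshold=3)
result = completeness(
    ground_truth="Paris is the capital of France.",
    response="The capital of France is Paris.",
)
# Per _do_eval above, result carries: response_completeness,
# response_completeness_result, response_completeness_threshold,
# response_completeness_reason.
print(result)
```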
azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty (new file)

@@ -0,0 +1,99 @@
+---
+name: Completeness
+description: Evaluates Completeness score for QA scenario
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    seed: 123
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  response:
+    type: string
+  ground_truth:
+    type: string
+
+---
+system:
+# Instruction
+## Context
+### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: You are given a definition of the response quality that is being evaluated to help guide your Score.
+- **Data**: Your input data include a response and its ground truth.
+- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+
+# Definition
+
+**Level 1: Fully incomplete**
+
+**Definition:**
+A response is considered fully incomplete if it does not contain any the necessary and relevant information with respect to the ground truth. In other words, it completely misses all the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 2: Barely complete**
+
+**Definition:**
+A response is considered barely complete if it only contains a small percentage of all the necessary and relevant information with respect to the ground truth. In other words, it misses almost all the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes not difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 3: Moderately complete**
+
+**Definition:**
+A response is considered moderately complete if it contains half of the necessary and relevant information with respect to the ground truth. In other words, it miss half of the information - especially claims and statements - established in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollar of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 4: Mostly complete**
+
+**Definition:**
+A response is considered mostly complete if it contains most of the necessary and relevant information with respect to the ground truth. In other words, it misses some minor information - especially claims and statements - established in the ground truth.
+
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+**Level 5: Fully complete**
+
+**Definition:**
+A response is considered complete if it perfectly contains all the necessary and relevant information with respect to the ground truth. In other words, it does not miss any information from statements and claims in the ground truth.
+
+**Examples:**
+1. **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+   **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+
+# Data
+Response: {{response}}
+Ground Truth: {{ground_truth}}
+
+
+# Tasks
+## Please provide your assessment Score for the previous answer. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be a integer score ("1", "2", ...) based on the categories of the definitions.
+
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+# Output
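The prompty asks the model to wrap its chain of thought, explanation, and score in <S0>, <S1>, and <S2> tags. The SDK parses this with its own parse_quality_evaluator_reason_score helper (not shown in this diff); the sketch below is only an illustration of how such tagged output can be pulled apart.

```python
import math
import re
from typing import Tuple


def parse_tagged_output(llm_output: str) -> Tuple[float, str]:
    """Illustrative parser for the <S1>/<S2> tags requested by the prompty above."""
    explanation = re.search(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)
    score = re.search(r"<S2>\s*(\d+)\s*</S2>", llm_output, re.DOTALL)
    reason = explanation.group(1).strip() if explanation else ""
    value = float(score.group(1)) if score else math.nan
    return value, reason


value, reason = parse_tagged_output(
    "<S0>Let's think step by step: ...</S0><S1>Covers all claims.</S1><S2>5</S2>"
)
assert value == 5.0 and reason == "Covers all claims."
```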
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -31,6 +31,8 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: float
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
 
@@ -43,6 +45,15 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a RetrievalEvaluator.
 
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_retrieval_evaluator]
+            :end-before: [END threshold_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RetrievalEvaluator.
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -57,10 +68,18 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):  # pylint: disable=super-init-not-called
+    def __init__(self, model_config, *, threshold: float=3):  # pylint: disable=super-init-not-called
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better,
+        )
 
     @overload
     def __call__(
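A minimal sketch of the updated constructor. The model configuration values are placeholders, and the call keywords (query, context) are assumptions based on the retrieval scenario; they are not part of this hunk.

```python
from azure.ai.evaluation import RetrievalEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# threshold is keyword-only and defaults to 3.
retrieval_eval = RetrievalEvaluator(model_config, threshold=2)
result = retrieval_eval(
    query="What is the capital of France?",
    context="Paris is the capital and most populous city of France.",
)
```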
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -8,6 +8,8 @@ from typing_extensions import overload, override
 
 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+import math
 
 
 class RougeType(Enum):
@@ -50,6 +52,14 @@ class RougeScoreEvaluator(EvaluatorBase):
     information from the reference text.
 
     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+    :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
+    :type rouge_type: str
+    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+    :type precision_threshold: float
+    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+    :type recall_threshold: float
+    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+    :type f1_score_threshold: float
 
     .. admonition:: Example:
 
@@ -59,15 +69,94 @@ class RougeScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_rouge_score_evaluator]
+            :end-before: [END threshold_rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
     """
 
     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(
+    def __init__(
+        self,
+        rouge_type: RougeType,
+        *,
+        precision_threshold: float = 0.5,
+        recall_threshold: float = 0.5,
+        f1_score_threshold: float = 0.5
+    ):
         self._rouge_type = rouge_type
+        self._higher_is_better = True
         super().__init__()
+
+        # Type checking for threshold parameters
+        for name, value in [
+            ("precision_threshold", precision_threshold),
+            ("recall_threshold", recall_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, float):
+                raise TypeError(f"{name} must be a float, got {type(value)}")
+
+        self._threshold = {
+            "precision": precision_threshold,
+            "recall": recall_threshold,
+            "f1_score": f1_score_threshold,
+        }
+
+    def _get_binary_result(
+        self,
+        rouge_precision: float,
+        rouge_recall: float,
+        rouge_f1_score: float,
+    ) -> Dict[str, bool]:
+        """
+        Get binary result based on the threshold.
+
+        :param rouge_precision: The precision score.
+        :type rouge_precision: float
+        :param rouge_recall: The recall score.
+        :type rouge_recall: float
+        :param rouge_f1_score: The F1 score.
+        :type rouge_f1_score: float
+        :return: A dictionary with binary results for precision, recall, and F1 score.
+
+        """
+        # Initialize results with False for NaN values
+        results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+
+        # Check if values are valid (not NaN) before comparison
+        precision_valid = not math.isnan(rouge_precision)
+        recall_valid = not math.isnan(rouge_recall)
+        f1_valid = not math.isnan(rouge_f1_score)
+
+        if self._higher_is_better:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
+        else:
+            if precision_valid:
+                results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+            if recall_valid:
+                results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+            if f1_valid:
+                results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
+
+        return results
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
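A sketch of the new constructor and its threshold validation. The RougeType member names (ROUGE_1, ROUGE_4) are assumed from the existing enum in this module, which the hunk does not show.

```python
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

# Thresholds are keyword-only floats; anything else raises TypeError.
rouge_eval = RougeScoreEvaluator(
    RougeType.ROUGE_4,
    precision_threshold=0.6,
    recall_threshold=0.5,
    f1_score_threshold=0.55,
)

try:
    RougeScoreEvaluator(RougeType.ROUGE_1, precision_threshold=1)  # int, not float
except TypeError as exc:
    print(exc)  # precision_threshold must be a float, got <class 'int'>
```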
@@ -82,10 +171,30 @@ class RougeScoreEvaluator(EvaluatorBase):
         response = eval_input["response"]
         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
         metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+        binary_results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+        # Convert metrics to floats, using nan for None or non-convertible values
+        rouge_precision = float(metrics.precision) if metrics.precision is not None else float('nan')
+        rouge_recall = float(metrics.recall) if metrics.recall is not None else float('nan')
+        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float('nan')
+        binary_results = self._get_binary_result(
+            rouge_precision=rouge_precision,
+            rouge_recall=rouge_recall,
+            rouge_f1_score=rouge_f1_score,
+        )
         return {
-            "rouge_precision":
-            "rouge_recall":
-            "rouge_f1_score":
+            "rouge_precision": rouge_precision,
+            "rouge_recall": rouge_recall,
+            "rouge_f1_score": rouge_f1_score,
+            "rouge_precision_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_precision_result"]],
+            "rouge_recall_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_recall_result"]],
+            "rouge_f1_score_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_f1_score_result"]],
+            "rouge_precision_threshold": self._threshold["precision"],
+            "rouge_recall_threshold": self._threshold["recall"],
+            "rouge_f1_score_threshold": self._threshold["f1_score"],
         }
 
     @overload  # type: ignore
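Continuing the sketch, a call now returns pass/fail strings and the configured thresholds alongside the raw scores; EVALUATION_PASS_FAIL_MAPPING presumably maps the booleans from _get_binary_result to "pass"/"fail" labels, though its definition is not part of this diff. RougeType.ROUGE_L is again an assumed member name.

```python
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge_eval = RougeScoreEvaluator(RougeType.ROUGE_L, f1_score_threshold=0.4)
result = rouge_eval(
    response="Paris is the capital of France.",
    ground_truth="The capital of France is Paris.",
)
# Per the return statement above, result contains:
#   rouge_precision, rouge_recall, rouge_f1_score              (floats)
#   rouge_precision_result, rouge_recall_result,
#   rouge_f1_score_result                                      (pass/fail labels)
#   rouge_precision_threshold, rouge_recall_threshold,
#   rouge_f1_score_threshold                                   (configured thresholds)
for key, value in sorted(result.items()):
    print(key, value)
```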
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -27,6 +27,8 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
+    :type threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
@@ -39,6 +41,15 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
 
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_groundedness_pro_evaluator]
+            :end-before: [END threshold_groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call GroundednessProEvaluator with a query, response, and context.
+
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the aggregated metric
@@ -53,14 +64,18 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         self,
         credential,
         azure_ai_project,
+        *,
+        threshold: int = 5,
         **kwargs,
     ):
-        self.
+        self.threshold = threshold
+        self._higher_is_better = True
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            threshold=self.threshold,
             **kwargs,
         )
 
@@ -141,8 +156,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         """
         result = await super()._do_eval(eval_input)
         real_result = {}
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
         real_result[self._output_prefix + "_label"] = (
-            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.threshold
         )
-
+        if self._higher_is_better:
+            real_result[self._output_prefix + "_score"] = max(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 0)
+        else:
+            real_result[self._output_prefix + "_score"] = min(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 1)
+
         return real_result
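A usage sketch for the updated GroundednessProEvaluator. The AzureAIProject keys and the call keywords (query, response, context) are assumptions based on the docstring wording, not on code shown in these hunks; DefaultAzureCredential stands in for whatever credential you use.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator

# Placeholder project scope; keys follow the docstring's "subscription id,
# resource group, and project name" description.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# threshold is keyword-only and defaults to 5; the *_label output is True
# when the service score is >= threshold.
groundedness_pro = GroundednessProEvaluator(
    DefaultAzureCredential(),
    azure_ai_project,
    threshold=5,
)
result = groundedness_pro(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="Paris is the capital and largest city of France.",
)
# Per _do_eval above: groundedness_pro_reason, groundedness_pro_label,
# groundedness_pro_score
print(result)
```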