azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -41,9 +41,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the Sexual evaluator. Default is 3.
     :type threshold: int

@@ -55,17 +55,17 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call a SexualEvaluator.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START sexual_evaluator]
         :end-before: [END sexual_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:

     .. literalinclude:: ../samples/evaluation_samples_threshold.py

@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a SexualEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/sexual"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,

@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload

@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The
+        :return: The sexual score.
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
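Note: the hunks above widen azure_ai_project to accept a project endpoint string in addition to an AzureAIProject instance, and thread **kwargs through to the base class. A minimal usage sketch under the new signature (the endpoint URL and inputs are placeholders; DefaultAzureCredential assumes azure-identity is installed); ViolenceEvaluator below changes the same way:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import SexualEvaluator

    sexual_eval = SexualEvaluator(
        credential=DefaultAzureCredential(),
        # New in this release: a plain project endpoint string is accepted here.
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
        threshold=3,  # default, shown for clarity
    )
    # "query" is listed in _OPTIONAL_PARAMS, so response-only inputs may also be accepted.
    result = sexual_eval(query="What is the capital of France?", response="Paris.")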
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -41,9 +41,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the Violence evaluator. Default is 3.
     :type threshold: int

@@ -57,15 +57,15 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :caption: Initialize and call a ViolenceEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START violence_evaluator]
         :end-before: [END violence_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example:

     .. literalinclude:: ../samples/evaluation_samples_threshold.py

@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a ViolenceEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/violence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,

@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py

@@ -4,8 +4,4 @@

 from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument

-__all__ = [
-    "DocumentRetrievalEvaluator",
-    "RetrievalGroundTruthDocument",
-    "RetrievedDocument"
-]
+__all__ = ["DocumentRetrievalEvaluator", "RetrievalGroundTruthDocument", "RetrievedDocument"]
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -14,9 +14,7 @@ RetrievalGroundTruthDocument = TypedDict(
     "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
 )

-RetrievedDocument = TypedDict(
-    "RetrievedDocument", {"document_id": str, "relevance_score": float}
-)
+RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})


 class DocumentRetrievalEvaluator(EvaluatorBase):

@@ -33,15 +31,15 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         :caption: Initialize and call a DocumentRetrievalEvaluator

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START document_retrieval_evaluator]
         :end-before: [END document_retrieval_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:
     .. literalinclude:: ../samples/evaluation_samples_threshold.py
         :start-after: [START threshold_document_retrieval_evaluator]

@@ -51,6 +49,9 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
     """

+    id = "azureai://built-in/evaluators/document_retrieval"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(
         self,
         *,

@@ -62,7 +63,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         top1_relevance_threshold: Optional[float] = 50.0,
         top3_max_relevance_threshold: Optional[float] = 50.0,
         total_retrieved_documents_threshold: Optional[int] = 50,
-        total_ground_truth_documents_threshold: Optional[int] = 50
+        total_ground_truth_documents_threshold: Optional[int] = 50,
     ):
         super().__init__()
         self.k = 3

@@ -74,14 +75,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         )

         if not isinstance(ground_truth_label_min, int):
-            raise EvaluationException(
-                "The ground truth label minimum must be an integer value."
-            )
+            raise EvaluationException("The ground truth label minimum must be an integer value.")

         if not isinstance(ground_truth_label_max, int):
-            raise EvaluationException(
-                "The ground truth label maximum must be an integer value."
-            )
+            raise EvaluationException("The ground truth label maximum must be an integer value.")

         self.ground_truth_label_min = ground_truth_label_min
         self.ground_truth_label_max = ground_truth_label_max

@@ -122,7 +119,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     ) -> float:
         """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
         NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.

@@ -145,7 +142,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
         """XDCG calculated for the top K documents retrieved from a search query.
         XDCG measures how objectively good are the top K documents, discounted by their position in the list.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :return: The XDCG@K calculation result.

@@ -159,11 +156,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             return math.pow(self.xdcg_discount_factor, rank - 1)

         ranks = list(range(1, self.k + 1))
-        xdcg_n = sum(
-            starmap(
-                calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
-            )
-        )
+        xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
         xdcg_d = sum(map(calculate_xdcg_denominator, ranks))

         return xdcg_n / float(xdcg_d)
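Note: the reflowed _compute_xdcg keeps the same shape: a numerator summed over the top-K labels zipped with ranks, divided by the sum of the per-rank discounts. A standalone sketch of that aggregation with toy numbers; the numerator helper is not visible in this hunk, so label * discount(rank) and the 0.6 discount factor are assumptions for illustration only:

    from itertools import starmap

    K = 3
    DISCOUNT = 0.6  # assumed stand-in for self.xdcg_discount_factor

    def calculate_xdcg_denominator(rank: int) -> float:
        return DISCOUNT ** (rank - 1)  # mirrors the visible math.pow(...) line

    def calculate_xdcg_numerator(label: int, rank: int) -> float:
        return label * calculate_xdcg_denominator(rank)  # assumed form

    labels = [3, 1, 2]  # ground-truth labels of the top-3 retrieved documents
    ranks = list(range(1, K + 1))
    xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(labels, ranks)))
    xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
    print(xdcg_n / float(xdcg_d))  # (3.0 + 0.6 + 0.72) / (1.0 + 0.6 + 0.36)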
@@ -175,7 +168,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     ) -> float:
         """Fidelity calculated over all documents retrieved from a search query.
         Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.

@@ -196,25 +189,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 if label >= s:
                     label_counts[str(label)] += 1

-            sorted_label_counts = [
-                x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
-            ]
+            sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]

             # calculate weights
-            weights = [
-                (math.pow(2, i + 1) - 1)
-                for i in range(s, self.ground_truth_label_max + 1)
-            ]
+            weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]

             # return weighted sum
             return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))

-        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
-            result_docs_groundtruth_labels
-        )
-        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
-            ideal_docs_groundtruth_labels
-        )
+        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)

         if weighted_sum_by_rating_index == 0:
             return math.nan
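Note: the fidelity hunks compute, for the retrieved set and the ideal set alike, a weighted count of documents at each relevance label (weight 2^(label+1) - 1), guarding against a zero ideal sum. A self-contained sketch with toy data; s (the lowest label that counts) and the label range 1..3 are assumptions, and the final ratio is implied by the visible zero-sum guard rather than shown in this hunk:

    import math
    import operator
    from itertools import starmap

    def weighted_sum_by_rating(labels, s=1, label_max=3):
        counts = {str(i): 0 for i in range(s, label_max + 1)}  # initialization assumed
        for label in labels:
            if label >= s:
                counts[str(label)] += 1
        sorted_counts = [x[1] for x in sorted(counts.items(), key=lambda x: x[0])]
        weights = [(math.pow(2, i + 1) - 1) for i in range(s, label_max + 1)]  # 3, 7, 15
        return sum(starmap(operator.mul, zip(sorted_counts, weights)))

    result_sum = weighted_sum_by_rating([3, 0, 1])  # retrieved docs' labels -> 18.0
    ideal_sum = weighted_sum_by_rating([3, 2, 1])   # ideal docs' labels -> 25.0
    print(math.nan if ideal_sum == 0 else result_sum / ideal_sum)  # 0.72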
@@ -226,12 +210,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):

         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] =
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                )
                 result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True

             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] =
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+                )
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False

@@ -256,8 +244,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         # if the qrels are empty, no meaningful evaluation is possible
         if not retrieval_ground_truth:
             raise EvaluationException(
-                (
-
+                (
+                    "'retrieval_ground_truth' parameter must contain at least one item. "
+                    "Check your data input to be sure that each input record has ground truth defined."
+                )
             )

         qrels = []

@@ -277,9 +267,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             )

             if not isinstance(query_relevance_label, int):
-                raise EvaluationException(
-                    "Query relevance labels must be integer values."
-                )
+                raise EvaluationException("Query relevance labels must be integer values.")

             if query_relevance_label < self.ground_truth_label_min:
                 raise EvaluationException(

@@ -318,12 +306,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 )
             )

-            if not isinstance(relevance_score, float) and not isinstance(
-                relevance_score, int
-            ):
-                raise EvaluationException(
-                    "Retrieved document relevance score must be a numerical value."
-                )
+            if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+                raise EvaluationException("Retrieved document relevance score must be a numerical value.")

             results.append(result)

@@ -368,24 +352,17 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         results_lookup = {x["document_id"]: x["relevance_score"] for x in results}

         # sort each input set by label to get the ranking
-        qrels_sorted_by_rank = sorted(
-            qrels_lookup.items(), key=lambda x: x[1], reverse=True
-        )
-        results_sorted_by_rank = sorted(
-            results_lookup.items(), key=lambda x: x[1], reverse=True
-        )
+        qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+        results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)

         # find ground truth labels for the results set and ideal set
         result_docs_groundtruth_labels = [
-            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
-            for (doc_id, _) in results_sorted_by_rank
+            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
         ]
         ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]

         # calculate the proportion of result docs with no ground truth label (holes)
-        holes = self._compute_holes(
-            [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
-        )
+        holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
         holes_ratio = holes / float(len(results))

         # if none of the retrieved docs are labeled, report holes only

@@ -412,12 +389,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 result_docs_groundtruth_labels[: self.k],
                 ideal_docs_groundtruth_labels[: self.k],
             ),
-            f"xdcg@{self.k}": self._compute_xdcg(
-                result_docs_groundtruth_labels[: self.k]
-            ),
-            "fidelity": self._compute_fidelity(
-                result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
-            ),
+            f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+            "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
             "top1_relevance": result_docs_groundtruth_labels[0],
             "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
             "holes": holes,
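Note: the final hunks above rank both the ground truth (qrels) and the retrieval results by score, map each retrieved document back to its ground-truth label (0 when unlabeled), and count the unlabeled "holes". A toy walk-through of that flow; _compute_holes itself is not shown in this diff, so counting retrieved ids absent from the ground truth is an assumption that matches its visible inputs:

    qrels_lookup = {"d1": 3, "d2": 1}                   # doc_id -> relevance label
    results_lookup = {"d1": 0.9, "d3": 0.8, "d2": 0.4}  # doc_id -> retrieval score

    qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
    results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)

    result_docs_groundtruth_labels = [
        qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
        for (doc_id, _) in results_sorted_by_rank
    ]  # -> [3, 0, 1]; the unlabeled d3 contributes a hole
    ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]  # -> [3, 1]

    holes = sum(1 for doc_id, _ in results_sorted_by_rank if doc_id not in qrels_lookup)
    holes_ratio = holes / float(len(results_sorted_by_rank))  # 1/3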
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]

@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

     id = "eci"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=_InternalEvaluationMetrics.ECI,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -39,15 +39,15 @@ class F1ScoreEvaluator(EvaluatorBase):
         :caption: Initialize and call an F1ScoreEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START f1_score_evaluator]
         :end-before: [END f1_score_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:

     .. literalinclude:: ../samples/evaluation_samples_threshold.py

@@ -58,7 +58,7 @@ class F1ScoreEvaluator(EvaluatorBase):
         :caption: Initialize with threshold and call an F1ScoreEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/f1_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self, *, threshold=0.5):

@@ -147,7 +147,7 @@ class F1ScoreEvaluator(EvaluatorBase):
         if f1_result <= self._threshold:
             binary_result = True
         return {
-            "f1_score": f1_result,
+            "f1_score": f1_result,
             "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
             "f1_threshold": self._threshold,
         }
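Note: per the hunks above, F1ScoreEvaluator now carries the cloud id azureai://built-in/evaluators/f1_score and returns f1_score, f1_result, and f1_threshold per row. A minimal call sketch (sample strings are placeholders; the exact pass/fail semantics come from EVALUATION_PASS_FAIL_MAPPING, which this diff does not show):

    from azure.ai.evaluation import F1ScoreEvaluator

    f1_eval = F1ScoreEvaluator(threshold=0.5)  # keyword-only, default 0.5 per the hunk
    row = f1_eval(response="Paris is the capital of France.", ground_truth="Paris.")
    # row -> {"f1_score": ..., "f1_result": "pass" or "fail", "f1_threshold": 0.5}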
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -45,7 +45,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a FluencyEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START fluency_evaluator]
         :end-before: [END fluency_evaluator]

@@ -64,7 +64,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "fluency.prompty"
     _RESULT_KEY = "fluency"

-    id = "
+    id = "azureai://built-in/evaluators/fluency"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override

@@ -78,7 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )

     @overload
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -34,7 +34,7 @@ class GleuScoreEvaluator(EvaluatorBase):
         :language: python
         :dedent: 8
         :caption: Initialize and call a GleuScoreEvaluator.
-
+
     .. admonition:: Example with Threshold:

     .. literalinclude:: ../samples/evaluation_samples_threshold.py

@@ -45,17 +45,17 @@ class GleuScoreEvaluator(EvaluatorBase):
         :caption: Initialize with threshold and call a GleuScoreEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START gleu_score_evaluator]
         :end-before: [END gleu_score_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """

-    id = "
+    id = "azureai://built-in/evaluators/gleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -12,9 +12,13 @@ from azure.ai.evaluation._model_configurations import Conversation
 from ..._common.utils import construct_prompty_model_config, validate_model_config

 try:
-    from ..._user_agent import
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"


 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
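Note: the hunk above swaps the old import from ..._user_agent for a UserAgentSingleton class, and the ImportError branch now defines a stub exposing the same value property so call sites stay uniform. The pattern in isolation (the module name here is hypothetical):

    try:
        from my_pkg._user_agent import UserAgentSingleton  # hypothetical real import
    except ImportError:

        class UserAgentSingleton:  # stub with an identical interface
            @property
            def value(self) -> str:
                return "None"

    user_agent = UserAgentSingleton().value  # works whether or not the import succeeded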
@@ -35,7 +39,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the groundedness evaluator. Default is 3.
     :type threshold: int
-
+
     .. admonition:: Example:

     .. literalinclude:: ../samples/evaluation_samples_evaluate.py

@@ -54,13 +58,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a GroundednessEvaluator.

     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START groundedness_evaluator]
         :end-before: [END groundedness_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     .. note::

@@ -75,7 +79,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]

-    id = "
+    id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override

@@ -89,7 +93,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
         self.threshold = threshold

@@ -165,7 +169,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(self._model_config),
             self._DEFAULT_OPEN_API_VERSION,
-
+            UserAgentSingleton().value,
         )
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
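Note: the last hunk feeds UserAgentSingleton().value into construct_prompty_model_config in place of the removed constant. For context, a minimal GroundednessEvaluator usage sketch consistent with the docstring hunks above (the endpoint and deployment values are placeholders, and the dict-shaped model config is an assumption about the accepted AzureOpenAIModelConfiguration mapping):

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://{resource_name}.openai.azure.com",  # placeholder
        "azure_deployment": "{deployment_name}",                        # placeholder
    }
    groundedness = GroundednessEvaluator(model_config, threshold=3)

    # "query" is in _OPTIONAL_PARAMS above, so it can be omitted.
    score = groundedness(
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )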