azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
- azure/ai/evaluation/_aoai/label_grader.py +2 -2
- azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
- azure/ai/evaluation/_common/__init__.py +3 -1
- azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
- azure/ai/evaluation/_common/onedp/operations/_operations.py +4 -2
- azure/ai/evaluation/_common/rai_service.py +7 -6
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +15 -17
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +14 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +264 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +503 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +69 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +237 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -0
- azure/ai/evaluation/red_team/_red_team.py +572 -207
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +570 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +5 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +2 -2
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +3 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +15 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/METADATA +35 -3
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/RECORD +69 -61
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py CHANGED
@@ -53,6 +53,16 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a GroundednessEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
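The admonition added above only points at a sample file. As a rough, non-authoritative sketch of the pattern it refers to, the snippet below initializes a GroundednessEvaluator with an Azure OpenAI model configuration and a threshold and scores one query/response/context triple; the endpoint, deployment, and key are placeholders, and the actual content of evaluation_samples_evaluate_fdp.py may differ.

# Hedged sketch only; endpoint/deployment/key are placeholders, not values from this package.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

groundedness = GroundednessEvaluator(model_config=model_config, threshold=3)
result = groundedness(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="The Alpine Explorer Tent has a rainfly with a 3000mm waterproof rating.",
)
print(result)  # expected keys include "groundedness" and "groundedness_reason"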
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py CHANGED
@@ -33,6 +33,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START intent_resolution_evaluator]
+            :end-before: [END intent_resolution_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     _PROMPTY_FILE = "intent_resolution.prompty"
azure/ai/evaluation/_evaluators/_meteor/_meteor.py CHANGED
@@ -45,6 +45,16 @@ class MeteorScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py CHANGED
@@ -37,6 +37,17 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
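For the service-backed evaluators such as ProtectedMaterialEvaluator, the new admonitions describe passing the Azure AI project endpoint URL directly. Below is a minimal sketch, assuming the 1.8.0 constructor accepts that URL form for azure_ai_project as the new caption indicates; the URL and credential are placeholders and the real sample may differ.

# Hedged sketch; the project URL is a placeholder in the documented format.
from azure.ai.evaluation import ProtectedMaterialEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

protected_material = ProtectedMaterialEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)
result = protected_material(
    query="Reproduce the full lyrics of a popular copyrighted song.",
    response="I can't share the full lyrics, but here is a brief, original summary instead.",
)
print(result)  # expected keys include "protected_material_label" and "protected_material_reason"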
azure/ai/evaluation/_evaluators/_qa/_qa.py CHANGED
@@ -48,6 +48,16 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_relevance/_relevance.py CHANGED
@@ -39,6 +39,16 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py CHANGED
@@ -37,13 +37,26 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
     .. admonition:: Example:
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
             :start-after: [START completeness_evaluator]
             :end-before: [END completeness_evaluator]
             :language: python
             :dedent: 8
             :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START completeness_evaluator]
+            :end-before: [END completeness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     # Constants must be defined within eval's directory to be save/loadable
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py CHANGED
@@ -45,6 +45,16 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a RetrievalEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START retrieval_evaluator]
+            :end-before: [END retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_rouge/_rouge.py CHANGED
@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 from enum import Enum
 
-from typing import Dict
+from typing import Dict, Union
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
@@ -12,7 +12,7 @@ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 import math
 
 
-class RougeType(Enum):
+class RougeType(str, Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
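Mixing str into RougeType means each member now behaves as its underlying string value, which simplifies comparing and serializing the configured rouge type. A standalone illustration of the difference (the member shown is illustrative, not necessarily the library's exact member set):

# Standalone illustration of the Enum -> (str, Enum) change; not library code.
from enum import Enum

class PlainType(Enum):
    ROUGE_L = "rougeL"

class StrType(str, Enum):  # the 1.8.0 pattern: class RougeType(str, Enum)
    ROUGE_L = "rougeL"

print(PlainType.ROUGE_L == "rougeL")  # False: plain Enum members are not strings
print(StrType.ROUGE_L == "rougeL")    # True: str-mixin members compare as their string value
print(StrType.ROUGE_L.lower())        # "rougel": str methods work directly on the member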
@@ -70,6 +70,16 @@ class RougeScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -169,8 +179,8 @@ class RougeScoreEvaluator(EvaluatorBase):
         """
         ground_truth = eval_input["ground_truth"]
         response = eval_input["response"]
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type
-        metrics = scorer.score(ground_truth, response)[self._rouge_type
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type]
         binary_results = {
             "rouge_precision_result": False,
             "rouge_recall_result": False,
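The corrected lines pass the configured rouge type straight to the vendored rouge_scorer, which works cleanly now that RougeType is a string enum. For context, here is a hedged sketch of how the public evaluator around this code is typically called; the output key names shown are indicative only.

# Hedged usage sketch; no model configuration is needed for this metric-based evaluator.
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)
result = rouge(
    ground_truth="The capital of France is Paris.",
    response="Paris is the capital of France.",
)
print(result)  # e.g. rouge_precision / rouge_recall / rouge_f1_score plus pass/fail fields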
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py CHANGED
@@ -41,6 +41,16 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_similarity/_similarity.py CHANGED
@@ -40,6 +40,16 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py CHANGED
@@ -42,6 +42,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     _PROMPTY_FILE = "task_adherence.prompty"
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py CHANGED
@@ -45,6 +45,16 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a ToolCallAccuracyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_call_accuracy_evaluator]
+            :end-before: [END tool_call_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -214,12 +224,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-
-
-
-
-
-
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
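Note what the new guard does: only a parsed score inside [0, 1] is accepted, and it is folded into a boolean per-tool verdict via bool(float(score)), so "1" maps to True and "0" to False; an out-of-range or missing score now raises an EvaluationException instead of silently producing a partial result. A tiny standalone illustration of that conversion:

# Standalone illustration of the score-to-verdict conversion used above; not library code.
for raw in ("0", "1"):
    score = float(raw)
    if 0 <= score <= 1:
        print(raw, "->", bool(score))  # "0" -> False, "1" -> True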
@@ -231,13 +247,55 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        if len(eval_input_list) == 0:
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "No tool calls were made.",
+                    "per_tool_call_details": []
+                    }
+
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-
+            if self._is_applicable_tool(eval_input):
+                per_turn_results.append(await self._do_eval(eval_input))
+            else:
+                per_turn_results.append(self._not_applicable_result(eval_input))
 
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    def _is_applicable_tool(self, eval_input):
+        """Determine if a given tool should be evaluated, since we only evaluate tools that
+        have sufficient context available.
+
+        :type eval_input: Dict
+        :return: True if the tool call should be evaluated
+        :rtype: bool
+        """
+        tool_definition = eval_input.get("tool_definition")
+        if tool_definition is None or len(tool_definition) != 1:
+            return False
+        tool_type = tool_definition[0].get("type")
+        if tool_type is None or tool_type != "function":
+            return False
+        return True
+
+    def _not_applicable_result(self, eval_input):
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return {
+            f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_reason": "Tool call not supported for evaluation",
+            "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+        }
+
     def _aggregate_results(self, per_turn_results):
         """Aggregate the evaluation results of each conversation turn into a single result.
 
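Per the new helper, a tool call is evaluated only when exactly one matching tool definition is available and that definition is a "function" tool; everything else is reported through _not_applicable_result. A standalone illustration of the rule, with hypothetical tool names:

# Standalone illustration mirroring the _is_applicable_tool rule above; tool names are hypothetical.
def is_applicable(tool_definition):
    # Exactly one definition must be present, and it must describe a "function" tool.
    if tool_definition is None or len(tool_definition) != 1:
        return False
    return tool_definition[0].get("type") == "function"

function_tool = [{"type": "function", "name": "fetch_weather", "parameters": {"type": "object"}}]
builtin_tool = [{"type": "file_search"}]

print(is_applicable(function_tool))  # True  -> evaluated via _do_eval
print(is_applicable(builtin_tool))   # False -> reported via _not_applicable_result
print(is_applicable(None))           # False -> no matching definition was found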
@@ -260,11 +318,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                    "per_tool_call_details": []
+                    }
+        # ignore not_applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] =
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
 
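With this aggregation, the reported score is the fraction of applicable tool calls judged accurate; not-applicable entries are excluded from the denominator, and if nothing was applicable the aggregate itself becomes "not applicable". A worked example with hypothetical per-tool results (the result key and threshold here are illustrative):

# Worked example of the aggregation arithmetic above; key name and threshold are illustrative.
NOT_APPLICABLE = "not applicable"
per_tool = [
    {"accurate": True},
    {"accurate": False},
    {"accurate": NOT_APPLICABLE},  # excluded from the denominator
]

num_evaluated = len([r for r in per_tool if r["accurate"] != NOT_APPLICABLE])
score = sum(r["accurate"] is True for r in per_tool) / num_evaluated
threshold = 0.8  # illustrative threshold
print(score, "pass" if score >= threshold else "fail")  # 0.5 fail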
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py CHANGED
@@ -41,6 +41,16 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START ungrounded_attributes_evaluator]
+            :end-before: [END ungrounded_attributes_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED
@@ -54,6 +54,17 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an IndirectAttackEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START indirect_attack_evaluator]
+            :end-before: [END indirect_attack_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED
@@ -161,6 +161,8 @@ class _SafetyEvaluation:
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Dict[str, str]:
         """
         Generates synthetic conversations based on provided parameters.
@@ -245,6 +247,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -258,6 +262,8 @@ class _SafetyEvaluation:
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
             jailbreak_outputs = simulator_outputs["jailbreak"]
             simulator_outputs = simulator_outputs["regular"]
@@ -275,6 +281,7 @@ class _SafetyEvaluation:
                 num_queries=max_simulation_results,
                 target=callback,
                 text=source_text if source_text else "",
+                concurrent_async_tasks=concurrent_async_tasks,
             )
 
         ## Run AdversarialSimulator
@@ -290,6 +297,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         ## If no outputs are generated, raise an exception
@@ -677,7 +686,9 @@ class _SafetyEvaluation:
         violence_defects = []
         sexual_defects = []
         self_harm_defects = []
-
+        jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
             hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
@@ -716,8 +727,10 @@ class _SafetyEvaluation:
         data_path: Optional[Union[str, os.PathLike]] = None,
         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
         output_path: Optional[Union[str, os.PathLike]] = None,
-        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
-
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
+    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
         '''
         Evaluates the target function based on the provided parameters.
 
@@ -744,12 +757,17 @@ class _SafetyEvaluation:
         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
         :type data_path: Optional[Union[str, os.PathLike]]
         :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
-        :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
-        :param output_path: The path to write the evaluation results to if set.
+        :type jailbreak_data_path: Optional[Union[str, os.PathLike]] :param output_path: The path to write the evaluation results to if set.
         :type output_path: Optional[Union[str, os.PathLike]]
+        :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+        :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]]
+        :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+        :type randomization_seed: Optional[int]
+        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+        :type concurrent_async_tasks: Optional[int]
         '''
-        ## Log inputs
-        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        ## Log inputs
+        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
 
         ## Validate arguments
         self._validate_inputs(
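The two new knobs documented here are threaded down to the simulators (which take the singular concurrent_async_task). As a hedged sketch of what they control, the snippet below runs the public AdversarialSimulator with a fixed randomization_seed for reproducible prompt selection and a bound on concurrent tasks; the project identifiers and the target callback are placeholders, not values from this package.

# Hedged sketch; project identifiers are placeholders and the callback is a stand-in target.
import asyncio
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

async def target_callback(messages, stream=False, session_state=None, context=None):
    # Stand-in application under test: always return a canned refusal.
    messages["messages"].append({"role": "assistant", "content": "I can't help with that."})
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}

async def main():
    simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,
        target=target_callback,
        max_simulation_results=4,
        randomization_seed=42,     # new in this release: fixes prompt selection across runs
        concurrent_async_task=2,   # note the singular name on the simulator call
    )
    print(len(outputs))

asyncio.run(main())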
@@ -779,6 +797,7 @@ class _SafetyEvaluation:
                 tasks=tasks,
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                randomization_seed=randomization_seed,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
azure/ai/evaluation/_version.py CHANGED