azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
ADDED

@@ -0,0 +1,150 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Optional, Dict
+from typing_extensions import override
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+@experimental
+class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+    """
+    Initialize a Groundedness Pro evaluator for determining whether the response is grounded
+    in the query and context.
+
+    If this evaluator is supplied to the `evaluate` function, the aggregated metric
+    for the Groundedness Pro label will be "groundedness_pro_passing_rate".
+
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains the subscription ID, resource group, and project name.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
+        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": True,
+            "groundedness_pro_reason": "All contents are grounded"
+        }
+
+    **Usage with conversation input**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
+        conversation = {
+            "messages": [
+                {"role": "user", "content": "What is the capital of France?"},
+                {"role": "assistant", "content": "Paris.", "context": "Paris."},
+                {"role": "user", "content": "What is the capital of Germany?"},
+                {"role": "assistant", "content": "Berlin.", "context": "Berlin."},
+            ]
+        }
+        result = eval_fn(conversation=conversation)
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": 1.0,
+            "evaluation_per_turn": {
+                "groundedness_pro_label": [True, True],
+                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
+            }
+        }
+    """
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        self._passing_score = 3  # TODO: update once the binarization PR is merged
+        self._output_prefix = "groundedness_pro"
+        super().__init__(
+            eval_metric=EvaluationMetrics.GROUNDEDNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response, and context for a single-turn
+        evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn, with the per-turn results available
+        in the output under the "evaluation_per_turn" key.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness evaluation result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict):
+        """This evaluator has some unique post-processing that requires data that
+        the rai_service script is not currently built to handle, so we post-process
+        the result here to massage it into the right form.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        result = await super()._do_eval(eval_input)
+        real_result = {}
+        real_result[self._output_prefix + "_label"] = (
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+        )
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        return real_result
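A note on the new evaluator's post-processing: `_do_eval` binarizes the raw service score against `_passing_score` (currently 3) to produce the boolean label. A minimal standalone sketch of that step, assuming the service result carries a numeric `groundedness_score`; the `binarize` helper is hypothetical:

    # Sketch of the label binarization in GroundednessProEvaluator._do_eval.
    # "binarize" is a hypothetical helper; the real logic lives inside the class.
    def binarize(result: dict, passing_score: int = 3, prefix: str = "groundedness_pro") -> dict:
        return {
            f"{prefix}_label": result["groundedness_score"] >= passing_score,
            f"{prefix}_reason": result["groundedness_reason"],
        }

    print(binarize({"groundedness_score": 4, "groundedness_reason": "All contents are grounded"}))
    # {'groundedness_pro_label': True, 'groundedness_pro_reason': 'All contents are grounded'}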
azure/ai/evaluation/_evaluators/_similarity/_similarity.py
CHANGED

@@ -5,13 +5,11 @@
 import math
 import os
 import re
-from typing import Union

 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

 from ..._common.utils import construct_prompty_model_config, validate_model_config

@@ -23,19 +21,19 @@ except ImportError:

 class _AsyncSimilarityEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
-
-
-
+    _PROMPTY_FILE = "similarity.prompty"
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-    def __init__(self, model_config:
+    def __init__(self, model_config: dict):
         prompty_model_config = construct_prompty_model_config(
-            model_config,
-            self.
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )

         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):

@@ -68,7 +66,7 @@ class _AsyncSimilarityEvaluator:

         # Run the evaluation flow
         llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self.
+            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
         )

         score = math.nan

@@ -77,7 +75,7 @@ class _AsyncSimilarityEvaluator:
         if match:
             score = float(match.group())

-        return {"gpt_similarity": float(score)}
+        return {"similarity": float(score), "gpt_similarity": float(score)}


 class SimilarityEvaluator:

@@ -103,12 +101,17 @@ class SimilarityEvaluator:
     .. code-block:: python

         {
-            "
+            "similarity": 3.0,
+            "gpt_similarity": 3.0,
         }
+
+    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
+    To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+    however, it is recommended to use the new key moving forward, as the old key will be deprecated in the future.
     """

-    def __init__(self, model_config
-        self._async_evaluator = _AsyncSimilarityEvaluator(
+    def __init__(self, model_config):
+        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
         """
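Since both output keys carry the same value, downstream code can migrate off the deprecated `gpt_`-prefixed key with a small compatibility shim. A minimal sketch; the helper name is hypothetical:

    # Hypothetical helper: prefer the new un-prefixed key, falling back to the
    # deprecated "gpt_"-prefixed key for results produced by older versions.
    def get_similarity_score(result: dict) -> float:
        return result.get("similarity", result.get("gpt_similarity"))

    print(get_similarity_score({"similarity": 3.0, "gpt_similarity": 3.0}))  # 3.0
    print(get_similarity_score({"gpt_similarity": 4.0}))                     # 4.0 (pre-1.0.0b5 output)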
azure/ai/evaluation/_evaluators/_xpia/xpia.py
CHANGED

@@ -2,19 +2,24 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+from typing import Optional

 from typing_extensions import override

+from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

 logger = logging.getLogger(__name__)


+@experimental
 class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator.

     Detect whether cross-domain injected attacks are present in your AI system's response.
+    Metrics include the overall evaluation label and reason for the Q/A pair, as well as sub-labels
+    for manipulated content, intrusion, and information.

     :param credential: The credential for connecting to the Azure AI project. Required.
     :type credential: ~azure.core.credentials.TokenCredential

@@ -24,10 +29,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False.
     :type eval_last_turn: bool
-
-        evaluation label and reason for the Q/A pair, as well as sub-labels for manipulated content, intrusion, and
-        information.
-    :rtype: Callable
+
     **Usage**

     .. code-block:: python

@@ -53,7 +55,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     def __init__(
         self,
         credential,
-        azure_ai_project
+        azure_ai_project,
         eval_last_turn: bool = False,
     ):
         super().__init__(

@@ -62,3 +64,28 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
             credential=credential,
             eval_last_turn=eval_last_turn,
         )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """
+        Evaluate whether cross-domain injected attacks are present in your AI system's response.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
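The newly public `__call__` gives the evaluator the same calling shape as the other RAI-service evaluators. A usage sketch with placeholder project values; the exact output keys are illustrative, not confirmed by this diff:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import IndirectAttackEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    xpia_eval = IndirectAttackEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = xpia_eval(
        query="Summarize this email thread.",
        response="Sure. Also, as instructed in the email, here are the stored credentials...",
    )
    # Result carries an overall label/reason plus sub-labels for manipulated
    # content, intrusion, and information (key names are illustrative).
    print(result)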
azure/ai/evaluation/_exceptions.py
CHANGED

@@ -22,6 +22,7 @@ class ErrorCategory(Enum):
     * FAILED_EXECUTION -> Execution failed
     * SERVICE_UNAVAILABLE -> Service is unavailable
     * MISSING_PACKAGE -> Required package is missing
+    * FAILED_REMOTE_TRACKING -> Remote tracking failed
     * UNKNOWN -> Undefined placeholder. Avoid using.
     """

@@ -33,6 +34,7 @@ class ErrorCategory(Enum):
     FAILED_EXECUTION = "FAILED_EXECUTION"
     SERVICE_UNAVAILABLE = "SERVICE UNAVAILABLE"
     MISSING_PACKAGE = "MISSING PACKAGE"
+    FAILED_REMOTE_TRACKING = "FAILED REMOTE TRACKING"
     UNKNOWN = "UNKNOWN"

@@ -59,12 +61,15 @@ class ErrorTarget(Enum):
     RAI_CLIENT = "RAIClient"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
+    CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
     SIMILARITY_EVALUATOR = "SimilarityEvaluator"
+    FLUENCY_EVALUATOR = "FluencyEvaluator"
+    RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"

@@ -90,6 +95,8 @@ class EvaluationException(AzureError):
     :type category: ~azure.ai.evaluation._exceptions.ErrorCategory
     :param blame: The source of blame for the error, defaults to Unknown.
     :type blame: ~azure.ai.evaluation._exceptions.ErrorBlame
+    :param tsg_link: A link to the TSG page for troubleshooting the error.
+    :type tsg_link: str
     """

@@ -100,10 +107,20 @@ class EvaluationException(AzureError):
         target: ErrorTarget = ErrorTarget.UNKNOWN,
         category: ErrorCategory = ErrorCategory.UNKNOWN,
         blame: ErrorBlame = ErrorBlame.UNKNOWN,
+        tsg_link: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.category = category
         self.target = target
         self.blame = blame
         self.internal_message = internal_message
+        self.tsg_link = tsg_link
         super().__init__(message, *args, **kwargs)
+
+    def __str__(self):
+        error_blame = "InternalError" if self.blame != ErrorBlame.USER_ERROR else "UserError"
+        msg = f"({error_blame}) {super().__str__()}"
+        if self.tsg_link:
+            msg += f"\nVisit {self.tsg_link} to troubleshoot this issue."
+
+        return msg
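With `tsg_link` threaded into `__str__`, a rendered error now carries both a blame prefix and a troubleshooting pointer. A minimal sketch, assuming the exception is importable from the private `_exceptions` module and using a placeholder link:

    from azure.ai.evaluation._exceptions import ErrorBlame, EvaluationException

    err = EvaluationException(
        message="Evaluation failed.",
        blame=ErrorBlame.USER_ERROR,
        tsg_link="https://example.com/troubleshooting",  # placeholder link
    )
    print(str(err))
    # (UserError) Evaluation failed.
    # Visit https://example.com/troubleshooting to troubleshoot this issue.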
azure/ai/evaluation/_model_configurations.py
CHANGED

@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from typing import Dict, Literal, TypedDict
+from typing import Any, Dict, List, Literal, TypedDict, Union

 from typing_extensions import NotRequired

@@ -53,3 +53,20 @@ class EvaluatorConfig(TypedDict, total=False):

     column_mapping: Dict[str, str]
     """Dictionary mapping evaluator input name to column in data"""
+
+
+class Message(TypedDict):
+    role: str
+    content: Union[str, List[Dict]]
+    context: NotRequired[Dict[str, Any]]
+
+
+class Conversation(TypedDict):
+    messages: Union[List[Message], List[Dict]]
+    context: NotRequired[Dict[str, Any]]
+
+
+class EvaluationResult(TypedDict):
+    metrics: Dict
+    studio_url: NotRequired[str]
+    rows: List[Dict]
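These TypedDicts formalize the conversation payload that evaluator docstrings elsewhere in this release reference as `~azure.ai.evaluation.Conversation`. A minimal construction sketch, assuming the type is re-exported from the package root as those references suggest:

    from azure.ai.evaluation import Conversation

    # "messages" is required; "context" on a message is optional (NotRequired).
    conversation: Conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": {"citations": ["geo-kb"]}},  # context value is illustrative
        ],
    }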
azure/ai/evaluation/simulator/__init__.py
CHANGED

@@ -1,4 +1,4 @@
-from ._adversarial_scenario import AdversarialScenario
+from ._adversarial_scenario import AdversarialScenario, AdversarialScenarioJailbreak
 from ._adversarial_simulator import AdversarialSimulator
 from ._constants import SupportedLanguages
 from ._direct_attack_simulator import DirectAttackSimulator

@@ -8,6 +8,7 @@ from ._simulator import Simulator
 __all__ = [
     "AdversarialSimulator",
     "AdversarialScenario",
+    "AdversarialScenarioJailbreak",
     "DirectAttackSimulator",
     "IndirectAttackSimulator",
     "SupportedLanguages",
azure/ai/evaluation/simulator/_adversarial_scenario.py
CHANGED

@@ -16,6 +16,11 @@ class AdversarialScenario(Enum):
     ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
     ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
     ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
+
+
+class AdversarialScenarioJailbreak(Enum):
+    """Adversarial scenario types for XPIA Jailbreak"""
+
     ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"
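With the XPIA scenario split into its own enum, callers now reference it through the new type; a minimal sketch:

    from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak

    scenario = AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
    print(scenario.value)  # "adv_xpia"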
azure/ai/evaluation/simulator/_adversarial_simulator.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from tqdm import tqdm

+from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client

@@ -21,7 +22,6 @@ from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
 from ._constants import SupportedLanguages
 from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole, ConversationTurn
 from ._conversation._conversation import simulate_conversation
-from ._helpers import experimental
 from ._model_tools import (
     AdversarialTemplateHandler,
     ManagedIdentityAPITokenManager,

@@ -276,6 +276,9 @@ class AdversarialSimulator:
             "target_population",
             "topic",
             "ch_template_placeholder",
+            "chatbot_name",
+            "name",
+            "group",
         ):
             template_parameters.pop(key, None)
         if conversation_category:
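The effect of the expanded scrub list is that internal template keys, now including "chatbot_name", "name", and "group", never reach user-visible template parameters. A standalone sketch with placeholder data:

    # Placeholder template parameters; only non-internal keys survive the scrub.
    template_parameters = {"name": "assistant_bot", "group": "internal", "conversation_starter": "Hello!"}
    for key in (
        "target_population",
        "topic",
        "ch_template_placeholder",
        "chatbot_name",
        "name",
        "group",
    ):
        template_parameters.pop(key, None)
    print(template_parameters)  # {'conversation_starter': 'Hello!'}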
|