azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -0,0 +1,76 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+import re
+from typing import Dict
+
+from promptflow.core import AsyncPrompty
+from typing_extensions import override
+
+from ..._common.utils import construct_prompty_model_config, validate_model_config
+from . import EvaluatorBase
+
+try:
+    from ..._user_agent import USER_AGENT
+except ImportError:
+    USER_AGENT = "None"
+
+
+class PromptyEvaluatorBase(EvaluatorBase[float]):
+    """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
+    make use of a prompty file, and return their results as a dictionary, with a single key-value pair
+    linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
+    per-turn results are stored in a list under the key "evaluation_per_turn").
+
+    :param result_key: The key to use for the result of the evaluation. Single-turn evaluations will return
+        a dictionary in the format {result_key: float}.
+    :type result_key: str
+    :param prompty_file: The path to the prompty file to use for evaluation.
+    :type prompty_file: str
+    :param model_config: The model configuration to use for evaluation.
+    :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+    :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
+        Useful since some evaluators of this format are response-only.
+    :type ignore_queries: bool
+    """
+
+    LLM_CALL_TIMEOUT = 600
+    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
+        self._result_key = result_key
+        self._prompty_file = prompty_file
+        super().__init__(eval_last_turn=eval_last_turn)
+
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(model_config),
+            self.DEFAULT_OPEN_API_VERSION,
+            USER_AGENT,
+        )
+
+        self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+
+    # __call__ is not overridden here because child classes have such varied signatures that there's no point
+    # defining a default here.
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Run the prompty-based evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+        return {self._result_key: float(score)}
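For orientation, here is a minimal sketch of how a concrete evaluator might build on this new base class. The subclass name, prompty filename, and result key are hypothetical, and re-exporting `PromptyEvaluatorBase` from `_evaluators._common` is an assumption (the new `_common/__init__.py` is added in this release but its contents are not shown here); only the keyword arguments come from the diff above.

```python
import os

from typing_extensions import override

# Assumed re-export from the new _evaluators/_common/__init__.py.
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class ExampleScoreEvaluator(PromptyEvaluatorBase):
    """Hypothetical evaluator: loads a prompty file next to this module and
    reports the model's numeric score under a single result key."""

    _PROMPTY_FILE = "example_score.prompty"  # hypothetical prompty file
    _RESULT_KEY = "example_score"

    @override
    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
        )
```

The base class then does the rest: it loads the flow, calls it with the 600-second timeout, and extracts the first digit of the LLM output as the score (falling back to `nan`).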
--- /dev/null
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -0,0 +1,97 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Dict, Optional, Union
+
+from typing_extensions import override
+
+from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
+from . import EvaluatorBase
+
+
+class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
+    """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
+    This includes content safety evaluators, protected material evaluators, and others. These evaluators
+    are all assumed to be of the "query and response or conversation" input variety.
+
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+        aggregation will be performed. If False, all turns will be evaluated and the numeric results will be
+        aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
+        when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
+    :type eval_last_turn: bool
+    """
+
+    @override
+    def __init__(
+        self,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
+        azure_ai_project: dict,
+        credential: TokenCredential,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._eval_metric = eval_metric
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
+        or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
+        :return: The evaluation result.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if query is None or response is None:
+            raise EvaluationException(
+                message="Not implemented",
+                internal_message=(
+                    "Reached query/response evaluation without supplying query or response."
+                    + " This should have failed earlier."
+                ),
+            )
+        return await evaluate_with_rai_service(
+            metric_name=self._eval_metric,
+            query=query,
+            response=response,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
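Concrete subclasses (see the `_hate_unfairness.py` and `_self_harm.py` diffs below) inherit both calling modes from this `__call__`. A hedged usage sketch, assuming the package's top-level re-export of `ViolenceEvaluator`; the project-scope field names follow the `AzureAIProject` shape described in the docstrings and are placeholders here:

```python
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import ViolenceEvaluator

# Placeholder scope; substitute real project values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = ViolenceEvaluator(DefaultAzureCredential(), azure_ai_project)

# Mode 1: a single query/response pair.
result = evaluator(query="What is the capital of France?", response="Paris.")

# Mode 2: a conversation; turns live under "messages", numeric scores are
# aggregated, and per-turn values surface under "evaluation_per_turn".
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
result = evaluator(conversation=conversation)
```

Supplying both a query/response pair and a conversation (or neither) is rejected, and `_do_eval` itself raises if it is somehow reached without both a query and a response.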
--- a/azure/ai/evaluation/_evaluators/_content_safety/__init__.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/__init__.py
@@ -3,8 +3,6 @@
 # ---------------------------------------------------------
 
 from ._content_safety import ContentSafetyEvaluator
-from ._content_safety_base import ContentSafetyEvaluatorBase
-from ._content_safety_chat import ContentSafetyChatEvaluator
 from ._hate_unfairness import HateUnfairnessEvaluator
 from ._self_harm import SelfHarmEvaluator
 from ._sexual import SexualEvaluator
@@ -16,6 +14,4 @@ __all__ = [
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
-    "ContentSafetyEvaluatorBase",
 ]
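One practical consequence for imports: `ContentSafetyChatEvaluator` and `ContentSafetyEvaluatorBase` are no longer re-exported from the `_content_safety` subpackage, so code importing them from here must change. A sketch of what still resolves in 1.0.0b4 (the surviving export list is inferred from the hunks above):

```python
# Still exported from the subpackage in 1.0.0b4:
from azure.ai.evaluation._evaluators._content_safety import (
    ContentSafetyEvaluator,
    HateUnfairnessEvaluator,
    SelfHarmEvaluator,
    SexualEvaluator,
    ViolenceEvaluator,
)

# No longer importable from the subpackage (and _content_safety_base.py
# is deleted outright, per the file list above):
# from azure.ai.evaluation._evaluators._content_safety import ContentSafetyChatEvaluator
# from azure.ai.evaluation._evaluators._content_safety import ContentSafetyEvaluatorBase
```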
--- a/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -2,32 +2,27 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
-try:
-    from ._hate_unfairness import HateUnfairnessEvaluator
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
 
 class ContentSafetyEvaluator:
     """
     Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
     :rtype: Callable
 
@@ -66,13 +61,13 @@ class ContentSafetyEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
+    def __init__(self, credential, azure_ai_project: dict, parallel: bool = True):
         self._parallel = parallel
-        self._evaluators = [
-            ViolenceEvaluator(azure_ai_project, credential),
-            SexualEvaluator(azure_ai_project, credential),
-            SelfHarmEvaluator(azure_ai_project, credential),
-            HateUnfairnessEvaluator(azure_ai_project, credential),
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
     def __call__(self, *, query: str, response: str, **kwargs):
@@ -86,9 +81,9 @@ class ContentSafetyEvaluator:
         :keyword parallel: Whether to evaluate in parallel.
         :paramtype parallel: bool
        :return: The scores for content-safety.
-        :rtype: dict
+        :rtype: Dict[str, Union[str, float]]
         """
-        results = {}
+        results: Dict[str, Union[str, float]] = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 futures = {
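The headline breaking change here is the constructor: `credential` moves from a trailing optional parameter to the leading required one, and is now passed positionally to the four per-harm evaluators it wraps. A hedged migration sketch, assuming the top-level `ContentSafetyEvaluator` re-export and placeholder project values:

```python
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder AzureAIProject scope.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# 1.0.0b2 (old):
# evaluator = ContentSafetyEvaluator(azure_ai_project, credential=DefaultAzureCredential())

# 1.0.0b4 (new): credential comes first and is required.
evaluator = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)

scores = evaluator(query="What is the capital of France?", response="Paris.")
```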
--- a/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
@@ -2,32 +2,36 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+import math
 from concurrent.futures import as_completed
-from typing import Dict, List
+from typing import Callable, Dict, List, TypedDict, Union, cast
 
-import numpy as np
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._common.constants import HarmSeverityLevel
+from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-try:
-    from ._hate_unfairness import HateUnfairnessEvaluator
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
 logger = logging.getLogger(__name__)
 
 
+class _EvaluationPerTurn(TypedDict):
+    severity: List[str]
+    score: List[float]
+    reason: List[str]
+
+
 class ContentSafetyChatEvaluator:
     """
     Initialize a content safety chat evaluator configured to evaluate content safety metrics for chat scenario.
 
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
@@ -37,8 +41,6 @@ class ContentSafetyChatEvaluator:
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
     :type parallel: bool
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
 
@@ -87,24 +89,30 @@ class ContentSafetyChatEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True, credential=None):
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+        parallel: bool = True,
+    ):
         self._eval_last_turn = eval_last_turn
         self._parallel = parallel
-        self._evaluators = [
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(azure_ai_project, credential),
             SexualEvaluator(azure_ai_project, credential),
             SelfHarmEvaluator(azure_ai_project, credential),
             HateUnfairnessEvaluator(azure_ai_project, credential),
         ]
 
-    def __call__(self, *, conversation, **kwargs):
+    def __call__(self, *, conversation: list, **kwargs):
         """
         Evaluates content-safety metrics for "chat" scenario.
 
         :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
         :paramtype conversation: List[Dict]
         :return: The scores for Chat scenario.
-        :rtype: dict
+        :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]
         """
         self._validate_conversation(conversation)
 
@@ -141,7 +149,7 @@ class ContentSafetyChatEvaluator:
                     }
 
                     for future in as_completed(future_to_evaluator):
-                        result = future.result()
+                        result: Dict[str, Union[str, float]] = future.result()
                         current_turn_result.update(result)
                 else:
                     # Sequential execution
@@ -154,7 +162,13 @@ class ContentSafetyChatEvaluator:
         aggregated = self._aggregate_results(per_turn_results)
         return aggregated
 
-    def _evaluate_turn(self, turn_num, queries, responses, evaluator):
+    def _evaluate_turn(
+        self,
+        turn_num: int,
+        queries: List[str],
+        responses: List[str],
+        evaluator: Callable[..., Dict[str, Union[str, float]]],
+    ) -> Dict[str, Union[str, float]]:
         try:
             query = queries[turn_num] if turn_num < len(queries) else ""
             response = responses[turn_num] if turn_num < len(responses) else ""
@@ -171,41 +185,48 @@ class ContentSafetyChatEvaluator:
             )
             return {}
 
-    def _aggregate_results(self, per_turn_results: List[Dict]):
-        scores = {}
-        reasons = {}
-        levels = {}
+    def _aggregate_results(
+        self, per_turn_results: List[Dict[str, Union[str, float]]]
+    ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]:
+        scores: Dict[str, List[float]] = {}
+        reasons: Dict[str, List[str]] = {}
+        levels: Dict[str, List[str]] = {}
 
         for turn in per_turn_results:
             for metric, value in turn.items():
                 if "_score" in metric:
                     if metric not in scores:
                         scores[metric] = []
-                    scores[metric].append(value)
+                    scores[metric].append(cast(float, value))
                 elif "_reason" in metric:
                     if metric not in reasons:
                         reasons[metric] = []
-                    reasons[metric].append(value)
+                    reasons[metric].append(cast(str, value))
                 else:
                     if metric not in levels:
                         levels[metric] = []
-                    levels[metric].append(value)
+                    levels[metric].append(cast(str, value))
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {}
+        evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {}
 
         for metric, values in levels.items():
             score_key = f"{metric}_score"
             reason_key = f"{metric}_reason"
 
-            aggregated_score = np.nanmean(scores[score_key])
-            aggregated[metric] = self._get_harm_severity_level(aggregated_score)
+            aggregated_score = list_mean_nan_safe(scores[score_key])
+            harm_severity_level = self._get_harm_severity_level(aggregated_score)
+            aggregated[metric] = (
+                harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level
+            )
             aggregated[score_key] = aggregated_score
 
             # Prepare per-turn evaluations
-            evaluation_per_turn[metric] = {
-                "severity": values, "score": scores[score_key], "reason": reasons[reason_key]
-            }
+            evaluation_per_turn[metric] = {
+                "severity": values,
+                "score": scores[score_key],
+                "reason": reasons[reason_key],
+            }
 
         aggregated["evaluation_per_turn"] = evaluation_per_turn
 
@@ -283,19 +304,19 @@ class ContentSafetyChatEvaluator:
                     blame=ErrorBlame.USER_ERROR,
                 )
 
-    def _get_harm_severity_level(self, harm_score: float) -> str:
+    def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]:
         HARM_SEVERITY_LEVEL_MAPPING = {
-            "Very low": (0, 1),
-            "Low": (2, 3),
-            "Medium": (4, 5),
-            "High": (6, 7),
+            HarmSeverityLevel.VeryLow: (0, 1),
+            HarmSeverityLevel.Low: (2, 3),
+            HarmSeverityLevel.Medium: (4, 5),
+            HarmSeverityLevel.High: (6, 7),
         }
 
-        if np.isnan(harm_score):
-            return np.nan
+        if math.isnan(harm_score) or harm_score is None:
+            return math.nan
 
         for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
             if harm_score_range[0] <= harm_score <= harm_score_range[1]:
                 return harm_level
 
-        return np.nan
+        return math.nan
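The severity banding at the end of this file is easy to restate in isolation: mean per-turn harm scores on the service's 0-7 scale are bucketed into four named bands, with `nan` passed through unchanged. A standalone sketch of the same logic; the label strings stand in for the `HarmSeverityLevel` enum values, whose exact spellings are not shown in this diff:

```python
import math
from typing import Union

# Bands mirror HARM_SEVERITY_LEVEL_MAPPING above; the labels are assumptions.
SEVERITY_BANDS = {
    "Very low": (0, 1),
    "Low": (2, 3),
    "Medium": (4, 5),
    "High": (6, 7),
}


def harm_severity_label(harm_score: float) -> Union[str, float]:
    """Map a mean harm score (0-7) to a severity band, passing nan through."""
    if math.isnan(harm_score):
        return math.nan
    for label, (low, high) in SEVERITY_BANDS.items():
        if low <= harm_score <= high:
            return label
    return math.nan  # scores between bands (e.g. 1.5) fall through, as above


print(harm_severity_label(2.5))       # Low
print(harm_severity_label(math.nan))  # nan
```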
--- a/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
@@ -1,39 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
-try:
-    from ._content_safety_base import ContentSafetyEvaluatorBase
-except ImportError:
-    from _content_safety_base import ContentSafetyEvaluatorBase
 
-
-class _AsyncHateUnfairnessEvaluator(ContentSafetyEvaluatorBase):
-    def __init__(self, azure_ai_project: dict, credential=None):
-        # Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great.
-        # so invert the name.
-        super().__init__(
-            metric=EvaluationMetrics.HATE_FAIRNESS,
-            azure_ai_project=azure_ai_project,
-            credential=credential,
-        )
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        return await super().__call__(query=query, response=response, **kwargs)
-
-
-class HateUnfairnessEvaluator:
+class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a hate-unfairness evaluator for hate unfairness score.
 
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
 
     **Usage**
 
@@ -58,21 +40,16 @@ class HateUnfairnessEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncHateUnfairnessEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates hate-unfairness content.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The hate-unfairness score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.HATE_FAIRNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
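All four per-harm evaluators get the same treatment as `HateUnfairnessEvaluator` above (the `_sexual.py` and `_violence.py` entries in the file list show identical line counts): the private async wrapper class disappears, `credential` becomes the first required parameter, and an `eval_last_turn` flag arrives via `RaiServiceEvaluatorBase`. A hedged sketch, assuming the top-level re-export and placeholder project values:

```python
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import HateUnfairnessEvaluator

# Placeholder AzureAIProject scope.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# 1.0.0b2 (old): HateUnfairnessEvaluator(azure_ai_project, credential=None)
# 1.0.0b4 (new): credential leads and is required; eval_last_turn is new.
evaluator = HateUnfairnessEvaluator(
    DefaultAzureCredential(),
    azure_ai_project,
    eval_last_turn=True,  # evaluate only the final conversation turn, skipping aggregation
)
```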
--- a/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
@@ -1,37 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
-try:
-    from ._content_safety_base import ContentSafetyEvaluatorBase
-except ImportError:
-    from _content_safety_base import ContentSafetyEvaluatorBase
 
-
-class _AsyncSelfHarmEvaluator(ContentSafetyEvaluatorBase):
-    def __init__(self, azure_ai_project: dict, credential=None):
-        super().__init__(
-            metric=EvaluationMetrics.SELF_HARM,
-            azure_ai_project=azure_ai_project,
-            credential=credential,
-        )
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        return await super().__call__(query=query, response=response, **kwargs)
-
-
-class SelfHarmEvaluator:
+class SelfHarmEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a self harm evaluator for self harm score.
 
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
 
     **Usage**
 
@@ -56,21 +40,16 @@ class SelfHarmEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncSelfHarmEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates self-harm content.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The self-harm score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.SELF_HARM,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )