azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
+++ b/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -8,15 +8,16 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
+
 # cspell:ignore ssrf, vuln
 @experimental
 class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     """
-    Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
+    Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
     where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
 
     The code vulnerability evaluation checks for vulnerabilities in the following coding languages:
-
+
     - Python
     - Java
     - C++
@@ -26,7 +27,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     - SQL
 
     The code vulnerability evaluation identifies the following vulnerabilities:
-
+
     - path-injection
     - sql-injection
     - code-injection
@@ -49,9 +50,9 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
@@ -63,13 +64,13 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START code_vulnerability_evaluator]
         :end-before: [END code_vulnerability_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
             https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
@@ -78,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         for the code vulnerability will be "code_vulnerability_label".
     """
 
-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
@@ -99,7 +107,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         *,
         query: str,
         response: str,
-    ) -> Dict[str, Union[str, float]]:
+    ) -> Dict[str, Union[str, float]]:
         """Evaluate a given query/response pair for code vulnerability
 
         :keyword query: The query to be evaluated.
@@ -116,7 +124,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         *args,
         **kwargs,
     ):
-        """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
+        """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
 
         :keyword query: The query to be evaluated.
         :paramtype query: Optional[str]
@@ -124,5 +132,5 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         :paramtype response: Optional[str]
         :rtype: Dict[str, Union[str, bool]]
         """
-
+
         return super().__call__(*args, **kwargs)
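Net effect of the _code_vulnerability.py hunks: the evaluator moves to the azureai://built-in/evaluators/ id scheme, marks query as optional via _OPTIONAL_PARAMS, defaults evaluate_query to True, and forwards extra keyword arguments to the base class. A minimal usage sketch, assuming azure-identity is installed and a real project endpoint is substituted for the placeholder URL:

# Sketch only: the endpoint is a placeholder; evaluate_query defaults to True for this evaluator.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

evaluator = CodeVulnerabilityEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
)
result = evaluator(
    query="def read_file(path):",
    response="    return open('/tmp/' + path).read()",
)
print(result)  # includes code_vulnerability_label, per the docstring note above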
--- a/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
+++ b/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -32,17 +32,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START coherence_evaluator]
         :end-before: [END coherence_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+        :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
             https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:
 
     .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -62,7 +62,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
@@ -76,7 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )
 
     @overload
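CoherenceEvaluator's local behavior is unchanged by these hunks; only the cloud id and a trailing comma move. A local-use sketch, assuming an Azure OpenAI model configuration (endpoint, deployment, and key below are placeholders):

from azure.ai.evaluation import CoherenceEvaluator

# Placeholder configuration; substitute real Azure OpenAI values.
model_config = {
    "azure_endpoint": "https://{resource_name}.openai.azure.com",
    "azure_deployment": "{deployment_name}",
    "api_key": "{api_key}",
}

coherence = CoherenceEvaluator(model_config=model_config, threshold=3)
print(CoherenceEvaluator.id)  # azureai://built-in/evaluators/coherence
result = coherence(query="What is Azure?", response="Azure is Microsoft's cloud platform.")
print(result["coherence"], result["coherence_result"])  # score plus pass/fail, per the base-class hunks below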
--- a/azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,14 +4,34 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
 
 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
@@ -176,7 +196,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons.extend([p for p in params if p != "self"])
         return singletons
 
-    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -235,7 +257,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
-    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,7 +312,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return multi_modal_converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
@@ -416,12 +440,12 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             threshold_key = f"{base_key}_threshold"
             result[threshold_key] = self._threshold
             if self._higher_is_better:
-                if score_value >= self._threshold:
+                if float(score_value) >= self._threshold:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                 else:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
             else:
-                if score_value <= self._threshold:
+                if float(score_value) <= self._threshold:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                 else:
                     result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
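The rewritten comparisons add a float() coercion: scores can come back from the model or service as strings, so the threshold check now compares a parsed value. The pass/fail gate in isolation (a standalone sketch, not the SDK's actual helper; EVALUATION_PASS_FAIL_MAPPING is assumed to map True/False to "pass"/"fail" as in azure.ai.evaluation._constants):

EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}

def binary_result(score_value, threshold, higher_is_better):
    # float() guards against scores that arrive as strings.
    if higher_is_better:
        passed = float(score_value) >= threshold
    else:
        passed = float(score_value) <= threshold
    return EVALUATION_PASS_FAIL_MAPPING[passed]

assert binary_result("4", 3, higher_is_better=True) == "pass"
assert binary_result(4, 3, higher_is_better=False) == "fail"

The final _base_eval.py hunk rewrites AsyncEvaluatorBase.__call__ to enumerate every supported input explicitly: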
@@ -498,9 +522,19 @@ class AsyncEvaluatorBase:
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
-        self,
-
-
+        self,
+        *,
+        query=None,
+        response=None,
+        context=None,
+        conversation=None,
+        ground_truth=None,
+        tool_calls=None,
+        tool_definitions=None,
+        messages=None,
+        retrieval_ground_truth=None,
+        retrieved_documents=None,
+        **kwargs,
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
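The widened keyword list lets the wrapper accept agent-style inputs (tool calls, tool definitions, messages, retrieval fields) rather than relying on bare keyword splatting alone. The folding pattern it uses reduces to a small sketch (subset of names, not the SDK's actual wrapper):

import asyncio

async def call(*, query=None, response=None, conversation=None, **kwargs):
    # Each named input is folded back into kwargs only when supplied, so the wrapped
    # evaluator always receives one uniform dict of inputs.
    for name, value in (("query", query), ("response", response), ("conversation", conversation)):
        if value is not None:
            kwargs[name] = value
    return kwargs

print(asyncio.run(call(query="hi", response="hello")))  # {'query': 'hi', 'response': 'hello'}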
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -20,9 +20,14 @@ from ..._common.utils import construct_prompty_model_config, validate_model_config
 from . import EvaluatorBase
 
 try:
-    from ..._user_agent import USER_AGENT
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = "None"
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
+
 
 T = TypeVar("T")
 
@@ -50,8 +55,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(
-
+    def __init__(
+        self,
+        *,
+        result_key: str,
+        prompty_file: str,
+        model_config: dict,
+        eval_last_turn: bool = False,
+        threshold: int = 3,
+        _higher_is_better: bool = False,
+        **kwargs,
+    ) -> None:
         self._result_key = result_key
         self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
@@ -60,15 +74,16 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
 
         subclass_name = self.__class__.__name__
-        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
+        user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(
-
+        self._flow = AsyncPrompty.load(
+            source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+        )
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
@@ -132,7 +147,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             score = float(match.group())
             binary_result = self._get_binary_result(score)
             return {
-                self._result_key: float(score),
+                self._result_key: float(score),
                 f"gpt_{self._result_key}": float(score),
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
@@ -140,7 +155,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
 
             binary_result = self._get_binary_result(score)
             return {
-                self._result_key: float(score),
+                self._result_key: float(score),
                 f"gpt_{self._result_key}": float(score),
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
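With __init__ now keyword-only, subclasses spell out their prompty assets when calling super(). A hypothetical subclass sketch (the .prompty file and result key are invented for illustration, and the _evaluators._common import path is private and may change):

import os
from typing_extensions import override
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

class MyQualityEvaluator(PromptyEvaluatorBase[float]):
    _PROMPTY_FILE = "my_quality.prompty"  # hypothetical asset shipped next to this module
    _RESULT_KEY = "my_quality"

    @override
    def __init__(self, model_config, *, threshold: int = 3):
        prompty_path = os.path.join(os.path.dirname(__file__), self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
            threshold=threshold,
            _higher_is_better=True,  # quality scores pass when at or above threshold
        )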
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """
 
     @override
@@ -56,12 +59,21 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
-        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type, threshold=threshold, _higher_is_better=_higher_is_better)
+        super().__init__(
+            eval_last_turn=eval_last_turn,
+            conversation_aggregation_type=conversation_aggregation_type,
+            threshold=threshold,
+            _higher_is_better=_higher_is_better,
+        )
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better
 
     @override
@@ -96,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "query" in eval_input and "response" in eval_input:
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)
 
         conversation = eval_input.get("conversation", None)
@@ -126,17 +138,20 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         return result
 
     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query =
-        response =
-        if
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)
 
         if "context" in self._singleton_inputs:
             context = eval_input.get("context", None)
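The rewritten _evaluate_query_response drops the hard requirement on query: only the response is mandatory, and the query is attached to the service payload only when the evaluator was built with evaluate_query=True. The payload rule in isolation (a sketch, not the SDK's method):

def build_input_data(eval_input: dict, evaluate_query: bool) -> dict:
    # Response is mandatory; the query rides along only when evaluate_query is set.
    response = eval_input.get("response", None)
    if response is None:
        raise ValueError("response is required for query/response evaluation")
    input_data = {"response": str(response)}
    query = eval_input.get("query", None)
    if query is not None and evaluate_query:
        input_data["query"] = str(query)
    return input_data

assert build_input_data({"query": "q", "response": "r"}, False) == {"response": "r"}
assert build_input_data({"query": "q", "response": "r"}, True) == {"response": "r", "query": "q"}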
--- a/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -22,9 +22,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param violence_threshold: The threshold for violence evaluation. Default is 3.
     :type violence_threshold: int
     :param sexual_threshold: The threshold for sexual content evaluation. Default is 3.
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -45,19 +47,19 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START content_safety_evaluator]
         :end-before: [END content_safety_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
             https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
-
+
     .. admonition:: Example with Threshold:
-
+
     .. literalinclude:: ../samples/evaluation_samples_threshold.py
         :start-after: [START threshold_content_safety_evaluator]
         :end-before: [END threshold_content_safety_evaluator]
@@ -66,19 +68,20 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     def __init__(
-        self,
-        credential,
+        self,
+        credential,
         azure_ai_project,
-        *,
+        *,
         violence_threshold: int = 3,
         sexual_threshold: int = 3,
         self_harm_threshold: int = 3,
         hate_unfairness_threshold: int = 3,
-        **kwargs
+        **kwargs,
     ):
         # Type checking
         for name, value in [
@@ -89,12 +92,19 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
         ]:
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")
-
+
+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
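Because the composite forwards evaluate_query to all four child evaluators, one flag controls what every sub-metric sees. A usage sketch (endpoint is a placeholder; azure-identity assumed):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    violence_threshold=3,
    evaluate_query=True,  # forwarded to Violence/Sexual/SelfHarm/HateUnfairness alike
)
result = safety(query="How do I make a weapon?", response="I can't help with that.")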
--- a/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py
@@ -45,9 +45,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the HateUnfairness evaluator. Default is 3.
     :type threshold: int
 
@@ -59,17 +59,17 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START hate_unfairness_evaluator]
        :end-before: [END hate_unfairness_evaluator]
        :language: python
        :dedent: 8
-        :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+        :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
-
+
     .. admonition:: Example with Threshold:
 
     .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/hate_unfairness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload
--- a/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py
@@ -39,9 +39,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :param threshold: The threshold for the SelfHarm evaluator. Default is 3.
     :type threshold: int
 
@@ -53,20 +53,21 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
     .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
         :start-after: [START self_harm_evaluator]
         :end-before: [END self_harm_evaluator]
         :language: python
         :dedent: 8
-        :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
-            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
+        :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/self_harm"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload