azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -32,6 +32,7 @@ from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
|
|
|
32
32
|
from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
|
|
33
33
|
from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
|
|
34
34
|
from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
|
|
35
|
+
from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
|
|
35
36
|
from ._model_configurations import (
|
|
36
37
|
AzureAIProject,
|
|
37
38
|
AzureOpenAIModelConfiguration,
|
|
@@ -131,6 +132,7 @@ __all__ = [
|
|
|
131
132
|
"CodeVulnerabilityEvaluator",
|
|
132
133
|
"UngroundedAttributesEvaluator",
|
|
133
134
|
"ToolCallAccuracyEvaluator",
|
|
135
|
+
"_ToolOutputUtilizationEvaluator",
|
|
134
136
|
"AzureOpenAIGrader",
|
|
135
137
|
"AzureOpenAILabelGrader",
|
|
136
138
|
"AzureOpenAIStringCheckGrader",
|
|
@@ -1,19 +1,26 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
|
|
5
|
+
|
|
6
|
+
from typing_extensions import TypeIs
|
|
5
7
|
|
|
6
|
-
from azure.ai.evaluation.
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION, TokenScope
|
|
7
10
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
11
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
8
12
|
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
9
|
-
from
|
|
10
|
-
|
|
13
|
+
from azure.core.credentials import TokenCredential
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from openai.lib.azure import AzureADTokenProvider
|
|
11
17
|
|
|
12
18
|
|
|
13
19
|
@experimental
|
|
14
20
|
class AzureOpenAIGrader:
|
|
15
|
-
"""
|
|
16
|
-
|
|
21
|
+
"""Base class for Azure OpenAI grader wrappers.
|
|
22
|
+
|
|
23
|
+
Recommended only for use by experienced OpenAI API users.
|
|
17
24
|
Combines a model configuration and any grader configuration
|
|
18
25
|
into a singular object that can be used in evaluations.
|
|
19
26
|
|
|
@@ -22,18 +29,16 @@ class AzureOpenAIGrader:
|
|
|
22
29
|
evaluation results.
|
|
23
30
|
|
|
24
31
|
:param model_config: The model configuration to use for the grader.
|
|
25
|
-
:type model_config: Union[
|
|
26
|
-
~azure.ai.evaluation.
|
|
27
|
-
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
28
|
-
]
|
|
32
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
33
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
29
34
|
:param grader_config: The grader configuration to use for the grader. This is expected
|
|
30
35
|
to be formatted as a dictionary that matches the specifications of the sub-types of
|
|
31
|
-
the TestingCriterion alias specified in
|
|
36
|
+
the TestingCriterion alias specified in `OpenAI's SDK <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151>`_.
|
|
32
37
|
:type grader_config: Dict[str, Any]
|
|
38
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
39
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
33
40
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
34
41
|
:type kwargs: Any
|
|
35
|
-
|
|
36
|
-
|
|
37
42
|
"""
|
|
38
43
|
|
|
39
44
|
id = "azureai://built-in/evaluators/azure-openai/custom_grader"
|
|
@@ -43,10 +48,12 @@ class AzureOpenAIGrader:
|
|
|
43
48
|
*,
|
|
44
49
|
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
45
50
|
grader_config: Dict[str, Any],
|
|
51
|
+
credential: Optional[TokenCredential] = None,
|
|
46
52
|
**kwargs: Any,
|
|
47
53
|
):
|
|
48
54
|
self._model_config = model_config
|
|
49
55
|
self._grader_config = grader_config
|
|
56
|
+
self._credential = credential
|
|
50
57
|
|
|
51
58
|
if kwargs.get("validate", True):
|
|
52
59
|
self._validate_model_config()
|
|
@@ -54,20 +61,39 @@ class AzureOpenAIGrader:
|
|
|
54
61
|
|
|
55
62
|
def _validate_model_config(self) -> None:
|
|
56
63
|
"""Validate the model configuration that this grader wrapper is using."""
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
64
|
+
msg = None
|
|
65
|
+
if self._is_azure_model_config(self._model_config):
|
|
66
|
+
if not any(auth for auth in (self._model_config.get("api_key"), self._credential)):
|
|
67
|
+
msg = (
|
|
68
|
+
f"{type(self).__name__}: Requires an api_key in the supplied model_config, "
|
|
69
|
+
+ "or providing a credential to the grader's __init__ method. "
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
else:
|
|
73
|
+
if "api_key" not in self._model_config or not self._model_config.get("api_key"):
|
|
74
|
+
msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
|
|
75
|
+
|
|
76
|
+
if msg is None:
|
|
77
|
+
return
|
|
78
|
+
|
|
79
|
+
raise EvaluationException(
|
|
80
|
+
message=msg,
|
|
81
|
+
blame=ErrorBlame.USER_ERROR,
|
|
82
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
83
|
+
target=ErrorTarget.AOAI_GRADER,
|
|
84
|
+
)
|
|
65
85
|
|
|
66
86
|
def _validate_grader_config(self) -> None:
|
|
67
87
|
"""Validate the grader configuration that this grader wrapper is using."""
|
|
68
88
|
|
|
69
89
|
return
|
|
70
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def _is_azure_model_config(
|
|
93
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
94
|
+
) -> TypeIs[AzureOpenAIModelConfiguration]:
|
|
95
|
+
return "azure_endpoint" in model_config
|
|
96
|
+
|
|
71
97
|
def get_client(self) -> Any:
|
|
72
98
|
"""Construct an appropriate OpenAI client using this grader's model configuration.
|
|
73
99
|
Returns a slightly different client depending on whether or not this grader's model
|
|
@@ -77,23 +103,38 @@ class AzureOpenAIGrader:
|
|
|
77
103
|
:rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
|
|
78
104
|
"""
|
|
79
105
|
default_headers = {"User-Agent": UserAgentSingleton().value}
|
|
80
|
-
|
|
106
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] = self._model_config
|
|
107
|
+
api_key: Optional[str] = model_config.get("api_key")
|
|
108
|
+
|
|
109
|
+
if self._is_azure_model_config(model_config):
|
|
81
110
|
from openai import AzureOpenAI
|
|
82
111
|
|
|
83
112
|
# TODO set default values?
|
|
84
113
|
return AzureOpenAI(
|
|
85
|
-
azure_endpoint=
|
|
86
|
-
api_key=
|
|
114
|
+
azure_endpoint=model_config["azure_endpoint"],
|
|
115
|
+
api_key=api_key, # Default-style access to appease linters.
|
|
87
116
|
api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
|
|
88
|
-
azure_deployment=
|
|
117
|
+
azure_deployment=model_config.get("azure_deployment", ""),
|
|
118
|
+
azure_ad_token_provider=self._get_token_provider(self._credential) if not api_key else None,
|
|
89
119
|
default_headers=default_headers,
|
|
90
120
|
)
|
|
91
121
|
from openai import OpenAI
|
|
92
122
|
|
|
93
123
|
# TODO add default values for base_url and organization?
|
|
94
124
|
return OpenAI(
|
|
95
|
-
api_key=
|
|
96
|
-
base_url=
|
|
97
|
-
organization=
|
|
125
|
+
api_key=api_key,
|
|
126
|
+
base_url=model_config.get("base_url", ""),
|
|
127
|
+
organization=model_config.get("organization", ""),
|
|
98
128
|
default_headers=default_headers,
|
|
99
129
|
)
|
|
130
|
+
|
|
131
|
+
@staticmethod
|
|
132
|
+
def _get_token_provider(cred: TokenCredential) -> "AzureADTokenProvider":
|
|
133
|
+
"""Get the token provider the AzureOpenAI client.
|
|
134
|
+
|
|
135
|
+
:param TokenCredential cred: The Azure authentication credential.
|
|
136
|
+
:return: The token provider if a credential is provided, otherwise None.
|
|
137
|
+
:rtype: openai.lib.azure.AzureADTokenProvider
|
|
138
|
+
"""
|
|
139
|
+
|
|
140
|
+
return lambda: cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT).token
|
|
@@ -1,32 +1,31 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from typing import Any, Dict,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
6
|
from openai.types.graders import LabelModelGrader
|
|
7
|
+
|
|
8
8
|
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
9
11
|
|
|
10
12
|
from .aoai_grader import AzureOpenAIGrader
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@experimental
|
|
14
16
|
class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
15
|
-
"""
|
|
16
|
-
Wrapper class for OpenAI's label model graders.
|
|
17
|
+
"""Wrapper class for OpenAI's label model graders.
|
|
17
18
|
|
|
18
19
|
Supplying a LabelGrader to the `evaluate` method will cause an asynchronous request to evaluate
|
|
19
20
|
the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
|
|
20
21
|
evaluation results.
|
|
21
22
|
|
|
22
23
|
:param model_config: The model configuration to use for the grader.
|
|
23
|
-
:type model_config: Union[
|
|
24
|
-
~azure.ai.evaluation.
|
|
25
|
-
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
26
|
-
]
|
|
24
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
25
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
27
26
|
:param input: The list of label-based testing criterion for this grader. Individual
|
|
28
27
|
values of this list are expected to be dictionaries that match the format of any of the valid
|
|
29
|
-
|
|
28
|
+
`TestingCriterionLabelModelInput <https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L125C1-L125C32>`_
|
|
30
29
|
subtypes.
|
|
31
30
|
:type input: List[Dict[str, str]]
|
|
32
31
|
:param labels: A list of strings representing the classification labels of this grader.
|
|
@@ -37,13 +36,14 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
|
37
36
|
:type name: str
|
|
38
37
|
:param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
|
|
39
38
|
:type passing_labels: List[str]
|
|
39
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
40
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
40
41
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
41
42
|
:type kwargs: Any
|
|
42
|
-
|
|
43
|
-
|
|
44
43
|
"""
|
|
45
44
|
|
|
46
45
|
id = "azureai://built-in/evaluators/azure-openai/label_grader"
|
|
46
|
+
_type = "label_model"
|
|
47
47
|
|
|
48
48
|
def __init__(
|
|
49
49
|
self,
|
|
@@ -54,6 +54,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
|
54
54
|
model: str,
|
|
55
55
|
name: str,
|
|
56
56
|
passing_labels: List[str],
|
|
57
|
+
credential: Optional[TokenCredential] = None,
|
|
57
58
|
**kwargs: Any
|
|
58
59
|
):
|
|
59
60
|
grader = LabelModelGrader(
|
|
@@ -62,6 +63,6 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
|
62
63
|
model=model,
|
|
63
64
|
name=name,
|
|
64
65
|
passing_labels=passing_labels,
|
|
65
|
-
type=
|
|
66
|
+
type=AzureOpenAILabelGrader._type,
|
|
66
67
|
)
|
|
67
|
-
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
68
|
+
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from typing import Any, Dict,
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
5
5
|
|
|
6
|
-
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
6
|
from openai.types.graders import PythonGrader
|
|
7
|
+
|
|
8
8
|
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
9
11
|
|
|
10
12
|
from .aoai_grader import AzureOpenAIGrader
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@experimental
|
|
14
16
|
class AzureOpenAIPythonGrader(AzureOpenAIGrader):
|
|
15
|
-
"""
|
|
16
|
-
Wrapper class for OpenAI's Python code graders.
|
|
17
|
+
"""Wrapper class for OpenAI's Python code graders.
|
|
17
18
|
|
|
18
19
|
Enables custom Python-based evaluation logic with flexible scoring and
|
|
19
20
|
pass/fail thresholds. The grader executes user-provided Python code
|
|
@@ -25,20 +26,19 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
|
|
|
25
26
|
evaluation results.
|
|
26
27
|
|
|
27
28
|
:param model_config: The model configuration to use for the grader.
|
|
28
|
-
:type model_config: Union[
|
|
29
|
-
~azure.ai.evaluation.
|
|
30
|
-
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
31
|
-
]
|
|
29
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
30
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
32
31
|
:param name: The name of the grader.
|
|
33
32
|
:type name: str
|
|
34
33
|
:param image_tag: The image tag for the Python execution environment.
|
|
35
34
|
:type image_tag: str
|
|
36
|
-
:param pass_threshold: Score threshold for pass/fail classification.
|
|
37
|
-
Scores >= threshold are considered passing.
|
|
35
|
+
:param pass_threshold: Score threshold for pass/fail classification. Scores >= threshold are considered passing.
|
|
38
36
|
:type pass_threshold: float
|
|
39
37
|
:param source: Python source code containing the grade function.
|
|
40
38
|
Must define: def grade(sample: dict, item: dict) -> float
|
|
41
39
|
:type source: str
|
|
40
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
41
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
42
42
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
43
43
|
:type kwargs: Any
|
|
44
44
|
|
|
@@ -54,15 +54,17 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
|
|
|
54
54
|
"""
|
|
55
55
|
|
|
56
56
|
id = "azureai://built-in/evaluators/azure-openai/python_grader"
|
|
57
|
+
_type = "python"
|
|
57
58
|
|
|
58
59
|
def __init__(
|
|
59
60
|
self,
|
|
60
61
|
*,
|
|
61
62
|
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
62
63
|
name: str,
|
|
63
|
-
image_tag: str,
|
|
64
64
|
pass_threshold: float,
|
|
65
65
|
source: str,
|
|
66
|
+
image_tag: Optional[str] = None,
|
|
67
|
+
credential: Optional[TokenCredential] = None,
|
|
66
68
|
**kwargs: Any,
|
|
67
69
|
):
|
|
68
70
|
# Validate pass_threshold
|
|
@@ -78,7 +80,7 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
|
|
|
78
80
|
image_tag=image_tag,
|
|
79
81
|
pass_threshold=pass_threshold,
|
|
80
82
|
source=source,
|
|
81
|
-
type=
|
|
83
|
+
type=AzureOpenAIPythonGrader._type,
|
|
82
84
|
)
|
|
83
85
|
|
|
84
|
-
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
86
|
+
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from typing import Any, Dict,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
6
|
from openai.types.graders import ScoreModelGrader
|
|
7
|
+
|
|
8
8
|
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
10
|
+
from azure.core.credentials import TokenCredential
|
|
9
11
|
|
|
10
12
|
from .aoai_grader import AzureOpenAIGrader
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@experimental
|
|
14
16
|
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
15
|
-
"""
|
|
16
|
-
Wrapper class for OpenAI's score model graders.
|
|
17
|
+
"""Wrapper class for OpenAI's score model graders.
|
|
17
18
|
|
|
18
19
|
Enables continuous scoring evaluation with custom prompts and flexible
|
|
19
20
|
conversation-style inputs. Supports configurable score ranges and
|
|
@@ -25,10 +26,8 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
|
25
26
|
evaluation results.
|
|
26
27
|
|
|
27
28
|
:param model_config: The model configuration to use for the grader.
|
|
28
|
-
:type model_config: Union[
|
|
29
|
-
~azure.ai.evaluation.
|
|
30
|
-
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
31
|
-
]
|
|
29
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
30
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
32
31
|
:param input: The input messages for the grader. List of conversation
|
|
33
32
|
messages with role and content.
|
|
34
33
|
:type input: List[Dict[str, str]]
|
|
@@ -43,11 +42,14 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
|
43
42
|
:type pass_threshold: Optional[float]
|
|
44
43
|
:param sampling_params: The sampling parameters for the model.
|
|
45
44
|
:type sampling_params: Optional[Dict[str, Any]]
|
|
45
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
46
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
46
47
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
47
48
|
:type kwargs: Any
|
|
48
49
|
"""
|
|
49
50
|
|
|
50
51
|
id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
|
|
52
|
+
_type = "score_model"
|
|
51
53
|
|
|
52
54
|
def __init__(
|
|
53
55
|
self,
|
|
@@ -59,6 +61,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
|
59
61
|
range: Optional[List[float]] = None,
|
|
60
62
|
pass_threshold: Optional[float] = None,
|
|
61
63
|
sampling_params: Optional[Dict[str, Any]] = None,
|
|
64
|
+
credential: Optional[TokenCredential] = None,
|
|
62
65
|
**kwargs: Any,
|
|
63
66
|
):
|
|
64
67
|
# Validate range and pass_threshold
|
|
@@ -78,7 +81,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
|
78
81
|
self.pass_threshold = pass_threshold
|
|
79
82
|
|
|
80
83
|
# Create OpenAI ScoreModelGrader instance
|
|
81
|
-
grader_kwargs = {"input": input, "model": model, "name": name, "type":
|
|
84
|
+
grader_kwargs = {"input": input, "model": model, "name": name, "type": AzureOpenAIScoreModelGrader._type}
|
|
82
85
|
|
|
83
86
|
if range is not None:
|
|
84
87
|
grader_kwargs["range"] = range
|
|
@@ -88,4 +91,4 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
|
88
91
|
|
|
89
92
|
grader = ScoreModelGrader(**grader_kwargs)
|
|
90
93
|
|
|
91
|
-
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
94
|
+
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
|
@@ -1,30 +1,28 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from typing import Any, Dict, Union
|
|
5
|
-
from typing_extensions import Literal
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
6
5
|
|
|
7
|
-
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
8
6
|
from openai.types.graders import StringCheckGrader
|
|
7
|
+
from typing_extensions import Literal
|
|
8
|
+
|
|
9
9
|
from azure.ai.evaluation._common._experimental import experimental
|
|
10
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
11
|
+
from azure.core.credentials import TokenCredential
|
|
10
12
|
|
|
11
13
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
@experimental
|
|
15
17
|
class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
16
|
-
"""
|
|
17
|
-
Wrapper class for OpenAI's string check graders.
|
|
18
|
+
"""Wrapper class for OpenAI's string check graders.
|
|
18
19
|
|
|
19
20
|
Supplying a StringCheckGrader to the `evaluate` method will cause an asynchronous request to evaluate
|
|
20
21
|
the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
|
|
21
22
|
evaluation results.
|
|
22
23
|
|
|
23
24
|
:param model_config: The model configuration to use for the grader.
|
|
24
|
-
:type model_config: Union[
|
|
25
|
-
~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
26
|
-
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
27
|
-
]
|
|
25
|
+
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
28
26
|
:param input: The input text. This may include template strings.
|
|
29
27
|
:type input: str
|
|
30
28
|
:param name: The name of the grader.
|
|
@@ -33,13 +31,14 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
|
33
31
|
:type operation: Literal["eq", "ne", "like", "ilike"]
|
|
34
32
|
:param reference: The reference text. This may include template strings.
|
|
35
33
|
:type reference: str
|
|
34
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
35
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
36
36
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
37
37
|
:type kwargs: Any
|
|
38
|
-
|
|
39
|
-
|
|
40
38
|
"""
|
|
41
39
|
|
|
42
40
|
id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
|
|
41
|
+
_type = "string_check"
|
|
43
42
|
|
|
44
43
|
def __init__(
|
|
45
44
|
self,
|
|
@@ -54,6 +53,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
|
54
53
|
"ilike",
|
|
55
54
|
],
|
|
56
55
|
reference: str,
|
|
56
|
+
credential: Optional[TokenCredential] = None,
|
|
57
57
|
**kwargs: Any
|
|
58
58
|
):
|
|
59
59
|
grader = StringCheckGrader(
|
|
@@ -61,6 +61,6 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
|
61
61
|
name=name,
|
|
62
62
|
operation=operation,
|
|
63
63
|
reference=reference,
|
|
64
|
-
type=
|
|
64
|
+
type=AzureOpenAIStringCheckGrader._type,
|
|
65
65
|
)
|
|
66
|
-
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
66
|
+
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
|
@@ -1,20 +1,21 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from typing import Any, Dict, Union
|
|
5
|
-
from typing_extensions import Literal
|
|
4
|
+
from typing import Any, Dict, Optional, Union
|
|
6
5
|
|
|
7
|
-
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
8
6
|
from openai.types.graders import TextSimilarityGrader
|
|
7
|
+
from typing_extensions import Literal
|
|
8
|
+
|
|
9
9
|
from azure.ai.evaluation._common._experimental import experimental
|
|
10
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
11
|
+
from azure.core.credentials import TokenCredential
|
|
10
12
|
|
|
11
13
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
@experimental
|
|
15
17
|
class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
16
|
-
"""
|
|
17
|
-
Wrapper class for OpenAI's string check graders.
|
|
18
|
+
"""Wrapper class for OpenAI's string check graders.
|
|
18
19
|
|
|
19
20
|
Supplying a StringCheckGrader to the `evaluate` method will cause an asynchronous request to evaluate
|
|
20
21
|
the grader via the OpenAI API. The results of the evaluation will then be merged into the standard
|
|
@@ -22,23 +23,11 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
22
23
|
|
|
23
24
|
:param model_config: The model configuration to use for the grader.
|
|
24
25
|
:type model_config: Union[
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
]
|
|
26
|
+
~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
27
|
+
~azure.ai.evaluation.OpenAIModelConfiguration]
|
|
28
28
|
:param evaluation_metric: The evaluation metric to use.
|
|
29
|
-
:type evaluation_metric: Literal[
|
|
30
|
-
|
|
31
|
-
"bleu",
|
|
32
|
-
"gleu",
|
|
33
|
-
"meteor",
|
|
34
|
-
"rouge_1",
|
|
35
|
-
"rouge_2",
|
|
36
|
-
"rouge_3",
|
|
37
|
-
"rouge_4",
|
|
38
|
-
"rouge_5",
|
|
39
|
-
"rouge_l",
|
|
40
|
-
"cosine",
|
|
41
|
-
]
|
|
29
|
+
:type evaluation_metric: Literal["fuzzy_match", "bleu", "gleu", "meteor", "rouge_1", "rouge_2", "rouge_3",
|
|
30
|
+
"rouge_4", "rouge_5", "rouge_l", "cosine"]
|
|
42
31
|
:param input: The text being graded.
|
|
43
32
|
:type input: str
|
|
44
33
|
:param pass_threshold: A float score where a value greater than or equal indicates a passing grade.
|
|
@@ -47,13 +36,14 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
47
36
|
:type reference: str
|
|
48
37
|
:param name: The name of the grader.
|
|
49
38
|
:type name: str
|
|
39
|
+
:param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
|
|
40
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
50
41
|
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
51
42
|
:type kwargs: Any
|
|
52
|
-
|
|
53
|
-
|
|
54
43
|
"""
|
|
55
44
|
|
|
56
45
|
id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
|
|
46
|
+
_type = "text_similarity"
|
|
57
47
|
|
|
58
48
|
def __init__(
|
|
59
49
|
self,
|
|
@@ -76,6 +66,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
76
66
|
pass_threshold: float,
|
|
77
67
|
reference: str,
|
|
78
68
|
name: str,
|
|
69
|
+
credential: Optional[TokenCredential] = None,
|
|
79
70
|
**kwargs: Any
|
|
80
71
|
):
|
|
81
72
|
grader = TextSimilarityGrader(
|
|
@@ -84,6 +75,6 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
84
75
|
pass_threshold=pass_threshold,
|
|
85
76
|
name=name,
|
|
86
77
|
reference=reference,
|
|
87
|
-
type=
|
|
78
|
+
type=AzureOpenAITextSimilarityGrader._type,
|
|
88
79
|
)
|
|
89
|
-
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
80
|
+
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# that would have otherwise been a relative import scoped to single evaluator directories.
|
|
7
7
|
|
|
8
8
|
from . import constants
|
|
9
|
-
from .rai_service import evaluate_with_rai_service
|
|
9
|
+
from .rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
|
|
10
10
|
from .utils import get_harm_severity_level
|
|
11
11
|
from .evaluation_onedp_client import EvaluationServiceOneDPClient
|
|
12
12
|
from .onedp.models import EvaluationUpload, EvaluationResult, RedTeamUpload, ResultType
|
|
@@ -14,6 +14,7 @@ from .onedp.models import EvaluationUpload, EvaluationResult, RedTeamUpload, Res
|
|
|
14
14
|
__all__ = [
|
|
15
15
|
"get_harm_severity_level",
|
|
16
16
|
"evaluate_with_rai_service",
|
|
17
|
+
"evaluate_with_rai_service_sync",
|
|
17
18
|
"constants",
|
|
18
19
|
"EvaluationServiceOneDPClient",
|
|
19
20
|
"EvaluationResult",
|