azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1286 -739
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +2 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -45,6 +45,7 @@ from ._aoai.aoai_grader import AzureOpenAIGrader
|
|
|
45
45
|
from ._aoai.label_grader import AzureOpenAILabelGrader
|
|
46
46
|
from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
|
|
47
47
|
from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
|
|
48
|
+
from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
|
|
48
49
|
|
|
49
50
|
|
|
50
51
|
_patch_all = []
|
|
@@ -54,10 +55,19 @@ _patch_all = []
|
|
|
54
55
|
# in ai.projects. So we only import it if it's available and the user has ai.projects.
|
|
55
56
|
try:
|
|
56
57
|
from ._converters._ai_services import AIAgentConverter
|
|
58
|
+
|
|
57
59
|
_patch_all.append("AIAgentConverter")
|
|
58
60
|
except ImportError:
|
|
59
|
-
print(
|
|
61
|
+
print(
|
|
62
|
+
"[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`."
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
from ._converters._sk_services import SKAgentConverter
|
|
60
67
|
|
|
68
|
+
_patch_all.append("SKAgentConverter")
|
|
69
|
+
except ImportError:
|
|
70
|
+
print("[INFO] Could not import SKAgentConverter. Please install the dependency with `pip install semantic-kernel`.")
|
|
61
71
|
|
|
62
72
|
__all__ = [
|
|
63
73
|
"evaluate",
|
|
@@ -99,6 +109,7 @@ __all__ = [
|
|
|
99
109
|
"AzureOpenAILabelGrader",
|
|
100
110
|
"AzureOpenAIStringCheckGrader",
|
|
101
111
|
"AzureOpenAITextSimilarityGrader",
|
|
112
|
+
"AzureOpenAIScoreModelGrader",
|
|
102
113
|
]
|
|
103
114
|
|
|
104
|
-
__all__.extend([p for p in _patch_all if p not in __all__])
|
|
115
|
+
__all__.extend([p for p in _patch_all if p not in __all__])
|
|
@@ -5,12 +5,13 @@ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfigurat
|
|
|
5
5
|
|
|
6
6
|
from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
|
|
7
7
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
8
|
+
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
8
9
|
from typing import Any, Dict, Union
|
|
9
10
|
from azure.ai.evaluation._common._experimental import experimental
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
@experimental
|
|
13
|
-
class AzureOpenAIGrader
|
|
14
|
+
class AzureOpenAIGrader:
|
|
14
15
|
"""
|
|
15
16
|
Base class for Azure OpenAI grader wrappers, recommended only for use by experienced OpenAI API users.
|
|
16
17
|
Combines a model configuration and any grader configuration
|
|
@@ -35,9 +36,15 @@ class AzureOpenAIGrader():
|
|
|
35
36
|
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
id = "
|
|
39
|
+
id = "azureai://built-in/evaluators/azure-openai/custom_grader"
|
|
39
40
|
|
|
40
|
-
def __init__(
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
*,
|
|
44
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
45
|
+
grader_config: Dict[str, Any],
|
|
46
|
+
**kwargs: Any,
|
|
47
|
+
):
|
|
41
48
|
self._model_config = model_config
|
|
42
49
|
self._grader_config = grader_config
|
|
43
50
|
|
|
@@ -45,8 +52,6 @@ class AzureOpenAIGrader():
|
|
|
45
52
|
self._validate_model_config()
|
|
46
53
|
self._validate_grader_config()
|
|
47
54
|
|
|
48
|
-
|
|
49
|
-
|
|
50
55
|
def _validate_model_config(self) -> None:
|
|
51
56
|
"""Validate the model configuration that this grader wrapper is using."""
|
|
52
57
|
if "api_key" not in self._model_config or not self._model_config.get("api_key"):
|
|
@@ -57,7 +62,7 @@ class AzureOpenAIGrader():
|
|
|
57
62
|
category=ErrorCategory.INVALID_VALUE,
|
|
58
63
|
target=ErrorTarget.AOAI_GRADER,
|
|
59
64
|
)
|
|
60
|
-
|
|
65
|
+
|
|
61
66
|
def _validate_grader_config(self) -> None:
|
|
62
67
|
"""Validate the grader configuration that this grader wrapper is using."""
|
|
63
68
|
|
|
@@ -71,19 +76,24 @@ class AzureOpenAIGrader():
|
|
|
71
76
|
:return: The OpenAI client.
|
|
72
77
|
:rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
|
|
73
78
|
"""
|
|
79
|
+
default_headers = {"User-Agent": UserAgentSingleton().value}
|
|
74
80
|
if "azure_endpoint" in self._model_config:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
from openai import AzureOpenAI
|
|
82
|
+
|
|
83
|
+
# TODO set default values?
|
|
84
|
+
return AzureOpenAI(
|
|
78
85
|
azure_endpoint=self._model_config["azure_endpoint"],
|
|
79
|
-
api_key=self._model_config.get("api_key", None),
|
|
80
|
-
api_version=DEFAULT_AOAI_API_VERSION,
|
|
86
|
+
api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
|
|
87
|
+
api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
|
|
81
88
|
azure_deployment=self._model_config.get("azure_deployment", ""),
|
|
89
|
+
default_headers=default_headers,
|
|
82
90
|
)
|
|
83
91
|
from openai import OpenAI
|
|
92
|
+
|
|
84
93
|
# TODO add default values for base_url and organization?
|
|
85
94
|
return OpenAI(
|
|
86
95
|
api_key=self._model_config["api_key"],
|
|
87
96
|
base_url=self._model_config.get("base_url", ""),
|
|
88
97
|
organization=self._model_config.get("organization", ""),
|
|
98
|
+
default_headers=default_headers,
|
|
89
99
|
)
|
|
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
9
9
|
|
|
10
10
|
from .aoai_grader import AzureOpenAIGrader
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
@experimental
|
|
13
14
|
class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
14
15
|
"""
|
|
@@ -42,12 +43,12 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
|
|
|
42
43
|
|
|
43
44
|
"""
|
|
44
45
|
|
|
45
|
-
id = "
|
|
46
|
+
id = "azureai://built-in/evaluators/azure-openai/label_grader"
|
|
46
47
|
|
|
47
48
|
def __init__(
|
|
48
49
|
self,
|
|
49
50
|
*,
|
|
50
|
-
model_config
|
|
51
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
51
52
|
input: List[Dict[str, str]],
|
|
52
53
|
labels: List[str],
|
|
53
54
|
model: str,
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Any, Dict, Union, List, Optional
|
|
5
|
+
|
|
6
|
+
from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
|
|
7
|
+
from openai.types.graders import ScoreModelGrader
|
|
8
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
9
|
+
|
|
10
|
+
from .aoai_grader import AzureOpenAIGrader
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@experimental
|
|
14
|
+
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
|
|
15
|
+
"""
|
|
16
|
+
Wrapper class for OpenAI's score model graders.
|
|
17
|
+
|
|
18
|
+
Enables continuous scoring evaluation with custom prompts and flexible
|
|
19
|
+
conversation-style inputs. Supports configurable score ranges and
|
|
20
|
+
pass thresholds for binary classification.
|
|
21
|
+
|
|
22
|
+
Supplying a ScoreModelGrader to the `evaluate` method will cause an
|
|
23
|
+
asynchronous request to evaluate the grader via the OpenAI API. The
|
|
24
|
+
results of the evaluation will then be merged into the standard
|
|
25
|
+
evaluation results.
|
|
26
|
+
|
|
27
|
+
:param model_config: The model configuration to use for the grader.
|
|
28
|
+
:type model_config: Union[
|
|
29
|
+
~azure.ai.evaluation.AzureOpenAIModelConfiguration,
|
|
30
|
+
~azure.ai.evaluation.OpenAIModelConfiguration
|
|
31
|
+
]
|
|
32
|
+
:param input: The input messages for the grader. List of conversation
|
|
33
|
+
messages with role and content.
|
|
34
|
+
:type input: List[Dict[str, str]]
|
|
35
|
+
:param model: The model to use for the evaluation.
|
|
36
|
+
:type model: str
|
|
37
|
+
:param name: The name of the grader.
|
|
38
|
+
:type name: str
|
|
39
|
+
:param range: The range of the score. Defaults to [0, 1].
|
|
40
|
+
:type range: Optional[List[float]]
|
|
41
|
+
:param pass_threshold: Score threshold for pass/fail classification.
|
|
42
|
+
Defaults to midpoint of range.
|
|
43
|
+
:type pass_threshold: Optional[float]
|
|
44
|
+
:param sampling_params: The sampling parameters for the model.
|
|
45
|
+
:type sampling_params: Optional[Dict[str, Any]]
|
|
46
|
+
:param kwargs: Additional keyword arguments to pass to the grader.
|
|
47
|
+
:type kwargs: Any
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
|
|
51
|
+
|
|
52
|
+
def __init__(
|
|
53
|
+
self,
|
|
54
|
+
*,
|
|
55
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
56
|
+
input: List[Dict[str, str]],
|
|
57
|
+
model: str,
|
|
58
|
+
name: str,
|
|
59
|
+
range: Optional[List[float]] = None,
|
|
60
|
+
pass_threshold: Optional[float] = None,
|
|
61
|
+
sampling_params: Optional[Dict[str, Any]] = None,
|
|
62
|
+
**kwargs: Any,
|
|
63
|
+
):
|
|
64
|
+
# Validate range and pass_threshold
|
|
65
|
+
if range is not None:
|
|
66
|
+
if len(range) != 2 or range[0] >= range[1]:
|
|
67
|
+
raise ValueError("range must be a list of two numbers [min, max] where min < max")
|
|
68
|
+
else:
|
|
69
|
+
range = [0.0, 1.0] # Default range
|
|
70
|
+
|
|
71
|
+
if pass_threshold is not None:
|
|
72
|
+
if range and (pass_threshold < range[0] or pass_threshold > range[1]):
|
|
73
|
+
raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}")
|
|
74
|
+
else:
|
|
75
|
+
pass_threshold = (range[0] + range[1]) / 2 # Default to midpoint
|
|
76
|
+
|
|
77
|
+
# Store pass_threshold as instance attribute
|
|
78
|
+
self.pass_threshold = pass_threshold
|
|
79
|
+
|
|
80
|
+
# Create OpenAI ScoreModelGrader instance
|
|
81
|
+
grader_kwargs = {"input": input, "model": model, "name": name, "type": "score_model"}
|
|
82
|
+
|
|
83
|
+
if range is not None:
|
|
84
|
+
grader_kwargs["range"] = range
|
|
85
|
+
if sampling_params is not None:
|
|
86
|
+
grader_kwargs["sampling_params"] = sampling_params
|
|
87
|
+
|
|
88
|
+
grader = ScoreModelGrader(**grader_kwargs)
|
|
89
|
+
|
|
90
|
+
super().__init__(model_config=model_config, grader_config=grader, **kwargs)
|
|
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
10
10
|
|
|
11
11
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
@experimental
|
|
14
15
|
class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
15
16
|
"""
|
|
@@ -38,12 +39,12 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
|
|
|
38
39
|
|
|
39
40
|
"""
|
|
40
41
|
|
|
41
|
-
id = "
|
|
42
|
+
id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
|
|
42
43
|
|
|
43
44
|
def __init__(
|
|
44
45
|
self,
|
|
45
46
|
*,
|
|
46
|
-
model_config
|
|
47
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
47
48
|
input: str,
|
|
48
49
|
name: str,
|
|
49
50
|
operation: Literal[
|
|
@@ -10,6 +10,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
10
10
|
|
|
11
11
|
from .aoai_grader import AzureOpenAIGrader
|
|
12
12
|
|
|
13
|
+
|
|
13
14
|
@experimental
|
|
14
15
|
class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
15
16
|
"""
|
|
@@ -52,12 +53,12 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
|
|
|
52
53
|
|
|
53
54
|
"""
|
|
54
55
|
|
|
55
|
-
id = "
|
|
56
|
+
id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
|
|
56
57
|
|
|
57
58
|
def __init__(
|
|
58
59
|
self,
|
|
59
60
|
*,
|
|
60
|
-
model_config
|
|
61
|
+
model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
61
62
|
evaluation_metric: Literal[
|
|
62
63
|
"fuzzy_match",
|
|
63
64
|
"bleu",
|
|
@@ -19,6 +19,7 @@ from azure.core.pipeline.policies import ProxyPolicy, AsyncRetryPolicy
|
|
|
19
19
|
|
|
20
20
|
class AzureEnvironmentMetadata(TypedDict):
|
|
21
21
|
"""Configuration for various Azure environments. All endpoints include a trailing slash."""
|
|
22
|
+
|
|
22
23
|
portal_endpoint: str
|
|
23
24
|
"""The management portal for the Azure environment (e.g. https://portal.azure.com/)"""
|
|
24
25
|
resource_manager_endpoint: str
|
|
@@ -107,15 +108,15 @@ class AzureEnvironmentClient:
|
|
|
107
108
|
|
|
108
109
|
def case_insensitive_match(d: Mapping[str, Any], key: str) -> Optional[Any]:
|
|
109
110
|
key = key.strip().lower()
|
|
110
|
-
return next((v for k,v in d.items() if k.strip().lower() == key), None)
|
|
111
|
+
return next((v for k, v in d.items() if k.strip().lower() == key), None)
|
|
111
112
|
|
|
112
113
|
async with _ASYNC_LOCK:
|
|
113
114
|
cloud = _KNOWN_AZURE_ENVIRONMENTS.get(name) or case_insensitive_match(_KNOWN_AZURE_ENVIRONMENTS, name)
|
|
114
115
|
if cloud:
|
|
115
116
|
return cloud
|
|
116
|
-
default_endpoint = (
|
|
117
|
-
|
|
118
|
-
|
|
117
|
+
default_endpoint = _KNOWN_AZURE_ENVIRONMENTS.get(_DEFAULT_AZURE_ENV_NAME, {}).get(
|
|
118
|
+
"resource_manager_endpoint"
|
|
119
|
+
)
|
|
119
120
|
|
|
120
121
|
metadata_url = self.get_default_metadata_url(default_endpoint)
|
|
121
122
|
clouds = await self.get_clouds_async(metadata_url=metadata_url, update_cached=update_cached)
|
|
@@ -124,10 +125,7 @@ class AzureEnvironmentClient:
|
|
|
124
125
|
return cloud_metadata
|
|
125
126
|
|
|
126
127
|
async def get_clouds_async(
|
|
127
|
-
self,
|
|
128
|
-
*,
|
|
129
|
-
metadata_url: Optional[str] = None,
|
|
130
|
-
update_cached: bool = True
|
|
128
|
+
self, *, metadata_url: Optional[str] = None, update_cached: bool = True
|
|
131
129
|
) -> Mapping[str, AzureEnvironmentMetadata]:
|
|
132
130
|
metadata_url = metadata_url or self.get_default_metadata_url()
|
|
133
131
|
|
|
@@ -149,7 +147,8 @@ class AzureEnvironmentClient:
|
|
|
149
147
|
default_endpoint = default_endpoint or "https://management.azure.com/"
|
|
150
148
|
metadata_url = os.getenv(
|
|
151
149
|
_ENV_ARM_CLOUD_METADATA_URL,
|
|
152
|
-
f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}"
|
|
150
|
+
f"{default_endpoint}metadata/endpoints?api-version={AzureEnvironmentClient.DEFAULT_API_VERSION}",
|
|
151
|
+
)
|
|
153
152
|
return metadata_url
|
|
154
153
|
|
|
155
154
|
@staticmethod
|
|
@@ -197,7 +196,7 @@ class AzureEnvironmentClient:
|
|
|
197
196
|
|
|
198
197
|
def recursive_update(d: Dict, u: Mapping) -> None:
|
|
199
198
|
"""Recursively update a dictionary.
|
|
200
|
-
|
|
199
|
+
|
|
201
200
|
:param Dict d: The dictionary to update.
|
|
202
201
|
:param Mapping u: The mapping to update from.
|
|
203
202
|
"""
|
|
@@ -73,7 +73,13 @@ class AzureMLTokenManager(APITokenManager):
|
|
|
73
73
|
return super().get_aad_credential()
|
|
74
74
|
|
|
75
75
|
def get_token(
|
|
76
|
-
|
|
76
|
+
self,
|
|
77
|
+
scopes=None,
|
|
78
|
+
claims: Union[str, None] = None,
|
|
79
|
+
tenant_id: Union[str, None] = None,
|
|
80
|
+
enable_cae: bool = False,
|
|
81
|
+
**kwargs: Any
|
|
82
|
+
) -> AccessToken:
|
|
77
83
|
"""Get the API token. If the token is not available or has expired, refresh the token.
|
|
78
84
|
|
|
79
85
|
:return: API token
|
|
@@ -5,8 +5,17 @@ from enum import Enum
|
|
|
5
5
|
|
|
6
6
|
from azure.core import CaseInsensitiveEnumMeta
|
|
7
7
|
|
|
8
|
-
PROMPT_BASED_REASON_EVALUATORS = [
|
|
9
|
-
|
|
8
|
+
PROMPT_BASED_REASON_EVALUATORS = [
|
|
9
|
+
"coherence",
|
|
10
|
+
"relevance",
|
|
11
|
+
"retrieval",
|
|
12
|
+
"groundedness",
|
|
13
|
+
"fluency",
|
|
14
|
+
"intent_resolution",
|
|
15
|
+
"tool_call_accurate",
|
|
16
|
+
"response_completeness",
|
|
17
|
+
"task_adherence",
|
|
18
|
+
]
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class CommonConstants:
|
|
@@ -6,13 +6,22 @@ import logging
|
|
|
6
6
|
from typing import Union, Any, Dict
|
|
7
7
|
from azure.core.credentials import AzureKeyCredential, TokenCredential
|
|
8
8
|
from azure.ai.evaluation._common.onedp import AIProjectClient as RestEvaluationServiceClient
|
|
9
|
-
from azure.ai.evaluation._common.onedp.models import (
|
|
10
|
-
|
|
9
|
+
from azure.ai.evaluation._common.onedp.models import (
|
|
10
|
+
PendingUploadRequest,
|
|
11
|
+
PendingUploadType,
|
|
12
|
+
EvaluationResult,
|
|
13
|
+
ResultType,
|
|
14
|
+
AssetCredentialRequest,
|
|
15
|
+
EvaluationUpload,
|
|
16
|
+
InputDataset,
|
|
17
|
+
RedTeamUpload,
|
|
18
|
+
)
|
|
11
19
|
from azure.storage.blob import ContainerClient
|
|
12
20
|
from .utils import upload
|
|
13
21
|
|
|
14
22
|
LOGGER = logging.getLogger(__name__)
|
|
15
23
|
|
|
24
|
+
|
|
16
25
|
class EvaluationServiceOneDPClient:
|
|
17
26
|
|
|
18
27
|
def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None:
|
|
@@ -23,7 +32,15 @@ class EvaluationServiceOneDPClient:
|
|
|
23
32
|
)
|
|
24
33
|
|
|
25
34
|
def create_evaluation_result(
|
|
26
|
-
|
|
35
|
+
self,
|
|
36
|
+
*,
|
|
37
|
+
name: str,
|
|
38
|
+
path: str,
|
|
39
|
+
version=1,
|
|
40
|
+
metrics: Dict[str, int] = None,
|
|
41
|
+
result_type: ResultType = ResultType.EVALUATION,
|
|
42
|
+
**kwargs,
|
|
43
|
+
) -> EvaluationResult:
|
|
27
44
|
"""Create and upload evaluation results to Azure evaluation service.
|
|
28
45
|
|
|
29
46
|
This method uploads evaluation results from a local path to Azure Blob Storage
|
|
@@ -49,17 +66,20 @@ class EvaluationServiceOneDPClient:
|
|
|
49
66
|
:raises: Various exceptions from the underlying API calls or upload process
|
|
50
67
|
"""
|
|
51
68
|
|
|
52
|
-
LOGGER.debug(
|
|
69
|
+
LOGGER.debug(
|
|
70
|
+
f"Creating evaluation result for {name} with version {version} type {result_type} from path {path}"
|
|
71
|
+
)
|
|
53
72
|
start_pending_upload_response = self.rest_client.evaluation_results.start_pending_upload(
|
|
54
73
|
name=name,
|
|
55
74
|
version=version,
|
|
56
75
|
body=PendingUploadRequest(pending_upload_type=PendingUploadType.TEMPORARY_BLOB_REFERENCE),
|
|
57
|
-
**kwargs
|
|
76
|
+
**kwargs,
|
|
58
77
|
)
|
|
59
78
|
|
|
60
79
|
LOGGER.debug(f"Uploading {path} to {start_pending_upload_response.blob_reference_for_consumption.blob_uri}")
|
|
61
80
|
with ContainerClient.from_container_url(
|
|
62
|
-
start_pending_upload_response.blob_reference_for_consumption.credential.sas_uri
|
|
81
|
+
start_pending_upload_response.blob_reference_for_consumption.credential.sas_uri
|
|
82
|
+
) as container_client:
|
|
63
83
|
upload(path=path, container_client=container_client, logger=LOGGER)
|
|
64
84
|
|
|
65
85
|
LOGGER.debug(f"Creating evaluation result version for {name} with version {version}")
|
|
@@ -73,7 +93,7 @@ class EvaluationServiceOneDPClient:
|
|
|
73
93
|
),
|
|
74
94
|
name=name,
|
|
75
95
|
version=version,
|
|
76
|
-
**kwargs
|
|
96
|
+
**kwargs,
|
|
77
97
|
)
|
|
78
98
|
|
|
79
99
|
return create_version_response
|
|
@@ -90,10 +110,7 @@ class EvaluationServiceOneDPClient:
|
|
|
90
110
|
:rtype: EvaluationUpload
|
|
91
111
|
:raises: Various exceptions from the underlying API calls
|
|
92
112
|
"""
|
|
93
|
-
upload_run_response = self.rest_client.evaluations.upload_run(
|
|
94
|
-
evaluation=evaluation,
|
|
95
|
-
**kwargs
|
|
96
|
-
)
|
|
113
|
+
upload_run_response = self.rest_client.evaluations.upload_run(evaluation=evaluation, **kwargs)
|
|
97
114
|
|
|
98
115
|
return upload_run_response
|
|
99
116
|
|
|
@@ -112,11 +129,7 @@ class EvaluationServiceOneDPClient:
|
|
|
112
129
|
:rtype: EvaluationUpload
|
|
113
130
|
:raises: Various exceptions from the underlying API calls
|
|
114
131
|
"""
|
|
115
|
-
update_run_response = self.rest_client.evaluations.upload_update_run(
|
|
116
|
-
name=name,
|
|
117
|
-
evaluation=evaluation,
|
|
118
|
-
**kwargs
|
|
119
|
-
)
|
|
132
|
+
update_run_response = self.rest_client.evaluations.upload_update_run(name=name, evaluation=evaluation, **kwargs)
|
|
120
133
|
|
|
121
134
|
return update_run_response
|
|
122
135
|
|
|
@@ -132,10 +145,7 @@ class EvaluationServiceOneDPClient:
|
|
|
132
145
|
:rtype: ~azure.ai.evaluation._common.onedp.models.RedTeamUpload
|
|
133
146
|
:raises: Various exceptions from the underlying API calls
|
|
134
147
|
"""
|
|
135
|
-
upload_run_response = self.rest_client.red_teams.upload_run(
|
|
136
|
-
redteam=red_team,
|
|
137
|
-
**kwargs
|
|
138
|
-
)
|
|
148
|
+
upload_run_response = self.rest_client.red_teams.upload_run(redteam=red_team, **kwargs)
|
|
139
149
|
|
|
140
150
|
return upload_run_response
|
|
141
151
|
|
|
@@ -154,10 +164,6 @@ class EvaluationServiceOneDPClient:
|
|
|
154
164
|
:rtype: ~azure.ai.evaluation._common.onedp.models.RedTeamUpload
|
|
155
165
|
:raises: Various exceptions from the underlying API calls
|
|
156
166
|
"""
|
|
157
|
-
update_run_response = self.rest_client.red_teams.upload_update_run(
|
|
158
|
-
name=name,
|
|
159
|
-
redteam=red_team,
|
|
160
|
-
**kwargs
|
|
161
|
-
)
|
|
167
|
+
update_run_response = self.rest_client.red_teams.upload_update_run(name=name, redteam=red_team, **kwargs)
|
|
162
168
|
|
|
163
|
-
return update_run_response
|
|
169
|
+
return update_run_response
|
|
@@ -1,32 +1,32 @@
|
|
|
1
|
-
# coding=utf-8
|
|
2
|
-
# --------------------------------------------------------------------------
|
|
3
|
-
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
4
|
-
# Licensed under the MIT License. See License.txt in the project root for license information.
|
|
5
|
-
# Code generated by Microsoft (R) Python Code Generator.
|
|
6
|
-
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
|
|
7
|
-
# --------------------------------------------------------------------------
|
|
8
|
-
# pylint: disable=wrong-import-position
|
|
9
|
-
|
|
10
|
-
from typing import TYPE_CHECKING
|
|
11
|
-
|
|
12
|
-
if TYPE_CHECKING:
|
|
13
|
-
from ._patch import * # pylint: disable=unused-wildcard-import
|
|
14
|
-
|
|
15
|
-
from ._client import AIProjectClient # type: ignore
|
|
16
|
-
from ._version import VERSION
|
|
17
|
-
|
|
18
|
-
__version__ = VERSION
|
|
19
|
-
|
|
20
|
-
try:
|
|
21
|
-
from ._patch import __all__ as _patch_all
|
|
22
|
-
from ._patch import *
|
|
23
|
-
except ImportError:
|
|
24
|
-
_patch_all = []
|
|
25
|
-
from ._patch import patch_sdk as _patch_sdk
|
|
26
|
-
|
|
27
|
-
__all__ = [
|
|
28
|
-
"AIProjectClient",
|
|
29
|
-
]
|
|
30
|
-
__all__.extend([p for p in _patch_all if p not in __all__]) # pyright: ignore
|
|
31
|
-
|
|
32
|
-
_patch_sdk()
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# --------------------------------------------------------------------------
|
|
3
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
4
|
+
# Licensed under the MIT License. See License.txt in the project root for license information.
|
|
5
|
+
# Code generated by Microsoft (R) Python Code Generator.
|
|
6
|
+
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
|
|
7
|
+
# --------------------------------------------------------------------------
|
|
8
|
+
# pylint: disable=wrong-import-position
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from ._patch import * # pylint: disable=unused-wildcard-import
|
|
14
|
+
|
|
15
|
+
from ._client import AIProjectClient # type: ignore
|
|
16
|
+
from ._version import VERSION
|
|
17
|
+
|
|
18
|
+
__version__ = VERSION
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from ._patch import __all__ as _patch_all
|
|
22
|
+
from ._patch import *
|
|
23
|
+
except ImportError:
|
|
24
|
+
_patch_all = []
|
|
25
|
+
from ._patch import patch_sdk as _patch_sdk
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"AIProjectClient",
|
|
29
|
+
]
|
|
30
|
+
__all__.extend([p for p in _patch_all if p not in __all__]) # pyright: ignore
|
|
31
|
+
|
|
32
|
+
_patch_sdk()
|