azure-ai-evaluation 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic by the registry diff service.
- azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
- azure/ai/evaluation/_aoai/label_grader.py +8 -3
- azure/ai/evaluation/_aoai/python_grader.py +8 -3
- azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
- azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
- azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +279 -50
- azure/ai/evaluation/_evaluate/_utils.py +7 -3
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
- azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
- azure/ai/evaluation/red_team/_red_team.py +9 -0
- azure/ai/evaluation/red_team/_red_team_result.py +230 -1
- azure/ai/evaluation/red_team/_result_processor.py +416 -23
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +19 -3
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_aoai/aoai_grader.py

@@ -1,13 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from …
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
+from typing_extensions import TypeIs
 
-from azure.ai.evaluation.…
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION, TokenScope
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._user_agent import UserAgentSingleton
-from …
-
+from azure.core.credentials import TokenCredential
+
+if TYPE_CHECKING:
+    from openai.lib.azure import AzureADTokenProvider
 
 
 @experimental
@@ -30,6 +36,8 @@ class AzureOpenAIGrader:
     to be formatted as a dictionary that matches the specifications of the sub-types of
     the TestingCriterion alias specified in (OpenAI's SDK)[https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151].
     :type grader_config: Dict[str, Any]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -43,10 +51,12 @@ class AzureOpenAIGrader:
         *,
         model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
         grader_config: Dict[str, Any],
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         self._model_config = model_config
         self._grader_config = grader_config
+        self._credential = credential
 
         if kwargs.get("validate", True):
             self._validate_model_config()
@@ -54,20 +64,39 @@ class AzureOpenAIGrader:
 
     def _validate_model_config(self) -> None:
         """Validate the model configuration that this grader wrapper is using."""
-…
-…
-…
-…
-…
-…
-…
-…
+        msg = None
+        if self._is_azure_model_config(self._model_config):
+            if not any(auth for auth in (self._model_config.get("api_key"), self._credential)):
+                msg = (
+                    f"{type(self).__name__}: Requires an api_key in the supplied model_config, "
+                    + "or providing a credential to the grader's __init__ method. "
+                )
+
+        else:
+            if "api_key" not in self._model_config or not self._model_config.get("api_key"):
+                msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
+
+        if msg is None:
+            return
+
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.AOAI_GRADER,
+        )
 
     def _validate_grader_config(self) -> None:
         """Validate the grader configuration that this grader wrapper is using."""
 
         return
 
+    @staticmethod
+    def _is_azure_model_config(
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+    ) -> TypeIs[AzureOpenAIModelConfiguration]:
+        return "azure_endpoint" in model_config
+
     def get_client(self) -> Any:
         """Construct an appropriate OpenAI client using this grader's model configuration.
         Returns a slightly different client depending on whether or not this grader's model
@@ -77,23 +106,38 @@ class AzureOpenAIGrader:
         :rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
         """
         default_headers = {"User-Agent": UserAgentSingleton().value}
-…
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] = self._model_config
+        api_key: Optional[str] = model_config.get("api_key")
+
+        if self._is_azure_model_config(model_config):
             from openai import AzureOpenAI
 
             # TODO set default values?
             return AzureOpenAI(
-                azure_endpoint=…
-                api_key=…
+                azure_endpoint=model_config["azure_endpoint"],
+                api_key=api_key,  # Default-style access to appease linters.
                 api_version=DEFAULT_AOAI_API_VERSION,  # Force a known working version
-                azure_deployment=…
+                azure_deployment=model_config.get("azure_deployment", ""),
+                azure_ad_token_provider=self._get_token_provider(self._credential) if not api_key else None,
                 default_headers=default_headers,
             )
         from openai import OpenAI
 
         # TODO add default values for base_url and organization?
         return OpenAI(
-            api_key=…
-            base_url=…
-            organization=…
+            api_key=api_key,
+            base_url=model_config.get("base_url", ""),
+            organization=model_config.get("organization", ""),
             default_headers=default_headers,
         )
+
+    @staticmethod
+    def _get_token_provider(cred: TokenCredential) -> "AzureADTokenProvider":
+        """Get the token provider the AzureOpenAI client.
+
+        :param TokenCredential cred: The Azure authentication credential.
+        :return: The token provider if a credential is provided, otherwise None.
+        :rtype: openai.lib.azure.AzureADTokenProvider
+        """
+
+        return lambda: cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT).token
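
Net effect of the aoai_grader.py changes: an Azure OpenAI-backed grader can now authenticate with an Entra ID credential instead of an api_key; validation accepts either, and get_client wires the credential in as an azure_ad_token_provider. A minimal usage sketch, not part of the diff: the endpoint/deployment placeholders and the grader_config template fields are illustrative, and the private import path simply mirrors the module shown above.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

    # Keyless model configuration: no "api_key", so the grader relies on the credential below.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<your-deployment>",
    }

    # Any TestingCriterion-shaped dict works; a simple string check is used for illustration.
    grader_config = {
        "type": "string_check",
        "name": "exact_match",
        "input": "{{sample.output_text}}",   # template fields are illustrative
        "operation": "eq",
        "reference": "{{item.expected}}",
    }

    grader = AzureOpenAIGrader(
        model_config=model_config,
        grader_config=grader_config,
        credential=DefaultAzureCredential(),  # new in 1.12.0: used to mint AAD tokens
    )
    client = grader.get_client()  # AzureOpenAI client with azure_ad_token_provider set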

azure/ai/evaluation/_aoai/label_grader.py

@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, …
+from typing import Any, Dict, List, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import LabelModelGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -37,6 +39,8 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
     :type name: str
     :param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
     :type passing_labels: List[str]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -54,6 +58,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
         model: str,
         name: str,
         passing_labels: List[str],
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = LabelModelGrader(
@@ -64,4 +69,4 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
             passing_labels=passing_labels,
             type="label_model",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)

azure/ai/evaluation/_aoai/python_grader.py

@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, …
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import PythonGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -39,6 +41,8 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
     :param source: Python source code containing the grade function.
         Must define: def grade(sample: dict, item: dict) -> float
     :type source: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -63,6 +67,7 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
         image_tag: str,
         pass_threshold: float,
         source: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         # Validate pass_threshold
@@ -81,4 +86,4 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
             type="python",
         )
 
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
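
The docstring above pins down the contract for source: it must define def grade(sample: dict, item: dict) -> float. A hedged sketch of such a source string and of the new credential parameter follows; the field names used inside grade() and the name/image_tag values are assumptions for illustration, not taken from this diff.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation._aoai.python_grader import AzureOpenAIPythonGrader

    GRADE_SOURCE = """
    def grade(sample: dict, item: dict) -> float:
        # Return 1.0 when the model output matches the expected answer, else 0.0.
        output = str(sample.get("output_text", "")).strip().lower()
        expected = str(item.get("expected", "")).strip().lower()
        return 1.0 if output == expected else 0.0
    """

    python_grader = AzureOpenAIPythonGrader(
        model_config={
            "azure_endpoint": "https://<your-resource>.openai.azure.com",
            "azure_deployment": "<your-deployment>",
        },
        name="exact_match",        # assumed parameter value
        image_tag="2025-05-08",    # assumed image tag
        pass_threshold=0.5,
        source=GRADE_SOURCE,
        credential=DefaultAzureCredential(),  # new in 1.12.0
    )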

azure/ai/evaluation/_aoai/score_model_grader.py

@@ -1,11 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, …
+from typing import Any, Dict, List, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import ScoreModelGrader
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -43,6 +45,8 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
     :type pass_threshold: Optional[float]
     :param sampling_params: The sampling parameters for the model.
     :type sampling_params: Optional[Dict[str, Any]]
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
     """
@@ -59,6 +63,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
         range: Optional[List[float]] = None,
         pass_threshold: Optional[float] = None,
         sampling_params: Optional[Dict[str, Any]] = None,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any,
     ):
         # Validate range and pass_threshold
@@ -88,4 +93,4 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
 
         grader = ScoreModelGrader(**grader_kwargs)
 
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)

azure/ai/evaluation/_aoai/string_check_grader.py

@@ -1,12 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union
-from typing_extensions import Literal
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import StringCheckGrader
+from typing_extensions import Literal
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -33,6 +35,8 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
     :type operation: Literal["eq", "ne", "like", "ilike"]
     :param reference: The reference text. This may include template strings.
     :type reference: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -54,6 +58,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
             "ilike",
         ],
         reference: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = StringCheckGrader(
@@ -63,4 +68,4 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
             reference=reference,
             type="string_check",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)

azure/ai/evaluation/_aoai/text_similarity_grader.py

@@ -1,12 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Any, Dict, Union
-from typing_extensions import Literal
+from typing import Any, Dict, Optional, Union
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from openai.types.graders import TextSimilarityGrader
+from typing_extensions import Literal
+
 from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.core.credentials import TokenCredential
 
 from .aoai_grader import AzureOpenAIGrader
 
@@ -47,6 +49,8 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
     :type reference: str
     :param name: The name of the grader.
     :type name: str
+    :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+    :type credential: ~azure.core.credentials.TokenCredential
     :param kwargs: Additional keyword arguments to pass to the grader.
     :type kwargs: Any
 
@@ -76,6 +80,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
         pass_threshold: float,
         reference: str,
         name: str,
+        credential: Optional[TokenCredential] = None,
         **kwargs: Any
     ):
         grader = TextSimilarityGrader(
@@ -86,4 +91,4 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
             reference=reference,
             type="text_similarity",
         )
-        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+        super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)

azure/ai/evaluation/_eval_mapping.py

@@ -11,6 +11,7 @@
 
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,
@@ -67,6 +68,7 @@ EVAL_CLASS_MAP = {
     SexualEvaluator: "sexual",
     SimilarityEvaluator: "similarity",
     TaskAdherenceEvaluator: "task_adherence",
+    TaskSuccessEvaluator: "task_success",
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
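
The mapping change simply registers the new evaluator under the "task_success" key, so anything that resolves a metric prefix by evaluator class now covers it. An illustrative check (these are private modules, so this is for inspection only):

    from azure.ai.evaluation._eval_mapping import EVAL_CLASS_MAP
    from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator

    assert EVAL_CLASS_MAP[TaskSuccessEvaluator] == "task_success"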

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -9,7 +9,7 @@ import os
 import re
 import tempfile
 import json
-from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -18,6 +18,7 @@ import pandas as pd
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -317,6 +318,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
+    # Convert "not applicable" strings to None to allow proper numeric aggregation
+    df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
     # NOTE: nan/None values don't count as as booleans, so boolean columns with
     # nan/None values won't have a mean produced from them.
     # This is different from label-based known evaluators, which have special handling.
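
The new replace call matters because pandas skips missing values when averaging, so rows where an evaluator reported EvaluatorBase._NOT_APPLICABLE_RESULT no longer mix string values into numeric aggregation. A standalone sketch of the idea, not library code: the sentinel is written out literally, NaN stands in for None, and the column name is made up.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"outputs.my_eval.score": [4.0, "not applicable", 5.0]})

    # Same idea as the change above: turn the sentinel into a missing value so the
    # column can be aggregated numerically; missing rows are skipped by mean().
    df = df.replace("not applicable", np.nan)
    print(df["outputs.my_eval.score"].astype(float).mean())  # 4.5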

@@ -1131,11 +1135,36 @@ def _preprocess_data(
     # via target mapping.
     # If both the data and the output dictionary of the target function
     # have the same column, then the target function value is used.
+    # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
+    # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
+    if input_data_df is not None:
+        if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
+            # No action is taken when 'conversation' or 'messages' columns are present,
+            # as these indicate chat/conversation data which should not be flattened or mapped by default.
+            pass
+        else:
+            input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
+
+    # Build default mapping for leaves:
     if input_data_df is not None:
+        # First, map flattened nested columns (those containing a dot) to leaf names.
+        for col in input_data_df.columns:
+            # Skip target output columns
+            if col.startswith(Prefixes.TSG_OUTPUTS):
+                continue
+            # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
+            if "." in col:
+                leaf_name = col.split(".")[-1]
+                if leaf_name not in column_mapping["default"]:
+                    column_mapping["default"][leaf_name] = f"${{data.{col}}}"
+
+        # Then, handle remaining top-level primitive columns (original logic).
         for col in input_data_df.columns:
-…
-…
-…
+            if (
+                not col.startswith(Prefixes.TSG_OUTPUTS)
+                and col not in column_mapping["default"].keys()
+                and "." not in col  # only pure top-level primitives
+            ):
                 column_mapping["default"][col] = f"${{data.{col}}}"
 
     return __ValidatedData(
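
Concretely, the new pass exposes a flattened column such as item.context.text to evaluators under its leaf name while still pointing at the dotted data path, and plain top-level columns keep the old behavior. A standalone sketch of the rule follows; the "__outputs." prefix stands in for Prefixes.TSG_OUTPUTS and is an assumption here.

    TARGET_OUTPUT_PREFIX = "__outputs."  # assumed stand-in for Prefixes.TSG_OUTPUTS

    columns = ["query", "item.context.text", "item.label", "__outputs.response"]
    default_mapping = {}

    # Dotted (flattened) columns are keyed by their leaf name first ...
    for col in columns:
        if col.startswith(TARGET_OUTPUT_PREFIX):
            continue
        if "." in col:
            default_mapping.setdefault(col.split(".")[-1], f"${{data.{col}}}")

    # ... then remaining top-level primitive columns keep the original behavior.
    for col in columns:
        if not col.startswith(TARGET_OUTPUT_PREFIX) and col not in default_mapping and "." not in col:
            default_mapping[col] = f"${{data.{col}}}"

    print(default_mapping)
    # {'text': '${data.item.context.text}', 'label': '${data.item.label}', 'query': '${data.query}'}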

@@ -1149,6 +1178,79 @@
     )
 
 
+def _flatten_object_columns_for_default_mapping(
+    df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
+) -> pd.DataFrame:
+    """Flatten nested dictionary-valued columns into dotted leaf columns.
+
+    For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
+    leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
+    columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
+    all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
+    are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
+
+    Example
+        If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
+        columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
+
+    :param df: Input DataFrame to flatten in place.
+    :type df: ~pandas.DataFrame
+    :param root_prefixes: Optional iterable restricting which top-level columns are considered
+        for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
+    :type root_prefixes: Optional[Iterable[str]]
+    :return: The same DataFrame instance (returned for convenient chaining).
+    :rtype: ~pandas.DataFrame
+    """
+    candidate_cols = []
+    if root_prefixes is not None:
+        candidate_cols = [c for c in root_prefixes if c in df.columns]
+    else:
+        # pick columns where at least one non-null value is a dict
+        for c in df.columns:
+            series = df[c]
+            if series.map(lambda v: isinstance(v, dict)).any():
+                candidate_cols.append(c)
+
+    def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                new_prefix = f"{prefix}.{k}" if prefix else k
+                if isinstance(v, dict):
+                    yield from _extract_leaves(v, new_prefix)
+                else:
+                    # treat list / primitive / None as leaf
+                    yield new_prefix, v
+
+    for root_col in candidate_cols:
+        # Build a union of leaf paths across rows to ensure consistent columns
+        leaf_paths: Set[str] = set()
+        for val in df[root_col]:
+            if isinstance(val, dict):
+                for path, _ in _extract_leaves(val, root_col):
+                    leaf_paths.add(path)
+
+        if not leaf_paths:
+            continue
+
+        # Create each flattened column if absent
+        for path in leaf_paths:
+            if path in df.columns:
+                continue  # already present
+            relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
+
+            def getter(root_val: Any) -> Any:
+                cur = root_val
+                for rk in relative_keys:
+                    if not isinstance(cur, dict):
+                        return None
+                    cur = cur.get(rk, None)
+                return cur
+
+            df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
+
+    return df
+
+
 def _run_callable_evaluators(
     validated_data: __ValidatedData,
     fail_on_evaluator_errors: bool = False,