azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +1 -1
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +182 -12
- azure/ai/evaluation/_constants.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
- azure/ai/evaluation/_exceptions.py +9 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +5 -5
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +112 -113
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

```diff
@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,31 +16,30 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec
 
+from azure.ai.evaluation._model_configurations import AzureAIProject
+
 from ..._user_agent import USER_AGENT
-from .._utils import _trace_destination_from_project_scope
+from .._utils import EvaluateResult, _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
-R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
    """
    Get evaluator type for telemetry.
 
    :param evaluator: The evaluator object
    :type evaluator: Dict[str, Callable]
    :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: Literal["content-safety", "built-in", "custom"]
    """
-    built_in = False
-    content_safety = False
-
    module = inspect.getmodule(evaluator)
-
-
-
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
 
    if content_safety:
        return "content-safety"
@@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
    """Decorator to log evaluate activity
 
    :param func: The function to be decorated
    :type func: Callable
    :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluateResult]
    """
 
    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
        from promptflow._sdk._telemetry import ActivityType, log_activity
        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators = kwargs.get("evaluators",
-        azure_ai_project = kwargs.get("azure_ai_project", None)
+        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
 
        pf_client = PFClient(
            config=(
@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
        evaluate_target = bool(kwargs.get("target", None))
        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
+        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
            "evaluate_target": evaluate_target,
            "evaluator_config": evaluator_config,
```
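The telemetry hunks above replace the old boolean-flag bookkeeping in `_get_evaluator_type` with two module-name prefix checks and narrow the return annotation to a `Literal`. The sketch below restates that classification rule as a standalone function so it can be read outside the decorator; `classify_evaluator` is a hypothetical name, the shipped function is `_get_evaluator_type`.

```python
import inspect
from typing import Callable, Literal


def classify_evaluator(evaluator: Callable) -> Literal["content-safety", "built-in", "custom"]:
    # Mirror of the prefix checks introduced in _get_evaluator_type: anything under
    # azure.ai.evaluation._evaluators. counts as built-in, and the _content_safety
    # subpackage is reported separately for telemetry.
    module = inspect.getmodule(evaluator)
    module_name = module.__name__ if module else ""

    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

    if content_safety:
        return "content-safety"
    if built_in:
        return "built-in"
    return "custom"
```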
azure/ai/evaluation/_evaluate/_utils.py

```diff
@@ -6,15 +6,22 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-from typing import Dict
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union
 
 import pandas as pd
-
-from
+from promptflow.client import PFClient
+from promptflow.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject
 
 LOGGER = logging.getLogger(__name__)
 
@@ -23,14 +30,26 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
+
+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str
 
 
-
+class EvaluateResult(TypedDict):
+    metrics: Dict[str, float]
+    studio_url: Optional[str]
+    rows: List[Dict]
+
+
+def is_none(value) -> bool:
    return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
    if not match or len(match.groups()) != 5:
        raise EvaluationException(
@@ -47,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:
    subscription_id = match.group(1)
    resource_group_name = match.group(3)
    workspace_name = match.group(5)
-    return
+    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
@@ -55,7 +74,7 @@ def load_jsonl(path):
        return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination):
+def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
    from promptflow.azure._cli._utils import _get_azure_pf_client
 
    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -69,14 +88,14 @@ def _azure_pf_client_and_triad(trace_destination):
 
 
 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-) -> str:
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Run,
+    evaluation_name: Optional[str],
+) -> Optional[str]:
    if trace_destination is None:
-        LOGGER.
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
        return None
 
    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -94,7 +113,6 @@ def _log_metrics_and_instance_results(
        ml_client=azure_pf_client.ml_client,
        promptflow_run=run,
    ) as ev_run:
-
        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -112,7 +130,8 @@ def _log_metrics_and_instance_results(
        if run is None:
            ev_run.write_properties_to_run_history(
                properties={
-
+                    EvaluationRunProperties.RUN_TYPE: "eval_run",
+                    EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
                    "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                    "isEvaluatorRun": "true",
                }
@@ -138,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
    return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
    subscription_id = project_scope["subscription_id"]
    resource_group_name = project_scope["resource_group_name"]
    workspace_name = project_scope["project_name"]
@@ -151,9 +170,9 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str:
    return trace_destination
 
 
-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
    p = Path(path)
-    if
+    if p.is_dir():
        p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -161,7 +180,7 @@ def _write_output(path, data_dict):
 
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
    """
    Apply column mapping to source_df based on mapping_config.
@@ -211,7 +230,7 @@ def _apply_column_mapping(
    return result_df
 
 
-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
    return hasattr(evaluator, "__aggregate__")
 
 
@@ -234,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
    return default_value
 
 
-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
    import asyncio
    import platform
 
    if platform.system().lower() == "windows":
        # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
        # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
```
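`extract_workspace_triad_from_trace_provider` now returns the new `AzureMLWorkspace` NamedTuple instead of an untyped value, so callers such as `_azure_pf_client_and_triad` can address the workspace triad by field name. A minimal sketch of consuming that shape follows; the `azureml://` URI is illustrative, and only its `/providers/Microsoft.MachineLearningServices/workspaces/` tail is taken from the regex shown in the hunk above.

```python
from typing import NamedTuple


class AzureMLWorkspace(NamedTuple):
    """Same field layout as the NamedTuple added to _utils.py."""

    subscription_id: str
    resource_group_name: str
    workspace_name: str


# Fields can now be addressed by name rather than by tuple position.
ws = AzureMLWorkspace("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")
trace_destination = (
    f"azureml://subscriptions/{ws.subscription_id}"
    f"/resourceGroups/{ws.resource_group_name}"
    f"/providers/Microsoft.MachineLearningServices/workspaces/{ws.workspace_name}"
)
print(trace_destination)
```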
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

```diff
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import os
 from typing import Optional
+
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -50,7 +51,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
    ):
        """Evaluate coherence. Accepts either a query and response for a single evaluation,
        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
@@ -65,6 +66,6 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
            to be dictionaries with keys "content" and "role".
        :paramtype conversation: Optional[Dict]
        :return: The relevance score.
-        :rtype:
+        :rtype: Dict[str, float]
        """
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
```
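The only caller-visible change here is documentation: `CoherenceEvaluator.__call__` is now annotated as returning `Dict[str, float]`. A hedged usage sketch follows; the constructor argument and the Azure OpenAI `model_config` keys are assumptions based on the package's documented configuration, not part of this diff.

```python
from azure.ai.evaluation import CoherenceEvaluator

# Assumed Azure OpenAI model configuration; the exact keys are not shown in this diff.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<api-key>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)  # per the updated docstring, a Dict[str, float]
```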
azure/ai/evaluation/_evaluators/_common/_base_eval.py

```diff
@@ -2,19 +2,55 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import List, Dict, Callable, Any
 import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
-from abc import ABC
-
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
 
-
+.. code-block:: python
+
+    foo: AggregateResult[float] = {
+        "evaluation_per_turn": {
+            "gpt_coherence": [1.0, 2.0, 3.0]
+        },
+        "gpt_coherence": 2.0
+    }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+.. code-block:: python
+
+    foo: DoEvalResult[float] = {
+        "gpt_coherence": 2.0
+    }
+"""
 
 
 # TODO exception target pass down?
-class EvaluatorBase(ABC):
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
    """Base class for all evaluators that are capable of accepting either a group of single values,
    or conversation as input. All such evaluators need to implement two functions of their own:
    - _convert_conversation_to_eval_input
@@ -51,7 +87,7 @@ class EvaluatorBase(ABC):
    # This needs to be overridden just to change the function header into something more informative,
    # and to be able to add a more specific docstring. The actual function contents should just be
    # super().__call__(<inputs>)
-    def __call__(self, **kwargs) ->
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
        The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -64,9 +100,8 @@ class EvaluatorBase(ABC):
        """
        return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
 
-
-
-    async def _do_eval(self, eval_input: Any) -> Dict:
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
        """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
        In the default case, all required inputs are assumed to be within eval_input, as user-friendly
        typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -76,12 +111,7 @@ class EvaluatorBase(ABC):
        :type eval_input: Any
        :return: A single evaluation result
        :rtype: Dict
-
        """
-        raise EvaluationException(
-            message="Not implemented",
-            internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
-        )
 
    # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
@@ -103,7 +133,7 @@ class EvaluatorBase(ABC):
            singletons.append(param)
        return singletons
 
-    def _derive_conversation_converter(self) -> Callable:
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
        """Produce the function that will be used to convert conversations to a list of evaluable inputs.
        This uses the inputs derived from the _derive_singleton_inputs function to determine which
        aspects of a conversation ought to be extracted.
@@ -115,12 +145,12 @@ class EvaluatorBase(ABC):
        include_query = "query" in self._singleton_inputs
        include_response = "response" in self._singleton_inputs
 
-        def converter(conversation: Dict) -> List:
-            messages = conversation["messages"]
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
            global_context = conversation.get("context", None)
            # Extract queries, responses from conversation
-            queries = []
-            responses = []
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
 
            # Convert conversation slice into queries and responses.
            # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -147,7 +177,7 @@ class EvaluatorBase(ABC):
                if response_context and not include_response:
                    context["response_context"] = response_context
 
-                eval_input = {}
+                eval_input: DerivedEvalInput = {}
                if include_query:
                    eval_input["query"] = query
                if include_response:
@@ -159,7 +189,7 @@ class EvaluatorBase(ABC):
 
        return converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
        """Convert an arbitrary input into a list of inputs for evaluators.
        It is assumed that evaluators generally make use of their inputs in one of two ways.
        Either they receive a collection of keyname inputs that are all single values
@@ -211,7 +241,7 @@ class EvaluatorBase(ABC):
                target=ErrorTarget.CONVERSATION,
            )
 
-    def _aggregate_results(self, per_turn_results: List[
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
        """Aggregate the evaluation results of each conversation turn into a single result.
 
        Exact implementation might need to vary slightly depending on the results produced.
@@ -227,8 +257,8 @@ class EvaluatorBase(ABC):
        :rtype: Dict
        """
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
 
        # Go over each turn, and rotate the results into a
        # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,13 +271,13 @@ class EvaluatorBase(ABC):
        # Find and average all numeric values
        for metric, values in evaluation_per_turn.items():
            if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
        # Slap the per-turn results back in.
        aggregated["evaluation_per_turn"] = evaluation_per_turn
 
        return aggregated
 
-    async def _real_call(self, **kwargs):
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """The asynchronous call where real end-to-end evaluation logic is performed.
 
        :keyword kwargs: The inputs to evaluate.
@@ -270,9 +300,8 @@ class EvaluatorBase(ABC):
        # Otherwise, aggregate results.
        return self._aggregate_results(per_turn_results=per_turn_results)
 
-
-
-    def _to_async(self):
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
        return self._async_evaluator
 
 
```
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
+
import math
|
|
5
6
|
import re
|
|
6
7
|
from typing import Dict
|
|
7
8
|
|
|
8
|
-
from typing_extensions import override
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
import numpy as np
|
|
12
|
-
|
|
13
9
|
from promptflow.core import AsyncPrompty
|
|
10
|
+
from typing_extensions import override
|
|
14
11
|
|
|
15
|
-
from ..._common.utils import construct_prompty_model_config
|
|
12
|
+
from ..._common.utils import construct_prompty_model_config, validate_model_config
|
|
13
|
+
from . import EvaluatorBase
|
|
16
14
|
|
|
17
15
|
try:
|
|
18
16
|
from ..._user_agent import USER_AGENT
|
|
19
17
|
except ImportError:
|
|
20
|
-
USER_AGENT = None
|
|
21
|
-
from . import EvaluatorBase
|
|
18
|
+
USER_AGENT = "None"
|
|
22
19
|
|
|
23
20
|
|
|
24
|
-
class PromptyEvaluatorBase(EvaluatorBase):
|
|
21
|
+
class PromptyEvaluatorBase(EvaluatorBase[float]):
|
|
25
22
|
"""Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
|
|
26
23
|
make use of a prompty file, and return their results as a dictionary, with a single key-value pair
|
|
27
24
|
linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
|
|
@@ -42,13 +39,13 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
42
39
|
LLM_CALL_TIMEOUT = 600
|
|
43
40
|
DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
|
|
44
41
|
|
|
45
|
-
def __init__(self, *, result_key: str, prompty_file: str, model_config:
|
|
42
|
+
def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
|
|
46
43
|
self._result_key = result_key
|
|
47
44
|
self._prompty_file = prompty_file
|
|
48
45
|
super().__init__(eval_last_turn=eval_last_turn)
|
|
49
46
|
|
|
50
47
|
prompty_model_config = construct_prompty_model_config(
|
|
51
|
-
model_config,
|
|
48
|
+
validate_model_config(model_config),
|
|
52
49
|
self.DEFAULT_OPEN_API_VERSION,
|
|
53
50
|
USER_AGENT,
|
|
54
51
|
)
|
|
@@ -59,7 +56,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
59
56
|
# defining a default here.
|
|
60
57
|
|
|
61
58
|
@override
|
|
62
|
-
async def _do_eval(self, eval_input: Dict) -> Dict:
|
|
59
|
+
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
|
|
63
60
|
"""Do a relevance evaluation.
|
|
64
61
|
|
|
65
62
|
:param eval_input: The input to the evaluator. Expected to contain
|
|
@@ -71,7 +68,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
71
68
|
"""
|
|
72
69
|
llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
|
|
73
70
|
|
|
74
|
-
score =
|
|
71
|
+
score = math.nan
|
|
75
72
|
if llm_output:
|
|
76
73
|
match = re.search(r"\d", llm_output)
|
|
77
74
|
if match:
|
|
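Two small behavioral notes in this file: the `USER_AGENT` fallback is now the string `"None"` rather than `None`, and the default score switches from `np.nan` to `math.nan`, dropping the numpy import. The digit-parsing fallback works like the standalone sketch below; the final `float(match.group())` conversion is an assumption about the lines beyond this hunk.

```python
import math
import re


def parse_first_digit(llm_output: str) -> float:
    """Mirror of the fallback pattern in PromptyEvaluatorBase._do_eval: return NaN
    when no digit can be extracted from the model output."""
    score = math.nan
    if llm_output:
        match = re.search(r"\d", llm_output)
        if match:
            score = float(match.group())
    return score


print(parse_first_digit("Score: 4 - the response is coherent"))  # 4.0
print(parse_first_digit(""))  # nan
```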
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

```diff
@@ -1,48 +1,46 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict, Optional, Union
 
-from typing import Dict, Optional
 from typing_extensions import override
 
-from azure.
-from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
 from . import EvaluatorBase
 
 
-class RaiServiceEvaluatorBase(EvaluatorBase):
+class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
    """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
    This includes content safety evaluators, protected material evaluators, and others. These evaluators
    are all assumed to be of the "query and response or conversation" input variety.
 
-    param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
-
-    type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
-    param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
        aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
-    type eval_last_turn: bool
+    :type eval_last_turn: bool
    """
 
    @override
    def __init__(
        self,
-        eval_metric: EvaluationMetrics,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
        azure_ai_project: dict,
-        credential:
+        credential: TokenCredential,
        eval_last_turn: bool = False,
    ):
        super().__init__(eval_last_turn=eval_last_turn)
        self._eval_metric = eval_metric
-        self._azure_ai_project = azure_ai_project
-
-            # Use DefaultCredential if no credential is provided
-            self._credential = DefaultAzureCredential()
-        else:
-            self._credential = credential
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
 
    @override
    def __call__(
@@ -51,7 +49,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
    ):
        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
        or a conversation, but not both.
@@ -65,12 +63,12 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
            to be dictionaries with keys "content", "role", and possibly "context".
        :paramtype conversation: Optional[Dict]
        :return: The evaluation result.
-        :rtype: Dict
+        :rtype: Dict[str, Union[str, float]]
        """
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
 
    @override
-    async def _do_eval(self, eval_input: Dict):
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
        """Perform the evaluation using the Azure AI RAI service.
        The exact evaluation performed is determined by the evaluation metric supplied
        by the child class initializer.
```