azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff shows the contents of publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_utils.py

```diff
@@ -6,15 +6,22 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-from typing import Dict
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union
 
 import pandas as pd
-
-from
+from promptflow.client import PFClient
+from promptflow.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject
 
 LOGGER = logging.getLogger(__name__)
 
@@ -23,14 +30,26 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
+
+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str
 
 
-
+class EvaluateResult(TypedDict):
+    metrics: Dict[str, float]
+    studio_url: Optional[str]
+    rows: List[Dict]
+
+
+def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -47,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return
+    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
@@ -55,7 +74,7 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination):
+def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     from promptflow.azure._cli._utils import _get_azure_pf_client
 
     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -69,14 +88,14 @@ def _azure_pf_client_and_triad(trace_destination):
 
 
 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-) -> str:
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Run,
+    evaluation_name: Optional[str],
+) -> Optional[str]:
     if trace_destination is None:
-        LOGGER.
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
 
     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -94,7 +113,6 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -112,7 +130,8 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-
+                    EvaluationRunProperties.RUN_TYPE: "eval_run",
+                    EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                     "isEvaluatorRun": "true",
                 }
@@ -138,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -151,9 +170,9 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str:
     return trace_destination
 
 
-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     p = Path(path)
-    if
+    if p.is_dir():
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -161,7 +180,7 @@ def _write_output(path, data_dict):
 
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
@@ -211,7 +230,7 @@ def _apply_column_mapping(
     return result_df
 
 
-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
     return hasattr(evaluator, "__aggregate__")
 
 
@@ -234,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
         return default_value
 
 
-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
```
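The changes above are mostly about typing: `extract_workspace_triad_from_trace_provider` now returns the new `AzureMLWorkspace` named tuple instead of the old truncated bare `return`, and the logging and output helpers gain explicit annotations. Below is a standalone sketch of the new return shape, not the library's own code: the leading portion of the regex is an assumption (only its tail appears as hunk context), and the shortened function name and `ValueError` are illustrative.

```python
import re
from typing import NamedTuple

# Assumed full pattern; only the final "/workspaces/..." segment is visible in the hunk above.
AZURE_WORKSPACE_REGEX_FORMAT = (
    "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
)


class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


def extract_workspace_triad(trace_provider: str) -> AzureMLWorkspace:
    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
    if not match or len(match.groups()) != 5:
        raise ValueError(f"Malformed trace provider string: {trace_provider!r}")
    # Groups 2 and 4 are the case-variant "groups"/"Groups" and the optional provider segment.
    return AzureMLWorkspace(match.group(1), match.group(3), match.group(5))


triad = extract_workspace_triad(
    "azureml://subscriptions/sub-id/resourceGroups/my-rg/workspaces/my-ws"
)
print(triad.workspace_name)  # "my-ws"
```

Returning a `NamedTuple` keeps existing tuple unpacking working while letting new callers use attribute access.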
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

```diff
@@ -1,77 +1,15 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncCoherenceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "coherence.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
+from typing import Optional
 
-
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-
-        if not (query.strip() and response.strip()):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.INVALID_VALUE,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.COHERENCE_EVALUATOR,
-            )
+from typing_extensions import override
 
-
-        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
 
-
-
-
-class CoherenceEvaluator:
+class CoherenceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a coherence evaluator configured for a specific Azure OpenAI model.
 
@@ -97,21 +35,37 @@ class CoherenceEvaluator:
     }
     """
 
-
-
+    PROMPTY_FILE = "coherence.prompty"
+    RESULT_KEY = "gpt_coherence"
 
-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate coherence. Accepts either a query and response for a single evaluation,
+        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+        turns, the evaluator will aggregate the results of each turn.
 
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
        :rtype: Dict[str, float]
        """
-        return
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
```
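With the refactor, `CoherenceEvaluator` delegates prompty loading and scoring to `PromptyEvaluatorBase`, and its `__call__` accepts either a single query/response pair or a whole conversation. The sketch below is a hedged usage example, not documentation: the result keys follow `RESULT_KEY` above and the base-class aggregation shown later in this diff, the numeric scores are made up, and the `model_config` layout is an assumption based on the fields removed from `coherence.prompty`.

```python
from azure.ai.evaluation import CoherenceEvaluator

# Assumed shape; mirrors the azure_openai settings that used to live in coherence.prompty.
model_config = {
    "azure_endpoint": "https://my-resource.openai.azure.com",
    "api_key": "my-api-key",
    "azure_deployment": "my-deployment",
}

coherence = CoherenceEvaluator(model_config)

# Single evaluation: pass query and response directly.
single = coherence(query="What is the capital of France?", response="Paris.")
# e.g. {"gpt_coherence": 5.0}

# Multi-turn: pass a conversation instead; numeric scores are averaged across turns.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
        {"role": "user", "content": "And of Italy?"},
        {"role": "assistant", "content": "Rome."},
    ]
}
multi = coherence(conversation=conversation)
# e.g. {"gpt_coherence": 4.5, "evaluation_per_turn": {"gpt_coherence": [5.0, 4.0]}}
```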
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

```diff
@@ -3,11 +3,6 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
```
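Dropping the `configuration:` block means the prompty no longer pulls connection settings from environment variables; they are supplied in code when the prompty is loaded. The sketch below mirrors the pattern visible in the removed `_AsyncCoherenceEvaluator` code earlier in this diff; whether `PromptyEvaluatorBase` (whose source, `_base_prompty_eval.py`, is not shown in this excerpt) does exactly this is an assumption.

```python
from promptflow.core import AsyncPrompty

# Same assumed model_config shape as in the earlier sketch.
model_config = {
    "azure_endpoint": "https://my-resource.openai.azure.com",
    "api_key": "my-api-key",
    "azure_deployment": "my-deployment",
}

prompty_model_config = {
    "configuration": model_config,  # replaces the removed env-var driven block
    "parameters": {"extra_headers": {"Connection": "close"}},
}

# Path is illustrative; the evaluator resolves it relative to its own directory.
flow = AsyncPrompty.load(source="coherence.prompty", model=prompty_model_config)
```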
azure/ai/evaluation/_evaluators/_common/__init__.py (new file)

```diff
@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]
```
azure/ai/evaluation/_evaluators/_common/_base_eval.py (new file)

```diff
@@ -0,0 +1,331 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+.. code-block:: python
+
+    foo: AggregateResult[float] = {
+        "evaluation_per_turn": {
+            "gpt_coherence": [1.0, 2.0, 3.0]
+        },
+        "gpt_coherence": 2.0
+    }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+.. code-block:: python
+
+    foo: DoEvalResult[float] = {
+        "gpt_coherence": 2.0
+    }
+"""
+
+
+# TODO exception target pass down?
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
+    """Base class for all evaluators that are capable of accepting either a group of single values,
+    or conversation as input. All such evaluators need to implement two functions of their own:
+    - _convert_conversation_to_eval_input
+    - _do_eval
+
+    Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
+    documentation, although ideally the actual child implementation of __call__ should just amount to
+    'super().__init__()'.
+
+
+    :param not_singleton_inputs: A list of strings that represent the names of
+        inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
+        is ["conversation", "kwargs"].
+    :type not_singleton_inputs: List[str]
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
+    :type eval_last_turn: bool
+    """
+
+    # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    # Make sure to call super().__init__() in the child class's __init__ method.
+    # pylint: disable=dangerous-default-value
+    def __init__(
+        self,
+        *,
+        not_singleton_inputs: List[str] = ["conversation", "kwargs"],
+        eval_last_turn: bool = False,
+    ):
+        self._not_singleton_inputs = not_singleton_inputs
+        self._eval_last_turn = eval_last_turn
+        self._singleton_inputs = self._derive_singleton_inputs()
+        self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+
+    # This needs to be overridden just to change the function header into something more informative,
+    # and to be able to add a more specific docstring. The actual function contents should just be
+    # super().__call__(<inputs>)
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
+        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
+        The actual behavior of this function shouldn't change beyond adding more inputs to the
+        async_run_allowing_running_loop call.
+
+        :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
+        :type kwargs: Dict
+        :return: The evaluation result
+        :rtype: Dict
+        """
+        return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
+
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
+        """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
+        In the default case, all required inputs are assumed to be within eval_input, as user-friendly
+        typing is handled above this function in favor of polymorphic simplicity. This function must be
+        asynchronous.
+
+        :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
+        :type eval_input: Any
+        :return: A single evaluation result
+        :rtype: Dict
+        """
+
+    # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    def _derive_singleton_inputs(self) -> List[str]:
+        """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
+        when the evaluator is being used in a non-conversation context.
+        By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
+        Thankfully this works the way you'd hope, with the call_signature being based on the child
+        function's signature, not the parent's.
+
+        :return: A list of strings representing the names of singleton inputs.
+        :rtype: List[str]
+        """
+
+        call_signature = inspect.signature(self.__call__)
+        singletons = []
+        for param in call_signature.parameters:
+            if param not in self._not_singleton_inputs:
+                singletons.append(param)
+        return singletons
+
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+        """Produce the function that will be used to convert conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+        include_context = "context" in self._singleton_inputs
+        include_query = "query" in self._singleton_inputs
+        include_response = "response" in self._singleton_inputs
+
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            global_context = conversation.get("context", None)
+            # Extract queries, responses from conversation
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    queries.append(each_turn)
+                elif role == "assistant":
+                    responses.append(each_turn)
+            # TODO complain if len(queries) != len(responses)?
+            eval_inputs = []
+            for query, response in zip(queries, responses):
+                context = {}
+                if include_context:
+                    query_context = query.get("context", None)
+                    response_context = response.get("context", None)
+                    if global_context:
+                        context["global_context"] = global_context
+                    if query_context and not include_query:
+                        context["query_context"] = query_context
+                    if response_context and not include_response:
+                        context["response_context"] = response_context
+
+                eval_input: DerivedEvalInput = {}
+                if include_query:
+                    eval_input["query"] = query
+                if include_response:
+                    eval_input["response"] = response
+                if include_context:
+                    eval_input["context"] = str(context)
+                eval_inputs.append(eval_input)
+            return eval_inputs
+
+        return converter
+
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+        """Convert an arbitrary input into a list of inputs for evaluators.
+        It is assumed that evaluators generally make use of their inputs in one of two ways.
+        Either they receive a collection of keyname inputs that are all single values
+        (like a query and response), or they receive conversation that iss a list of dictionary
+        values.
+
+        The self._singleton_inputs list assigned during initialization is used to find and extract
+        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        is a valid input.
+
+        If both conversations and singletons are allowed, the function will raise an exception if both
+        are inputted.
+
+        This function must be overridden by child classes IF they need to both a conversation and
+        other inputs to be passed in.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+        :rtype: List
+        """
+
+        # Collect inputs
+        conversation = kwargs.get("conversation", None)
+        singletons = {}
+        if len(self._singleton_inputs) > 0:
+            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+        # Check that both conversation and other inputs aren't set
+        if conversation is not None and any(singletons.values()):
+            raise EvaluationException(
+                message="Invalid input",
+                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Handle Conversation
+        if conversation is not None:
+            return self._derive_conversation_converter()(conversation)
+        # Handle Singletons
+        if all(value is not None for value in singletons.values()):
+            return [singletons]  # TODO loosen requirements to allow for optional singletons?
+        # Missing input
+        raise EvaluationException(
+            message="Missing input",
+            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
+        """Aggregate the evaluation results of each conversation turn into a single result.
+
+        Exact implementation might need to vary slightly depending on the results produced.
+        Default behavior is to average the all number-based outputs.
+
+        :param per_turn_results: List of evaluation results for each turn in the conversation.
+        :type per_turn_results: List[Dict]
+        :return: A dictionary containing aggregated results, with numeric metrics having their
+            means as top-level values in the dictionary, and all original
+            values (including non-numerics) located in under the "evaluation_per_turn" key,
+            which each sub-key being a metric and each sub-value being a the list of that metric's
+            per-turn values.
+        :rtype: Dict
+        """
+
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+        # Go over each turn, and rotate the results into a
+        # metric: List[values] format for the evals_per_turn dictionary.
+        for turn in per_turn_results:
+            for metric, value in turn.items():
+                if metric not in evaluation_per_turn:
+                    evaluation_per_turn[metric] = []
+                evaluation_per_turn[metric].append(value)
+
+        # Find and average all numeric values
+        for metric, values in evaluation_per_turn.items():
+            if all(isinstance(value, (int, float)) for value in values):
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+        # Slap the per-turn results back in.
+        aggregated["evaluation_per_turn"] = evaluation_per_turn
+
+        return aggregated
+
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        per_turn_results = []
+        # Evaluate all inputs.
+        for eval_input in eval_input_list:
+            per_turn_results.append(await self._do_eval(eval_input))
+        # Return results as-is if only one result was produced.
+
+        if len(per_turn_results) == 1:
+            return per_turn_results[0]
+        if len(per_turn_results) == 0:
+            return {}  # TODO raise something?
+        # Otherwise, aggregate results.
+        return self._aggregate_results(per_turn_results=per_turn_results)
+
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
+        return self._async_evaluator
+
+
+class AsyncEvaluatorBase:
+    """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
+    to ensure that no one ever needs to extend or otherwise modify this class directly.
+    """
+
+    def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
+        self._real_call = real_call
+
+    # Don't look at my shame. Nothing to see here....
+    # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+    # are just not passed into this function instead of ending up in kwargs.
+    # Since we want this to be relatively call-agnostic, we just account for every input that any children
+    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
+    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+        if conversation is not None:
+            kwargs["conversation"] = conversation
+        if query is not None:
+            kwargs["query"] = query
+        if response is not None:
+            kwargs["response"] = response
+        if context is not None:
+            kwargs["context"] = context
+        return await self._real_call(**kwargs)
```
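Taken together, `EvaluatorBase` lets a concrete evaluator implement only `_do_eval` plus a thin `__call__`, while the base class handles singleton-versus-conversation routing and per-turn aggregation. The sketch below follows the base class's own docstrings; `ExactMatchEvaluator` is illustrative and not part of the package, and it imports from a private module path that may change between betas.

```python
from typing import Dict, Optional

from azure.ai.evaluation._evaluators._common import EvaluatorBase


class ExactMatchEvaluator(EvaluatorBase[float]):
    """Toy evaluator: scores 1.0 when the response exactly matches the query."""

    def __init__(self):
        # Singleton inputs ("query", "response") are derived from __call__'s signature.
        super().__init__()

    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        query = eval_input["query"]
        response = eval_input["response"]
        # Conversation mode hands us raw turn dicts; singleton mode hands us plain strings.
        if isinstance(query, dict):
            query = query.get("content", "")
        if isinstance(response, dict):
            response = response.get("content", "")
        return {"exact_match": 1.0 if query.strip() == response.strip() else 0.0}

    def __call__(
        self,
        *,
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
        **kwargs,
    ):
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)


evaluator = ExactMatchEvaluator()
print(evaluator(query="ping", response="ping"))
# -> {'exact_match': 1.0}
print(evaluator(conversation={"messages": [
    {"role": "user", "content": "ping"},
    {"role": "assistant", "content": "pong"},
]}))
# -> {'exact_match': 0.0} (a single turn, so no aggregation is applied)
```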