azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +73 -2
- azure/ai/evaluation/_common/rai_service.py +250 -62
- azure/ai/evaluation/_common/utils.py +196 -23
- azure/ai/evaluation/_constants.py +7 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
- azure/ai/evaluation/_evaluate/_utils.py +46 -11
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
- azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
- azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
- azure/ai/evaluation/_exceptions.py +19 -0
- azure/ai/evaluation/_model_configurations.py +83 -15
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +165 -105
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
```diff
@@ -16,10 +16,10 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

 from ..._user_agent import USER_AGENT
-from .._utils import
+from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

@@ -97,17 +97,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluationResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

@@ -123,7 +123,8 @@ def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, Eval
             user_agent=USER_AGENT,
         )

-
+        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
         custom_dimensions: Dict[str, Union[str, bool]] = {
```
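The hunks above (which, judging by the file list and the `USER_AGENT`/`log_evaluate_activity` references, appear to come from `azure/ai/evaluation/_evaluate/_telemetry/__init__.py`) retype the telemetry decorator against the public `EvaluationResult` type while keeping `ParamSpec`-based signature preservation. A minimal, self-contained sketch of that decorator-typing pattern, using illustrative names rather than the SDK's:

```python
import functools
from typing import Callable, Dict, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")


def log_activity_stub(func: Callable[P, R]) -> Callable[P, R]:
    """Toy stand-in for log_evaluate_activity: the wrapper keeps the wrapped signature."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        # The real decorator inspects kwargs such as "target" and "evaluator_config" and
        # emits telemetry; this sketch only shows the ParamSpec plumbing.
        print(f"calling {func.__name__} with target={bool(kwargs.get('target'))}")
        return func(*args, **kwargs)

    return wrapper


@log_activity_stub
def evaluate(*, data: str, target: object = None) -> Dict[str, float]:
    return {"row_count": 0.0}


print(evaluate(data="results.jsonl"))  # type checkers still see evaluate's keyword-only parameters
```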
```diff
@@ -7,7 +7,9 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict,
+from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+import uuid
+import base64

 import pandas as pd
 from promptflow.client import PFClient
@@ -19,7 +21,6 @@ from azure.ai.evaluation._constants import (
     EvaluationRunProperties,
     Prefixes,
 )
-from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject

@@ -37,12 +38,6 @@ class AzureMLWorkspace(NamedTuple):
     workspace_name: str


-class EvaluateResult(TypedDict):
-    metrics: Dict[str, float]
-    studio_url: Optional[str]
-    rows: List[Dict]
-
-
 def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"

@@ -87,6 +82,34 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
     return azure_pf_client, ws_triad


+def _store_multimodal_content(messages, tmpdir: str):
+    # verify if images folder exists
+    images_folder_path = os.path.join(tmpdir, "images")
+    os.makedirs(images_folder_path, exist_ok=True)
+
+    # traverse all messages and replace base64 image data with new file name.
+    for message in messages:
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                if content.get("type") == "image_url":
+                    image_url = content.get("image_url")
+                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
+                        # Extract the base64 string
+                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+
+                        # Generate a unique filename
+                        image_file_name = f"{str(uuid.uuid4())}.jpg"
+                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+                        # Decode the base64 string to binary image data
+                        image_data_binary = base64.b64decode(base64image)
+
+                        # Write the binary image data to the file
+                        image_file_path = os.path.join(images_folder_path, image_file_name)
+                        with open(image_file_path, "wb") as f:
+                            f.write(image_data_binary)
+
+
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
@@ -94,6 +117,8 @@ def _log_metrics_and_instance_results(
     run: Run,
     evaluation_name: Optional[str],
 ) -> Optional[str]:
+    from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
     if trace_destination is None:
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
@@ -113,9 +138,18 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-        artifact_name = EvalRun.EVALUATION_ARTIFACT
+        artifact_name = EvalRun.EVALUATION_ARTIFACT

         with tempfile.TemporaryDirectory() as tmpdir:
+            # storing multi_modal images if exists
+            col_name = "inputs.conversation"
+            if col_name in instance_results.columns:
+                for item in instance_results[col_name].items():
+                    value = item[1]
+                    if "messages" in value:
+                        _store_multimodal_content(value["messages"], tmpdir)
+
+            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)

             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -131,9 +165,8 @@ def _log_metrics_and_instance_results(
         ev_run.write_properties_to_run_history(
             properties={
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
-                EvaluationRunProperties.EVALUATION_RUN: "
+                EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-                "isEvaluatorRun": "true",
             }
         )

@@ -178,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)

+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+

 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
```
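These hunks appear to belong to `azure/ai/evaluation/_evaluate/_utils.py`. The new `_store_multimodal_content` helper assumes conversation messages in an OpenAI-style chat format where image parts carry inline `data:image/jpg;base64,` URLs; the hypothetical payload below illustrates what gets rewritten (the UUID filename is generated at runtime):

```python
# Hypothetical conversation payload of the shape _store_multimodal_content walks:
# a list of messages whose "content" may be a list of typed parts.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this picture?"},
            {
                "type": "image_url",
                # Only URLs starting with "data:image/jpg;base64," are rewritten.
                "image_url": {"url": "data:image/jpg;base64,/9j/4AAQSkZJRg=="},
            },
        ],
    },
    {"role": "assistant", "content": "A cat sitting on a chair."},
]

# After _store_multimodal_content(messages, tmpdir) runs, the binary payload is written to
# <tmpdir>/images/<uuid>.jpg and the message is mutated in place to reference it, e.g.:
#   {"type": "image_url", "image_url": {"url": "images/3f2b8a4e-....jpg"}}
```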
```diff
@@ -26,31 +26,30 @@ class _AsyncBleuScoreEvaluator:

 class BleuScoreEvaluator:
     """
-
+    Calculate the BLEU score for a given response and ground truth.

     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
-    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
-    better quality.
+    translation. It is widely used in text summarization and text generation use cases.

-
+    Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
+    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
+    indicator of quality.

-
+    The BLEU score ranges from 0 to 1, with higher scores indicating better quality.

-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:

-
-
-
-
-
-
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator.
     """

+    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()

@@ -63,7 +62,7 @@ class BleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
-        :rtype:
+        :rtype: Dict[str, float]
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
```
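A brief usage sketch for the evaluator above, mirroring the example the old docstring carried; the exact output key name (`bleu_score`) is an assumption here, not something shown in the diff:

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Expected shape (key name assumed): {"bleu_score": <float in [0, 1]>}
print(result)
```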
```diff
@@ -2,70 +2,106 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import
+from typing import Dict, Union, List

-from typing_extensions import override
+from typing_extensions import overload, override

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation


-class CoherenceEvaluator(PromptyEvaluatorBase):
+class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-
+    Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+    The coherence measure assesses the ability of the language model to generate text that reads naturally,
+    flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+    and user-friendliness of a model's generated responses in real-world applications.

     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]

-
-
-    .. code-block:: python
-
-        eval_fn = CoherenceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.")
+    .. admonition:: Example:

-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CoherenceEvaluator with a query and response.

-    ..
+    .. note::

-
-
-
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-
-
+    _PROMPTY_FILE = "coherence.prompty"
+    _RESULT_KEY = "coherence"
+
+    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config
+    def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate coherence for given input of query, response
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The coherence score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate coherence for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The coherence score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate coherence. Accepts either a query and response for a single evaluation,
         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
         turns, the evaluator will aggregate the results of each turn.

+        :keyword query: The query to be evaluated.
+        :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: Optional[str]
-        :keyword context: The context to be evaluated.
-        :paramtype context: Optional[str]
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
-        :paramtype conversation: Optional[
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The relevance score.
-        :rtype: Dict[str, float]
+        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
```
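A usage sketch covering both overloads added above; the model configuration values and the conversation payload are placeholders, and per the docstring note the result is expected to expose both a `coherence` key and a legacy `gpt_`-prefixed key:

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

# Placeholder model configuration values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<deployment-name>",
    api_key="<api-key>",
)

coherence = CoherenceEvaluator(model_config)

# First overload: a single query/response pair.
single = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)

# Second overload: a multi-turn conversation; per-turn scores are aggregated.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
aggregate = coherence(conversation=conversation)
```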
```diff
@@ -5,7 +5,7 @@ model:
   api: chat
   parameters:
     temperature: 0.0
-    max_tokens:
+    max_tokens: 800
     top_p: 1.0
     presence_penalty: 0
     frequency_penalty: 0
@@ -20,38 +20,80 @@ inputs:

 ---
 system:
-
+# Instruction
+## Goal
+### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+- **Data**: Your input data include a QUERY and a RESPONSE.
+- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.

 user:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-question
-
-
-
-
-
-
+# Definition
+**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas.
+
+# Ratings
+## [Coherence: 1] (Incoherent Response)
+**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible.
+
+**Examples:**
+**Query:** What are the benefits of renewable energy?
+**Response:** Wind sun green jump apple silence over.
+
+**Query:** Explain the process of photosynthesis.
+**Response:** Plants light water flying blue music.
+
+## [Coherence: 2] (Poorly Coherent Response)
+**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand.
+
+**Examples:**
+**Query:** How does vaccination work?
+**Response:** Vaccines protect disease. Immune system fight. Health better.
+
+**Query:** Describe how a bill becomes a law.
+**Response:** Idea proposed. Congress discuss vote. President signs.
+
+## [Coherence: 3] (Partially Coherent Response)
+**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order.
+
+**Examples:**
+**Query:** What causes earthquakes?
+**Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage.
+
+**Query:** Explain the importance of the water cycle.
+**Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water.
+
+## [Coherence: 4] (Coherent Response)
+**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow.
+
+**Examples:**
+**Query:** What is the water cycle and how does it work?
+**Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally.
+
+**Query:** Describe the role of mitochondria in cellular function.
+**Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival.
+
+## [Coherence: 5] (Highly Coherent Response)
+**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision.
+
+**Examples:**
+**Query:** Analyze the economic impacts of climate change on coastal cities.
+**Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth.
+
+**Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy.
+**Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs.
+
+
+# Data
+QUERY: {{query}}
+RESPONSE: {{response}}
+
+
+# Tasks
+## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input Data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
+# Output
```
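The rewritten prompt instructs the model to return its chain of thought, explanation, and score between `<S0>`, `<S1>`, and `<S2>` tags. The SDK's own post-processing is not shown in this diff; a hypothetical parser for that output contract could look like:

```python
import re
from typing import Dict, Optional


def parse_rubric_output(llm_output: str) -> Dict[str, Optional[str]]:
    """Pull the tagged sections out of a completion that follows the prompt's output contract."""
    sections: Dict[str, Optional[str]] = {}
    for tag, name in (("S0", "thought_chain"), ("S1", "explanation"), ("S2", "score")):
        match = re.search(rf"<{tag}>(.*?)</{tag}>", llm_output, re.DOTALL)
        sections[name] = match.group(1).strip() if match else None
    return sections


sample = "<S0>Let's think step by step: ...</S0><S1>Clear and logical.</S1><S2>5</S2>"
print(parse_rubric_output(sample))
# {'thought_chain': "Let's think step by step: ...", 'explanation': 'Clear and logical.', 'score': '5'}
```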
```diff
@@ -7,10 +7,11 @@ from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final

 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from typing_extensions import ParamSpec, TypeAlias
+from typing_extensions import ParamSpec, TypeAlias, get_overloads

 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import remove_optional_singletons

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -32,9 +33,9 @@ AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]

     foo: AggregateResult[float] = {
         "evaluation_per_turn": {
-            "
+            "coherence": [1.0, 2.0, 3.0]
         },
-        "
+        "coherence": 2.0
     }
 """

@@ -44,7 +45,7 @@ DoEvalResult: TypeAlias = Dict[str, T]
     .. code-block:: python

     foo: DoEvalResult[float] = {
-        "
+        "coherence": 2.0
     }
 """

@@ -87,7 +88,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
     # super().__call__(<inputs>)
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
         The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -96,7 +101,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
         :type kwargs: Dict
         :return: The evaluation result
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)

@@ -110,7 +115,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
         :type eval_input: Any
         :return: A single evaluation result
-        :rtype:
+        :rtype: DoEvalResult[T_EvalValue]
         """

     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -126,11 +131,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: List[str]
         """

+        overloads = get_overloads(self.__call__)
+        if not overloads:
+            call_signatures = [inspect.signature(self.__call__)]
+        else:
+            call_signatures = [inspect.signature(overload) for overload in overloads]
         call_signature = inspect.signature(self.__call__)
         singletons = []
-        for
-
-
+        for call_signature in call_signatures:
+            params = call_signature.parameters
+            if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                continue
+            # exclude self since it is not a singleton input
+            singletons.extend([p for p in params if p != "self"])
         return singletons

     def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
```
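These `_base_eval.py` changes make `_derive_singleton_inputs` read parameter names from the registered `__call__` overloads (via `typing_extensions.get_overloads`) instead of the catch-all `*args, **kwargs` implementation. A standalone sketch of that mechanism using a toy class:

```python
import inspect

from typing_extensions import get_overloads, overload


class ToyEvaluator:
    @overload
    def __call__(self, *, query: str, response: str) -> dict: ...

    @overload
    def __call__(self, *, conversation: dict) -> dict: ...

    def __call__(self, *args, **kwargs):
        # The implementation only sees *args/**kwargs, so its signature is uninformative.
        return {}


# get_overloads returns the two typed overloads registered above. The real evaluator code
# additionally skips overloads containing non-singleton inputs such as "conversation".
for fn in get_overloads(ToyEvaluator.__call__):
    params = [name for name in inspect.signature(fn).parameters if name != "self"]
    print(params)
# ['query', 'response']
# ['conversation']
```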
```diff
@@ -172,16 +185,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             response_context = response.get("context", None)
             if global_context:
                 context["global_context"] = global_context
-            if query_context and
+            if query_context and include_query:
                 context["query_context"] = query_context
-            if response_context and
+            if response_context and include_response:
                 context["response_context"] = response_context

             eval_input: DerivedEvalInput = {}
             if include_query:
-                eval_input["query"] = query
+                eval_input["query"] = query.get("content", "")
             if include_response:
-                eval_input["response"] = response
+                eval_input["response"] = response.get("content", "")
             if include_context:
                 eval_input["context"] = str(context)
             eval_inputs.append(eval_input)
@@ -219,9 +232,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
             raise EvaluationException(
-                message=
-                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
@@ -230,12 +243,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         if conversation is not None:
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
-
-
+        required_singletons = remove_optional_singletons(self, singletons)
+        if all(value is not None for value in required_singletons.values()):
+            return [singletons]
         # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
-            message=
-            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            message=msg,
             blame=ErrorBlame.USER_ERROR,
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
@@ -254,7 +268,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             values (including non-numerics) located in under the "evaluation_per_turn" key,
             which each sub-key being a metric and each sub-value being a the list of that metric's
             per-turn values.
-        :rtype:
+        :rtype: AggregateResult[T_EvalValue]
         """

         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
@@ -274,7 +288,6 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
-
         return aggregated

     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
@@ -283,7 +296,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :keyword kwargs: The inputs to evaluate.
         :type kwargs: Dict
         :return: The evaluation result.
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
@@ -315,7 +328,7 @@ class AsyncEvaluatorBase:

     # Don't look at my shame. Nothing to see here....
     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
```