azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
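Almost everything in the list above lives in underscore-prefixed (private) modules, and several of those modules are renamed, split, or removed between the two versions (for example `_batch_run` in 1.0.0 versus `_batch_run_client` in 1.0.0b1, and the `_evaluators/_common` base classes that exist only in 1.0.0). A minimal sketch of the import style that stays stable across both versions, assuming these evaluator names are re-exported from the top-level `azure.ai.evaluation` namespace as the docstrings in the hunks below suggest:

    # A minimal sketch, not an exhaustive list of exports: import evaluators from
    # the public top-level namespace rather than from the private, underscore-
    # prefixed module paths shown in the file list, since those paths move
    # between 1.0.0b1 and 1.0.0.
    from azure.ai.evaluation import (
        RelevanceEvaluator,
        RougeScoreEvaluator,
        RougeType,
        SimilarityEvaluator,
        evaluate,
    )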
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -3,112 +3,129 @@
 # ---------------------------------------------------------
 
 import os
-
+import re
+from typing import Union
 
-
+import numpy as np
 
-from
-from azure.ai.evaluation.
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from promptflow.core import AsyncPrompty
 
+from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ..._common.utils import (
+    check_and_add_api_version_for_aoai_model_config,
+    check_and_add_user_agent_for_aoai_model_config,
+)
 
-
-
-
+try:
+    from ..._user_agent import USER_AGENT
+except ImportError:
+    USER_AGENT = None
+
+
+class _AsyncRelevanceEvaluator:
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    LLM_CALL_TIMEOUT = 600
+    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+    def __init__(self, model_config: dict):
+        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
 
-
-    High relevance scores signify the AI system's understanding of the input and its capability to produce coherent
-    and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might
-    be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Use the relevance
-    metric when evaluating the AI system's performance in understanding the input and generating contextually
-    appropriate responses.
+        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
 
-
+        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+        # https://github.com/encode/httpx/discussions/2959
+        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+        check_and_add_user_agent_for_aoai_model_config(
+            model_config,
+            prompty_model_config,
+            USER_AGENT,
+        )
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
+        # Validate input parameters
+        query = str(query or "")
+        response = str(response or "")
+        context = str(context or "")
+
+        if not (query.strip() and response.strip() and context.strip()):
+            msg = "'query', 'response' and 'context' must be non-empty strings."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                error_category=ErrorCategory.MISSING_FIELD,
+                error_blame=ErrorBlame.USER_ERROR,
+                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
+            )
+
+        # Run the evaluation flow
+        llm_output = await self._flow(
+            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+        )
+
+        score = np.nan
+        if llm_output:
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+
+        return {"gpt_relevance": float(score)}
+
+
+class RelevanceEvaluator:
+    """
+    Initialize a relevance evaluator configured for a specific Azure OpenAI model.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-
+    **Usage**
 
-
-        :start-after: [START relevance_evaluator]
-        :end-before: [END relevance_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
+    .. code-block:: python
 
-
+        eval_fn = RelevanceEvaluator(model_config)
+        result = eval_fn(
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.",
+            context="Tokyo is Japan's capital, known for its blend of traditional culture \
+                and technological advancements.")
 
-
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-    """
+    **Output format**
 
-
-    _PROMPTY_FILE = "relevance.prompty"
-    _RESULT_KEY = "relevance"
+    .. code-block:: python
 
-
-
+        {
+            "gpt_relevance": 3.0
+        }
+    """
 
-
-
-
-
-
-
-    @overload
-    def __call__(
-        self,
-        *,
-        query: str,
-        response: str,
-    ) -> Dict[str, Union[str, float]]:
-        """Evaluate groundedness for given input of query, response, context
+    def __init__(self, model_config: dict):
+        self._async_evaluator = _AsyncRelevanceEvaluator(model_config)
+
+    def __call__(self, *, query: str, response: str, context: str, **kwargs):
+        """
+        Evaluate relevance.
 
         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
         :return: The relevance score.
-        :rtype:
+        :rtype: dict
         """
+        return async_run_allowing_running_loop(
+            self._async_evaluator, query=query, response=response, context=context, **kwargs
+        )
 
-
-
-        self,
-        *,
-        conversation: Conversation,
-    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
-        """Evaluate relevance for a conversation
-
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The relevance score.
-        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
-        """
-
-    @override
-    def __call__(  # pylint: disable=docstring-missing-param
-        self,
-        *args,
-        **kwargs,
-    ):
-        """Evaluate relevance. Accepts either a query and response for a single evaluation,
-        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
-        the evaluator will aggregate the results of each turn.
-
-        :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
-        :paramtype query: Optional[str]
-        :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
-        :paramtype response: Optional[str]
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The relevance score.
-        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
-        """
-        return super().__call__(*args, **kwargs)
+    def _to_async(self):
+        return self._async_evaluator
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -3,9 +3,14 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
+  configuration:
+    type: azure_openai
+    azure_deployment: ${env:AZURE_DEPLOYMENT}
+    api_key: ${env:AZURE_OPENAI_API_KEY}
+    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
-    max_tokens:
+    max_tokens: 1
     top_p: 1.0
     presence_penalty: 0
     frequency_penalty: 0

@@ -17,84 +22,48 @@ inputs:
     type: string
   response:
     type: string
+  context:
+    type: string
 
 ---
 system:
-
-## Goal
-### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
-- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
-- **Data**: Your input data include QUERY and RESPONSE.
-- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
-
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
 user:
-[… removed lines 31-68 of the 1.0.0 prompt; their content was not captured in the diff view …]
-**Query:** What type of food does the new restaurant offer?
-**Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.
-
-**Query:** What topics will the conference cover?
-**Response:** The conference will cover renewable energy, climate change, and sustainability practices.
-
-## [Relevance: 5] (Comprehensive Response with Insights)
-**Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding.
-
-**Examples:**
-**Query:** What type of food does the new restaurant offer?
-**Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.
-
-**Query:** What topics will the conference cover?
-**Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues.
-
-
-
-# Data
-QUERY: {{query}}
-RESPONSE: {{response}}
-
-
-# Tasks
-## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
-- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
-- **Explanation**: a very short explanation of why you think the input Data should get that Score.
-- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
-
-
-## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
-# Output
+Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
+One star: the answer completely lacks relevance
+Two stars: the answer mostly lacks relevance
+Three stars: the answer is partially relevant
+Four stars: the answer is mostly relevant
+Five stars: the answer has perfect relevance
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
+question: What field did Marie Curie excel in?
+answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
+stars: 1
+
+context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
+question: Where were The Beatles formed?
+answer: The band The Beatles began their journey in London, England, and they changed the history of music.
+stars: 2
+
+context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
+question: What are the main goals of Perseverance Mars rover mission?
+answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
+stars: 3
+
+context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
+question: What are the main components of the Mediterranean diet?
+answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
+stars: 4
+
+context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
+question: What are the main attractions of the Queen's Royal Castle?
+answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
+stars: 5
+
+context: {{context}}
+question: {{query}}
+answer: {{response}}
+stars:
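The added `configuration:` block points the 1.0.0b1 prompty at environment variables, and `max_tokens: 1` matches the new system prompt, which asks the model for a single integer score. When the evaluator class is used it injects its own `model_config`, so the `${env:...}` entries mainly matter if the prompty is loaded on its own; a minimal sketch of supplying them, with placeholder values:

    import os

    # Placeholders for the environment variables referenced by the 1.0.0b1
    # relevance.prompty (and the matching similarity.prompty further below).
    os.environ.setdefault("AZURE_DEPLOYMENT", "<your-deployment>")
    os.environ.setdefault("AZURE_OPENAI_API_KEY", "<your-api-key>")
    os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://<your-resource>.openai.azure.com")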
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,12 +3,12 @@
 # ---------------------------------------------------------
 from enum import Enum
 
-from
+from rouge_score import rouge_scorer
 
-from
+from promptflow._utils.async_utils import async_run_allowing_running_loop
 
 
-class RougeType(Enum):
+class RougeType(str, Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """

@@ -37,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
         self._rouge_type = rouge_type
 
     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type
-        metrics = scorer.score(ground_truth, response)[self._rouge_type
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type]
         return {
             "rouge_precision": metrics.precision,
             "rouge_recall": metrics.recall,

@@ -48,33 +48,33 @@ class _AsyncRougeScoreEvaluator:
 
 class RougeScoreEvaluator:
     """
-
+    Evaluator for computes the ROUGE scores between two strings.
 
-
-
-    ROUGE-
-
-
-    (L-graph overlap)
+    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic
+    summarization and machine translation. It measures the overlap between generated text and reference summaries.
+    ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text
+    summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text
+    coherence and relevance are critical.
 
-
-    other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
-    information from the reference text.
+    **Usage**
 
-
+    .. code-block:: python
 
-
+        eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-        :start-after: [START rouge_score_evaluator]
-        :end-before: [END rouge_score_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
-    """
+    **Output format**
 
-
-
+    .. code-block:: python
+
+        {
+            "rouge_precision": 1.0,
+            "rouge_recall": 1.0,
+            "rouge_f1_score": 1.0
+        }
+    """
 
     def __init__(self, rouge_type: RougeType):
         self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)

@@ -88,7 +88,7 @@ class RougeScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The ROUGE score.
-        :rtype:
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
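Two things stand out in the `_rouge.py` hunks: `RougeType` becomes a `str`-backed enum in 1.0.0b1, and the scorer is indexed with the enum member directly. The evaluator needs no model endpoint, since scores are computed locally by the `rouge_score` package. A minimal usage sketch matching the docstring added above:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # ROUGE-1 compares unigram overlap between the response and the ground truth.
    eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
    result = eval_fn(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # The output keys shown in the hunk above.
    print(result["rouge_precision"], result["rouge_recall"], result["rouge_f1_score"])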
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -2,53 +2,54 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-import math
 import os
 import re
+from typing import Union
+
+import numpy as np
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from promptflow.core import AsyncPrompty
 
-from
-
-
+from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ..._common.utils import (
+    check_and_add_api_version_for_aoai_model_config,
+    check_and_add_user_agent_for_aoai_model_config,
+)
 
 try:
     from ..._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT =
+    USER_AGENT = None
 
 
 class _AsyncSimilarityEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
-
-
-
+    PROMPTY_FILE = "similarity.prompty"
+    LLM_CALL_TIMEOUT = 600
+    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
     def __init__(self, model_config: dict):
-
-
-
+        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+        # https://github.com/encode/httpx/discussions/2959
+        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+        check_and_add_user_agent_for_aoai_model_config(
+            model_config,
+            prompty_model_config,
             USER_AGENT,
         )
 
         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate similarity.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The similarity score.
-        :rtype: Dict[str, float]
-        """
         # Validate input parameters
         query = str(query or "")
         response = str(response or "")

@@ -66,57 +67,46 @@ class _AsyncSimilarityEvaluator:
 
         # Run the evaluation flow
         llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self.
+            query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
         )
 
-        score =
+        score = np.nan
         if llm_output:
             match = re.search(r"\d", llm_output)
             if match:
                 score = float(match.group())
 
-        return {"
+        return {"gpt_similarity": float(score)}
 
 
 class SimilarityEvaluator:
     """
-
-
-    The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
-    AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
-    the ground truth and the model's prediction, which are high-dimensional vector representations capturing
-    the semantic meaning and context of the sentences.
-
-    Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
-    tasks where you have access to ground truth responses. Similarity enables you to assess the generated
-    text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.
-
-    Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.
+    Initialize a similarity evaluator configured for a specific Azure OpenAI model.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-
+    **Usage**
 
-
-        :start-after: [START rouge_score_evaluator]
-        :end-before: [END rouge_score_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+    .. code-block:: python
 
-
+        eval_fn = SimilarityEvaluator(model_config)
+        result = eval_fn(
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.",
+            ground_truth="Tokyo is Japan's capital.")
 
-
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-    """
+    **Output format**
 
-
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    .. code-block:: python
 
-
+        {
+            "gpt_similarity": 3.0
+        }
+    """
+
+    def __init__(self, model_config: dict):
         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
 
     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):

@@ -130,7 +120,7 @@ class SimilarityEvaluator:
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
         :return: The similarity score.
-        :rtype:
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
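`SimilarityEvaluator` follows the same pattern as the relevance evaluator: 1.0.0b1 returns the score under `gpt_similarity`, while the removed 1.0.0 docstring notes that the unprefixed key is preferred and the `gpt_`-prefixed one is retained only for backwards compatibility. A minimal sketch that reads the result defensively across both versions, again with a placeholder configuration:

    from azure.ai.evaluation import SimilarityEvaluator

    # Placeholder Azure OpenAI configuration; substitute real values.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<your-api-key>",
        "azure_deployment": "<your-deployment>",
    }

    evaluator = SimilarityEvaluator(model_config)
    result = evaluator(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )

    # Prefer the unprefixed key (1.0.0) and fall back to the old one (1.0.0b1).
    score = result.get("similarity", result.get("gpt_similarity"))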
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty

@@ -3,6 +3,11 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
+  configuration:
+    type: azure_openai
+    azure_deployment: ${env:AZURE_DEPLOYMENT}
+    api_key: ${env:AZURE_OPENAI_API_KEY}
+    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1