azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +5 -31
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +120 -300
- azure/ai/evaluation/_common/utils.py +23 -381
- azure/ai/evaluation/_constants.py +6 -19
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
- azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
- azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
- azure/ai/evaluation/_evaluate/_utils.py +28 -82
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +132 -203
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -2
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
- azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
- azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_simulator.py +207 -277
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +13 -31
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty
@@ -0,0 +1,54 @@
+---
+name: Groundedness
+description: Evaluates groundedness score for QA scenario
+model:
+  api: chat
+  configuration:
+    type: azure_openai
+    azure_deployment: ${env:AZURE_DEPLOYMENT}
+    api_key: ${env:AZURE_OPENAI_API_KEY}
+    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+  parameters:
+    temperature: 0.0
+    max_tokens: 1
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  response:
+    type: string
+  context:
+    type: string
+
+---
+system:
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+user:
+You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
+1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
+Independent Examples:
+## Example Task #1 Input:
+{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+## Example Task #1 Output:
+1
+## Example Task #2 Input:
+{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
+## Example Task #2 Output:
+5
+## Example Task #3 Input:
+{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
+## Example Task #3 Output:
+5
+## Example Task #4 Input:
+{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+## Example Task #4 Output:
+1
+## Actual Task Input:
+{"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}}
+Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
+Actual Task Output:
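For orientation, the prompty above is the template behind the beta GroundednessEvaluator, and it declares `response` and `context` as its only inputs. Below is a minimal usage sketch, not taken from the package; it assumes the AzureOpenAIModelConfiguration keys and the `gpt_groundedness` output key shown in the QA diff further down, and the environment variable names are placeholders mirroring the `${env:...}` entries in the prompty.

```python
import os

from azure.ai.evaluation import GroundednessEvaluator

# Mirrors the ${env:...} placeholders declared in the prompty; variable names are illustrative.
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": os.environ["AZURE_DEPLOYMENT"],
}

groundedness_eval = GroundednessEvaluator(model_config)

# The prompty declares exactly two inputs: `response` and `context`.
result = groundedness_eval(
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
)
print(result)  # expected shape: {"gpt_groundedness": <score between 1 and 5>}
```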
azure/ai/evaluation/_evaluators/_meteor/_meteor.py
@@ -1,10 +1,11 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import nltk
 from nltk.translate.meteor_score import meteor_score
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 
-from azure.ai.evaluation._common.utils import nltk_tokenize
+from azure.ai.evaluation._common.utils import nltk_tokenize
 
 
 class _AsyncMeteorScoreEvaluator:
@@ -13,7 +14,10 @@ class _AsyncMeteorScoreEvaluator:
         self._beta = beta
         self._gamma = gamma
 
-
+        try:
+            nltk.find("corpora/wordnet.zip")
+        except LookupError:
+            nltk.download("wordnet")
 
     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
         reference_tokens = nltk_tokenize(ground_truth)
@@ -34,7 +38,7 @@ class _AsyncMeteorScoreEvaluator:
 
 class MeteorScoreEvaluator:
     """
-
+    Evaluator that computes the METEOR Score between two strings.
 
     The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
     comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -42,12 +46,6 @@ class MeteorScoreEvaluator:
     word stems to more accurately capture meaning and language variations. In addition to machine translation and
     text summarization, paraphrase detection is an optimal use case for the METEOR score.
 
-    Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
-    n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
-    tasks like machine translation, text summarization, and text generation.
-
-    The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
-
     :param alpha: The METEOR score alpha parameter. Default is 0.9.
     :type alpha: float
    :param beta: The METEOR score beta parameter. Default is 3.0.
@@ -55,18 +53,27 @@ class MeteorScoreEvaluator:
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
 
-
+    **Usage**
 
-
-
-
-
-
-
-
+    .. code-block:: python
+
+        eval_fn = MeteorScoreEvaluator(
+            alpha=0.9,
+            beta=3.0,
+            gamma=0.5
+        )
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "meteor_score": 0.62
+        }
+    """
 
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
         self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
@@ -80,7 +87,7 @@ class MeteorScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The METEOR score.
-        :rtype:
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
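The added try/except block in the hunk above only ensures the WordNet corpus is present before `meteor_score` runs. Below is a standalone sketch of the computation the async evaluator wraps, using plain `str.split` in place of the package's `nltk_tokenize` helper (an assumption for illustration); note that the corpus lookup lives under `nltk.data.find`.

```python
import nltk
from nltk.translate.meteor_score import meteor_score

# Ensure the WordNet corpus used for METEOR's synonym matching is available.
try:
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download("wordnet")

ground_truth = "The capital of Japan is Tokyo."
response = "Tokyo is the capital of Japan."

# Stand-in tokenization; the package uses its own nltk_tokenize helper.
reference_tokens = ground_truth.split()
hypothesis_tokens = response.split()

score = meteor_score(
    [reference_tokens],   # list of tokenized references
    hypothesis_tokens,    # tokenized candidate response
    alpha=0.9,
    beta=3.0,
    gamma=0.5,
)
print({"meteor_score": score})
```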
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
@@ -1,113 +1,104 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from promptflow._utils.async_utils import async_run_allowing_running_loop
 
-from
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-from typing_extensions import overload, override
 
-
-
-
-
+class _AsyncProtectedMaterialEvaluator:
+    def __init__(self, azure_ai_project: dict, credential=None):
+        self._azure_ai_project = azure_ai_project
+        self._credential = credential
 
+    async def __call__(self, *, query: str, response: str, **kwargs):
+        """
+        Evaluates content according to this evaluator's metric.
 
-
-
-
-
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Any
+        """
+        # Validate inputs
+        # Raises value error if failed, so execution alone signifies success.
+        if not (query and query.strip() and query != "None") or not (
+            response and response.strip() and response != "None"
+        ):
+            msg = "Both 'query' and 'response' must be non-empty strings."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                error_category=ErrorCategory.MISSING_FIELD,
+                error_blame=ErrorBlame.USER_ERROR,
+                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+            )
+
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service(
+            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+            query=query,
+            response=response,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
 
-    Protected material is any text that is under copyright, including song lyrics, recipes, and articles. Protected
-    material evaluation leverages the Azure AI Content Safety Protected Material for Text service to perform the
-    classification.
 
-
+class ProtectedMaterialEvaluator:
+    """
+    Initialize a protected material evaluator to detect whether protected material
+    is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
-    :param
-
-    :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-        resource group, and project name.
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param credential: The credential for connecting to Azure AI project.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+    :rtype: Dict[str, str]
 
-
+    **Usage**
 
-
-        :start-after: [START protected_material_evaluator]
-        :end-before: [END protected_material_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a ProtectedMaterialEvaluator.
-    """
+    .. code-block:: python
 
-
-
-
-
-
-
-
-        azure_ai_project,
-    ):
-        super().__init__(
-            eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
-            azure_ai_project=azure_ai_project,
-            credential=credential,
-        )
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        eval_fn = ProtectedMaterialEvaluator(azure_ai_project)
+        result = eval_fn(query="What is the capital of France?", response="Paris.")
 
-
-    def __call__(
-        self,
-        *,
-        query: str,
-        response: str,
-    ) -> Dict[str, Union[str, bool]]:
-        """Evaluate a given query/response pair for protected material
+    **Output format**
 
-
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The protected material score.
-        :rtype: Dict[str, Union[str, bool]]
-        """
+    .. code-block:: python
 
-
-
-
-
-
-
-
-
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The protected material score.
-        :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]
-        """
+        {
+            "protected_material_label": "False",
+            "protected_material_reason": "This query does not contain any protected material."
+        }
+    """
+
+    def __init__(self, azure_ai_project: dict, credential=None):
+        self._async_evaluator = _AsyncProtectedMaterialEvaluator(azure_ai_project, credential)
 
-
-    def __call__(
-        self,
-        *,
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        conversation=None,
-        **kwargs,
-    ):
+    def __call__(self, *, query: str, response: str, **kwargs):
         """
-
+        Evaluates protected material content.
 
         :keyword query: The query to be evaluated.
-        :paramtype query:
+        :paramtype query: str
         :keyword response: The response to be evaluated.
-        :paramtype response:
-        :
-
-            to be dictionaries with keys "content" and "role".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The fluency score.
-        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        :paramtype response: str
+        :return: A dictionary containing a boolean label and reasoning.
+        :rtype: dict
         """
-        return
+        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+    def _to_async(self):
+        return self._async_evaluator
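The rewritten evaluator above forwards an optional credential to `evaluate_with_rai_service`. Below is a hedged sketch of supplying one explicitly; it assumes `azure-identity` is installed and that `ProtectedMaterialEvaluator` is importable from the package root as in the stable release, and the project values are placeholders.

```python
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import ProtectedMaterialEvaluator

azure_ai_project = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

# Explicit credential; the constructor in the diff defaults credential to None.
credential = DefaultAzureCredential()
protected_material_eval = ProtectedMaterialEvaluator(azure_ai_project, credential=credential)

result = protected_material_eval(
    query="Give me the opening verse of a famous pop song.",
    response="I can't reproduce copyrighted lyrics, but I can summarize the song's themes.",
)
# Expected keys per the docstring above: protected_material_label, protected_material_reason.
print(result)
```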
azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
@@ -0,0 +1,104 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+
+class _AsyncProtectedMaterialsEvaluator:
+    def __init__(self, azure_ai_project: dict, credential=None):
+        self._azure_ai_project = azure_ai_project
+        self._credential = credential
+
+    async def __call__(self, *, query: str, response: str, **kwargs):
+        """
+        Evaluates content according to this evaluator's metric.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Any
+        """
+        # Validate inputs
+        # Raises value error if failed, so execution alone signifies success.
+        if not (query and query.strip() and query != "None") or not (
+            response and response.strip() and response != "None"
+        ):
+            msg = "Both 'query' and 'response' must be non-empty strings."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                error_category=ErrorCategory.MISSING_FIELD,
+                error_blame=ErrorBlame.USER_ERROR,
+                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+            )
+
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service(
+            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+            query=query,
+            response=response,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+
+class ProtectedMaterialsEvaluator:
+    """
+    Initialize a protected materials evaluator to detect whether protected material
+    is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains subscription id, resource group, and project name.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param credential: The credential for connecting to Azure AI project.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+    :rtype: Dict[str, str]
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        eval_fn = ProtectedMaterialsEvaluator(azure_ai_project)
+        result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "label": "False",
+            "reasoning": "This query does not contain any protected material."
+        }
+    """
+
+    def __init__(self, azure_ai_project: dict, credential=None):
+        self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential)
+
+    def __call__(self, *, query: str, response: str, **kwargs):
+        """
+        Evaluates protected materials content.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: A dictionary containing a boolean label and reasoning.
+        :rtype: dict
+        """
+        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+    def _to_async(self):
+        return self._async_evaluator
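Both protected-material modules follow the same structure: a private async class performs the service call, and the public class exposes a synchronous `__call__` that drives it via `async_run_allowing_running_loop`. Below is a minimal, dependency-free sketch of that wrapper pattern; it uses `asyncio.run`, which only covers the no-running-loop case that promptflow's helper generalizes, and all class names are illustrative.

```python
import asyncio


class _AsyncScorer:
    """Async implementation; stands in for a class that awaits a remote evaluation service."""

    async def __call__(self, *, query: str, response: str) -> dict:
        await asyncio.sleep(0)  # placeholder for an awaited service call
        return {"label": "False", "reasoning": "stub result"}


class Scorer:
    """Synchronous facade over the async evaluator, mirroring the pattern in the diff."""

    def __init__(self) -> None:
        self._async_evaluator = _AsyncScorer()

    def __call__(self, *, query: str, response: str) -> dict:
        # promptflow's async_run_allowing_running_loop also handles an already-running loop;
        # asyncio.run is enough for a plain synchronous caller.
        return asyncio.run(self._async_evaluator(query=query, response=response))

    def _to_async(self) -> "_AsyncScorer":
        return self._async_evaluator


if __name__ == "__main__":
    print(Scorer()(query="example query", response="example response"))
```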
azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -3,7 +3,6 @@
 # ---------------------------------------------------------
 
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
@@ -22,33 +21,39 @@ class QAEvaluator:
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :return: A
-    :
-    :type kwargs: Any
+    :return: A function that evaluates and generates metrics for "question-answering" scenario.
+    :rtype: Callable
 
-
+    **Usage**
 
-
-        :start-after: [START qa_evaluator]
-        :end-before: [END qa_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a QAEvaluator.
+    .. code-block:: python
 
-
+        eval_fn = QAEvaluator(model_config)
+        result = qa_eval(
+            query="Tokyo is the capital of which country?",
+            response="Japan",
+            context="Tokyo is the capital of Japan.",
+            ground_truth="Japan"
+        )
 
-
-
-
-    """
+    **Output format**
+
+    .. code-block:: python
 
-
-
+        {
+            "gpt_groundedness": 3.5,
+            "gpt_relevance": 4.0,
+            "gpt_coherence": 1.5,
+            "gpt_fluency": 4.0,
+            "gpt_similarity": 3.0,
+            "f1_score": 0.42
+        }
+    """
 
-    def __init__(self, model_config,
-        self._parallel =
+    def __init__(self, model_config: dict, parallel: bool = True):
+        self._parallel = parallel
 
-        self._evaluators
+        self._evaluators = [
             GroundednessEvaluator(model_config),
             RelevanceEvaluator(model_config),
             CoherenceEvaluator(model_config),
@@ -69,10 +74,12 @@ class QAEvaluator:
         :paramtype context: str
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
+        :keyword parallel: Whether to evaluate in parallel. Defaults to True.
+        :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype:
+        :rtype: dict
         """
-        results
+        results = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 futures = {
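The final hunk is cut off where QAEvaluator fans its sub-evaluators out over a thread pool. Below is a sketch of that fan-out/collect pattern with `concurrent.futures`, using stand-in callables rather than the real sub-evaluators (the actual class composes groundedness, relevance, coherence, fluency, similarity, and F1 evaluators, per the output format shown above).

```python
from concurrent.futures import ThreadPoolExecutor, as_completed


def run_evaluators(evaluators, **kwargs) -> dict:
    """Run each evaluator with the same keyword arguments and merge their result dicts."""
    results: dict = {}
    with ThreadPoolExecutor() as executor:
        # Map each future back to the evaluator that produced it, as the truncated hunk begins to do.
        futures = {executor.submit(evaluator, **kwargs): evaluator for evaluator in evaluators}
        for future in as_completed(futures):
            results.update(future.result())
    return results


# Stand-in evaluators for illustration only.
stub_evaluators = [
    lambda **kw: {"gpt_relevance": 4.0},
    lambda **kw: {"f1_score": 0.42},
]

print(run_evaluators(stub_evaluators, query="q", response="r", context="c", ground_truth="g"))
```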