azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
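For orientation before the per-file hunks below, here is a rough usage sketch of one of the evaluators this release adds. It is a hedged illustration only: the endpoint, key, and deployment values are placeholders, and the exact constructor and keyword names should be confirmed against the shipped `azure/ai/evaluation/__init__.py` and `_model_configurations.py`.

```python
# Illustrative sketch only -- assumes azure-ai-evaluation 1.0.0 is installed and
# that CoherenceEvaluator / AzureOpenAIModelConfiguration are exported as expected.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

coherence = CoherenceEvaluator(model_config=model_config)
result = coherence(
    query="What is the water cycle?",
    response="The water cycle is the continuous movement of water on Earth.",
)
# Based on the prompty-evaluator base class shown later in this diff, the result
# dictionary should contain keys like "coherence", "gpt_coherence", and "coherence_reason".
print(result)
```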

azure/ai/evaluation/_evaluators/_coherence/coherence.prompty
@@ -0,0 +1,99 @@
+---
+name: Coherence
+description: Evaluates coherence score for QA scenario
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+
+---
+system:
+# Instruction
+## Goal
+### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+- **Data**: Your input data include a QUERY and a RESPONSE.
+- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+user:
+# Definition
+**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas.
+
+# Ratings
+## [Coherence: 1] (Incoherent Response)
+**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible.
+
+**Examples:**
+**Query:** What are the benefits of renewable energy?
+**Response:** Wind sun green jump apple silence over.
+
+**Query:** Explain the process of photosynthesis.
+**Response:** Plants light water flying blue music.
+
+## [Coherence: 2] (Poorly Coherent Response)
+**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand.
+
+**Examples:**
+**Query:** How does vaccination work?
+**Response:** Vaccines protect disease. Immune system fight. Health better.
+
+**Query:** Describe how a bill becomes a law.
+**Response:** Idea proposed. Congress discuss vote. President signs.
+
+## [Coherence: 3] (Partially Coherent Response)
+**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order.
+
+**Examples:**
+**Query:** What causes earthquakes?
+**Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage.
+
+**Query:** Explain the importance of the water cycle.
+**Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water.
+
+## [Coherence: 4] (Coherent Response)
+**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow.
+
+**Examples:**
+**Query:** What is the water cycle and how does it work?
+**Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally.
+
+**Query:** Describe the role of mitochondria in cellular function.
+**Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival.
+
+## [Coherence: 5] (Highly Coherent Response)
+**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision.
+
+**Examples:**
+**Query:** Analyze the economic impacts of climate change on coastal cities.
+**Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth.
+
+**Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy.
+**Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs.
+
+
+# Data
+QUERY: {{query}}
+RESPONSE: {{response}}
+
+
+# Tasks
+## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input Data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
+# Output
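The prompty above instructs the model to wrap its chain of thought, explanation, and score in `<S0>`, `<S1>`, and `<S2>` tags. The SDK has its own parser for this (`parse_quality_evaluator_reason_score`, imported in `_base_prompty_eval.py` further down but not shown in this diff); the snippet below is only an illustrative sketch of how such tagged output could be pulled apart.

```python
import re

def parse_tagged_output(llm_output: str) -> dict:
    """Illustrative parser for <S0>/<S1>/<S2> tagged output.

    Not the SDK's implementation; it only mirrors the tag convention the
    coherence prompty asks the model to follow.
    """
    fields = {}
    for tag, name in (("S0", "thought_chain"), ("S1", "explanation"), ("S2", "score")):
        match = re.search(rf"<{tag}>(.*?)</{tag}>", llm_output, re.DOTALL)
        fields[name] = match.group(1).strip() if match else None
    return fields

sample = "<S0>Let's think step by step: ...</S0><S1>Clear and logical.</S1><S2>4</S2>"
print(parse_tagged_output(sample))
# {'thought_chain': "Let's think step by step: ...", 'explanation': 'Clear and logical.', 'score': '4'}
```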

azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]

azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -0,0 +1,344 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias, get_overloads
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import remove_optional_singletons
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+.. code-block:: python
+
+    foo: AggregateResult[float] = {
+        "evaluation_per_turn": {
+            "coherence": [1.0, 2.0, 3.0]
+        },
+        "coherence": 2.0
+    }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+.. code-block:: python
+
+    foo: DoEvalResult[float] = {
+        "coherence": 2.0
+    }
+"""
+
+
+# TODO exception target pass down?
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
+    """Base class for all evaluators that are capable of accepting either a group of single values,
+    or conversation as input. All such evaluators need to implement two functions of their own:
+    - _convert_conversation_to_eval_input
+    - _do_eval
+
+    Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
+    documentation, although ideally the actual child implementation of __call__ should just amount to
+    'super().__init__()'.
+
+
+    :param not_singleton_inputs: A list of strings that represent the names of
+        inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
+        is ["conversation", "kwargs"].
+    :type not_singleton_inputs: List[str]
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
+    :type eval_last_turn: bool
+    """
+
+    # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    # Make sure to call super().__init__() in the child class's __init__ method.
+    # pylint: disable=dangerous-default-value
+    def __init__(
+        self,
+        *,
+        not_singleton_inputs: List[str] = ["conversation", "kwargs"],
+        eval_last_turn: bool = False,
+    ):
+        self._not_singleton_inputs = not_singleton_inputs
+        self._eval_last_turn = eval_last_turn
+        self._singleton_inputs = self._derive_singleton_inputs()
+        self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+
+    # This needs to be overridden just to change the function header into something more informative,
+    # and to be able to add a more specific docstring. The actual function contents should just be
+    # super().__call__(<inputs>)
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
+        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
+        The actual behavior of this function shouldn't change beyond adding more inputs to the
+        async_run_allowing_running_loop call.
+
+        :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
+        :type kwargs: Dict
+        :return: The evaluation result
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
+
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
+        """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
+        In the default case, all required inputs are assumed to be within eval_input, as user-friendly
+        typing is handled above this function in favor of polymorphic simplicity. This function must be
+        asynchronous.
+
+        :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
+        :type eval_input: Any
+        :return: A single evaluation result
+        :rtype: DoEvalResult[T_EvalValue]
+        """
+
+    # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    def _derive_singleton_inputs(self) -> List[str]:
+        """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
+        when the evaluator is being used in a non-conversation context.
+        By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
+        Thankfully this works the way you'd hope, with the call_signature being based on the child
+        function's signature, not the parent's.
+
+        :return: A list of strings representing the names of singleton inputs.
+        :rtype: List[str]
+        """
+
+        overloads = get_overloads(self.__call__)
+        if not overloads:
+            call_signatures = [inspect.signature(self.__call__)]
+        else:
+            call_signatures = [inspect.signature(overload) for overload in overloads]
+        call_signature = inspect.signature(self.__call__)
+        singletons = []
+        for call_signature in call_signatures:
+            params = call_signature.parameters
+            if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                continue
+            # exclude self since it is not a singleton input
+            singletons.extend([p for p in params if p != "self"])
+        return singletons
+
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+        """Produce the function that will be used to convert conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+        include_context = "context" in self._singleton_inputs
+        include_query = "query" in self._singleton_inputs
+        include_response = "response" in self._singleton_inputs
+
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            global_context = conversation.get("context", None)
+            # Extract queries, responses from conversation
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    queries.append(each_turn)
+                elif role == "assistant":
+                    responses.append(each_turn)
+            # TODO complain if len(queries) != len(responses)?
+            eval_inputs = []
+            for query, response in zip(queries, responses):
+                context = {}
+                if include_context:
+                    query_context = query.get("context", None)
+                    response_context = response.get("context", None)
+                    if global_context:
+                        context["global_context"] = global_context
+                    if query_context and include_query:
+                        context["query_context"] = query_context
+                    if response_context and include_response:
+                        context["response_context"] = response_context
+
+                eval_input: DerivedEvalInput = {}
+                if include_query:
+                    eval_input["query"] = query.get("content", "")
+                if include_response:
+                    eval_input["response"] = response.get("content", "")
+                if include_context:
+                    eval_input["context"] = str(context)
+                eval_inputs.append(eval_input)
+            return eval_inputs
+
+        return converter
+
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+        """Convert an arbitrary input into a list of inputs for evaluators.
+        It is assumed that evaluators generally make use of their inputs in one of two ways.
+        Either they receive a collection of keyname inputs that are all single values
+        (like a query and response), or they receive conversation that iss a list of dictionary
+        values.
+
+        The self._singleton_inputs list assigned during initialization is used to find and extract
+        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        is a valid input.
+
+        If both conversations and singletons are allowed, the function will raise an exception if both
+        are inputted.
+
+        This function must be overridden by child classes IF they need to both a conversation and
+        other inputs to be passed in.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+        :rtype: List
+        """
+
+        # Collect inputs
+        conversation = kwargs.get("conversation", None)
+        singletons = {}
+        if len(self._singleton_inputs) > 0:
+            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+        # Check that both conversation and other inputs aren't set
+        if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Handle Conversation
+        if conversation is not None:
+            return self._derive_conversation_converter()(conversation)
+        # Handle Singletons
+        required_singletons = remove_optional_singletons(self, singletons)
+        if all(value is not None for value in required_singletons.values()):
+            return [singletons]
+        # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
+        """Aggregate the evaluation results of each conversation turn into a single result.
+
+        Exact implementation might need to vary slightly depending on the results produced.
+        Default behavior is to average the all number-based outputs.
+
+        :param per_turn_results: List of evaluation results for each turn in the conversation.
+        :type per_turn_results: List[Dict]
+        :return: A dictionary containing aggregated results, with numeric metrics having their
+            means as top-level values in the dictionary, and all original
+            values (including non-numerics) located in under the "evaluation_per_turn" key,
+            which each sub-key being a metric and each sub-value being a the list of that metric's
+            per-turn values.
+        :rtype: AggregateResult[T_EvalValue]
+        """
+
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+        # Go over each turn, and rotate the results into a
+        # metric: List[values] format for the evals_per_turn dictionary.
+        for turn in per_turn_results:
+            for metric, value in turn.items():
+                if metric not in evaluation_per_turn:
+                    evaluation_per_turn[metric] = []
+                evaluation_per_turn[metric].append(value)
+
+        # Find and average all numeric values
+        for metric, values in evaluation_per_turn.items():
+            if all(isinstance(value, (int, float)) for value in values):
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+        # Slap the per-turn results back in.
+        aggregated["evaluation_per_turn"] = evaluation_per_turn
+        return aggregated
+
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        per_turn_results = []
+        # Evaluate all inputs.
+        for eval_input in eval_input_list:
+            per_turn_results.append(await self._do_eval(eval_input))
+        # Return results as-is if only one result was produced.
+
+        if len(per_turn_results) == 1:
+            return per_turn_results[0]
+        if len(per_turn_results) == 0:
+            return {}  # TODO raise something?
+        # Otherwise, aggregate results.
+        return self._aggregate_results(per_turn_results=per_turn_results)
+
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
+        return self._async_evaluator
+
+
+class AsyncEvaluatorBase:
+    """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
+    to ensure that no one ever needs to extend or otherwise modify this class directly.
+    """
+
+    def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
+        self._real_call = real_call
+
+    # Don't look at my shame. Nothing to see here....
+    # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
+    # are just not passed into this function instead of ending up in kwargs.
+    # Since we want this to be relatively call-agnostic, we just account for every input that any children
+    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
+    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+        if conversation is not None:
+            kwargs["conversation"] = conversation
+        if query is not None:
+            kwargs["query"] = query
+        if response is not None:
+            kwargs["response"] = response
+        if context is not None:
+            kwargs["context"] = context
+        return await self._real_call(**kwargs)
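To make the base-class contract above concrete, here is a minimal, hypothetical subclass: it overrides the async `_do_eval` to score a single query/response pair and reshapes `__call__` only for documentation, deferring to `super().__call__()` as the docstrings suggest. The `word_overlap` metric and class name are invented for illustration and do not ship in the wheel; importing `EvaluatorBase` from the private `_evaluators._common` module is likewise only for demonstration.

```python
from typing import Dict

from typing_extensions import override

from azure.ai.evaluation._evaluators._common import EvaluatorBase


class WordOverlapEvaluator(EvaluatorBase[float]):
    """Hypothetical evaluator: fraction of query words that reappear in the response."""

    def __init__(self):
        super().__init__()  # keep the base class's default singleton/conversation handling

    @override
    def __call__(self, *, query: str, response: str):
        # Only reshapes the header; the base class routes through its async machinery.
        # Because this signature names only "query" and "response", those become the
        # evaluator's singleton inputs (see _derive_singleton_inputs above).
        return super().__call__(query=query, response=response)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        query_words = set(eval_input["query"].lower().split())
        response_words = set(eval_input["response"].lower().split())
        overlap = len(query_words & response_words) / max(len(query_words), 1)
        return {"word_overlap": overlap}
```

The shipped evaluators appear to go one step further and declare typed `@overload` signatures on `__call__` (note the `get_overloads` logic above), which lets them advertise both the query/response form and the conversation form while keeping singleton-input derivation accurate.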

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -0,0 +1,88 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+import re
+from typing import Dict, TypeVar, Union
+
+from promptflow.core import AsyncPrompty
+from typing_extensions import override
+
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from . import EvaluatorBase
+
+try:
+    from ..._user_agent import USER_AGENT
+except ImportError:
+    USER_AGENT = "None"
+
+T = TypeVar("T")
+
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
+    """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
+    make use of a prompty file, and return their results as a dictionary, with a single key-value pair
+    linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
+    per-turn results are stored in a list under the key "evaluation_per_turn").
+
+    :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
+        a dictionary in the format {result_key: float}.
+    :type result_key: str
+    :param prompty_file: The path to the prompty file to use for evaluation.
+    :type prompty_file: str
+    :param model_config: The model configuration to use for evaluation.
+    :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+    :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
+        Useful since some evaluators of this format are response-only.
+    :type ignore_queries: bool
+    """
+
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
+        self._result_key = result_key
+        self._prompty_file = prompty_file
+        super().__init__(eval_last_turn=eval_last_turn)
+
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            USER_AGENT,
+        )
+
+        self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+
+    # __call__ not overridden here because child classes have such varied signatures that there's no point
+    # defining a default here.
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+        whatever inputs are needed for the _flow method, including context
+        and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            # Parse out score and reason from evaluators known to possess them.
+            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                score, reason = parse_quality_evaluator_reason_score(llm_output)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_reason": reason,
+                }
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+            return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
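A child of `PromptyEvaluatorBase` is mostly wiring: point the base class at a prompty file and a result key, then reshape `__call__`. The sketch below is hypothetical (the class name, prompty file name, and result key are not from this release) but follows the constructor shown above.

```python
import os

from typing_extensions import override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class MyPromptMetricEvaluator(PromptyEvaluatorBase[float]):
    """Hypothetical prompty-backed evaluator (not a class shipped in this wheel)."""

    # Assumed to live next to this module; "my_metric.prompty" is a placeholder name.
    _PROMPTY_FILE = "my_metric.prompty"
    _RESULT_KEY = "my_metric"

    @override
    def __init__(self, model_config):
        prompty_path = os.path.join(os.path.dirname(__file__), self._PROMPTY_FILE)
        super().__init__(
            model_config=model_config,
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
        )

    @override
    def __call__(self, *, query: str, response: str):
        """Evaluate a single query/response pair with the loaded prompty flow."""
        return super().__call__(query=query, response=response)
```

Because `_do_eval` is already implemented by the base class, the child only supplies configuration and a documented call signature; scores come back under `result_key`, plus the `gpt_<result_key>` alias visible in `_do_eval` above.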

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -0,0 +1,133 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Dict, TypeVar, Union
+
+from typing_extensions import override
+
+from azure.ai.evaluation._common.constants import (
+    EvaluationMetrics,
+    _InternalEvaluationMetrics,
+    Tasks,
+    _InternalAnnotationTasks,
+)
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
+from . import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class RaiServiceEvaluatorBase(EvaluatorBase[T]):
+    """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
+    This includes content safety evaluators, protected material evaluators, and others. These evaluators
+    are all assumed to be of the "query and response or conversation" input variety.
+
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+        aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
+        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
+        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
+    :type eval_last_turn: bool
+    """
+
+    @override
+    def __init__(
+        self,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
+        azure_ai_project: dict,
+        credential: TokenCredential,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._eval_metric = eval_metric
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
+        or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if query is None or response is None:
+            raise EvaluationException(
+                message="Not implemented",
+                internal_message=(
+                    "Reached query/response evaluation without supplying query or response."
+                    + " This should have failed earlier."
+                ),
+            )
+        input_data = {"query": query, "response": response}
+
+        if "context" in self._singleton_inputs:
+            context = eval_input.get("context", None)
+            if context is None:
+                raise EvaluationException(
+                    message="Not implemented",
+                    internal_message=(
+                        "Attempted context-based evaluation without supplying context."
+                        + " This should have failed earlier."
+                    ),
+                )
+            input_data["context"] = context
+
+        return await evaluate_with_rai_service(  # type: ignore
+            metric_name=self._eval_metric,
+            data=input_data,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+            annotation_task=self._get_task(),
+        )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine a the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        return Tasks.CONTENT_HARM
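Children of `RaiServiceEvaluatorBase` are similarly thin: they choose an evaluation metric and forward the Azure AI project details and credential, which is presumably what the content-safety evaluators listed at the top of this diff do. The class below is a hypothetical sketch, and `EvaluationMetrics.VIOLENCE` is an assumed member of the constants module rather than something visible in these hunks.

```python
from typing_extensions import override

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


class MyHarmEvaluator(RaiServiceEvaluatorBase):
    """Hypothetical RAI-service-backed evaluator (not a class shipped in this wheel)."""

    @override
    def __init__(self, credential, azure_ai_project):
        super().__init__(
            eval_metric=EvaluationMetrics.VIOLENCE,  # assumed enum member; see _common/constants.py
            azure_ai_project=azure_ai_project,
            credential=credential,
        )

    @override
    def __call__(self, *, query: str, response: str):
        """Send a single query/response pair to the RAI service for annotation."""
        return super().__call__(query=query, response=response)
```

Per `_get_task` above, any metric not special-cased (groundedness, XPIA, ECI, protected material) falls back to the `Tasks.CONTENT_HARM` annotation task, so a content-harm style metric needs no extra routing.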