azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -1,15 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import logging
+import math
 import os
 from typing import Dict, Union, List

 from typing_extensions import overload, override

+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import reformat_conversation_history, reformat_agent_response
+
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

+logger = logging.getLogger(__name__)
+

 class RelevanceEvaluator(PromptyEvaluatorBase):
     """
@@ -40,13 +46,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
            :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.

    .. admonition:: Example using Azure AI Project URL:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START relevance_evaluator]
            :end-before: [END relevance_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
@@ -69,26 +75,21 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
    _PROMPTY_FILE = "relevance.prompty"
    _RESULT_KEY = "relevance"

-    id = "
+    id = "azureai://built-in/evaluators/relevance"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(
-        self,
-        model_config,
-        *,
-        threshold=3
-    ):
+    def __init__(self, model_config, *, threshold=3):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
        self._higher_is_better = True
        super().__init__(
-            model_config=model_config,
-            prompty_file=prompty_path,
-            result_key=self._RESULT_KEY,
-            threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better,
        )

    @overload
@@ -146,3 +147,49 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
        """
        return super().__call__(*args, **kwargs)
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+        whatever inputs are needed for the _flow method, including context
+        and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        if not isinstance(eval_input["query"], str):
+            eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        if not isinstance(eval_input["response"], str):
+            eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        score = math.nan
+
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
+            reason = llm_output.get("explanation", "")
+            # Parse out score and reason from evaluators known to possess them.
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+            }
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
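
For orientation, a minimal usage sketch of the reworked relevance evaluator. The model configuration values are placeholders, and the expected output keys are taken from the `_do_eval` return dictionary above; treat this as an illustrative sketch rather than canonical sample code.

    from azure.ai.evaluation import RelevanceEvaluator

    # Placeholder Azure OpenAI model configuration; substitute real values.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    relevance = RelevanceEvaluator(model_config, threshold=3)
    result = relevance(
        query="What amenities does the new apartment complex provide?",
        response="The apartment complex provides a gym, swimming pool, and 24/7 security.",
    )

    # Per the _do_eval return dictionary above, the result should include:
    # "relevance", "gpt_relevance", "relevance_reason", "relevance_result", "relevance_threshold"
    print(result)
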
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -10,91 +10,172 @@ model:
     presence_penalty: 0
     frequency_penalty: 0
     response_format:
-      type:
+      type: json_object

 inputs:
   query:
     type: string
   response:
     type: string
-
 ---
+
 system:
-
-## Goal
-### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
-- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
-- **Data**: Your input data include QUERY and RESPONSE.
-- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the QUERY using the definitions provided.

 user:
-
-
-
-# Ratings
-## [Relevance: 1] (Irrelevant Response)
-**Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed.
-
-**Examples:**
-**Query:** What is the team preparing for?
-**Response:** I went grocery shopping yesterday evening.
-
-**Query:** When will the company's new product line launch?
-**Response:** International travel can be very rewarding and educational.
+ROLE
+====
+You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to a QUERY using the Relevance definitions provided.

-
-
+INPUT
+=====
+QUERY: {{query}}
+RESPONSE: {{response}}

-
-
-
+TASK
+====
+Output a JSON object with:
+1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the query.
+2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.

-
-
+The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
+Response format exactly as follows:

-
-
+{
+  "explanation": "<15-60 words>",
+  "score": <1-5>
+}

-**Examples:**
-**Query:** What type of food does the new restaurant offer?
-**Response:** The restaurant offers Italian food like pasta.

-
-
+EVALUATION STEPS
+================
+A. Read the QUERY and RESPONSE carefully.
+B. Compare the RESPONSE against the rubric below:
+   - Does the response directly address the query?
+   - Is the information complete, partial, or off-topic?
+   - Is it vague, generic, or insightful?
+C. Match the response to the best score from the rubric.
+D. Provide a short explanation and the score using the required format.

-
-
+SCORING RUBRIC
+==============

-
-
-**Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.
+### Score 1 - Irrelevant Response
+Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.

-
-
+**Example A**
+QUERY: What is the team preparing for?
+RESPONSE: I went grocery shopping yesterday evening.

-
-
+Expected Output:
+{
+  "explanation": "The response is entirely off-topic and doesn't address the question.",
+  "score": 1
+}

-**Examples:**
-**Query:** What type of food does the new restaurant offer?
-**Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.

-
-
+**Example B**
+QUERY: When will the company's new product line launch?
+RESPONSE: International travel can be very rewarding and educational.

+Expected Output:
+{
+  "explanation": "The response is completely irrelevant to the product launch question.",
+  "score": 1
+}
+
+
+### Score 2 – Related but Unhelpful / Superficial
+Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.
+
+**Example A**
+QUERY: What is the event about?
+RESPONSE: It’s something important.
+
+Expected Output:
+{
+  "explanation": "The response vaguely refers to the query topic but lacks specific or informative content.",
+  "score": 2
+}
+
+**Example B**
+QUERY: What’s the weather in Paris?
+RESPONSE: I tried to find the forecast but the query failed.
+
+Expected Output:
+{
+  "explanation": "The response acknowledges the query but provides no usable weather information. It is related but unhelpful.",
+  "score": 2
+}
+
+### Score 3 - Partially Relevant / Incomplete
+Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.
+
+**Example A**
+QUERY: What amenities does the new apartment complex provide?
+RESPONSE: The apartment complex has a gym.
+
+Expected Output:
+{
+  "explanation": "The response mentions one amenity but does not provide a fuller list or clarify whether other standard features (like parking or security) are included. It partially addresses the query but lacks completeness.",
+  "score": 3
+}
+
+**Example B**
+QUERY: What services does the premium membership include?
+RESPONSE: It includes priority customer support.
+
+Expected Output:
+{
+  "explanation": "The response identifies one service but omits other likely components of a premium membership (e.g., exclusive content or discounts). The information is relevant but incomplete.",
+  "score": 3
+}
+
+
+
+### Score 4 - Fully Relevant / Sufficient Response
+Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.
+
+**Example A**
+QUERY: What amenities does the new apartment complex provide?
+RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.
+
+Expected Output:
+{
+  "explanation": "The response mentions multiple key amenities likely to be relevant to most users. While it may not list every feature, it clearly conveys the core offerings of the complex.",
+  "score": 4
+}

+**Example B**
+QUERY: What services does the premium membership include?
+RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.
+
+Expected Output:
+{
+  "explanation": "The response outlines all major services expected from a premium membership. Even if a minor service is not mentioned, the core value is clearly and fully represented.",
+  "score": 4
+}

-# Data
-QUERY: {{query}}
-RESPONSE: {{response}}

+### Score 5 - Comprehensive Response with Insights
+Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.

-
-
-
-
-
+**Example A**
+QUERY: What amenities does the new apartment complex provide?
+RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.
+
+Expected Output:
+{
+  "explanation": "The response fully lists key amenities and additionally explains how these features contribute to resident experience, enhancing the usefulness of the information.",
+  "score": 5
+}

+**Example B**
+QUERY: What services does the premium membership include?
+RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases — tailored for users who want quicker resolutions and first access to new features.

-
-
+Expected Output:
+{
+  "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
+  "score": 5
+}
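
The rewritten prompt asks the judge for a bare JSON object, which the new `_do_eval` above consumes directly. A small stdlib-only sketch of that contract, assuming a well-formed judge reply and using the default threshold of 3 from the evaluator's `__init__` shown earlier:

    import json
    import math

    # Hypothetical judge reply following the prompt's required format.
    raw = '{"explanation": "The response directly lists the core amenities asked about.", "score": 4}'

    parsed = json.loads(raw)
    score = float(parsed.get("score", math.nan))
    reason = parsed.get("explanation", "")

    threshold = 3  # RelevanceEvaluator default shown in the diff above
    verdict = "pass" if score >= threshold else "fail"
    print(score, verdict, reason)  # 4.0 pass The response directly lists the core amenities asked about.
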
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

@@ -37,24 +37,24 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
-
+
    .. admonition:: Example:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
            :start-after: [START completeness_evaluator]
            :end-before: [END completeness_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
-
+
    .. admonition:: Example using Azure AI Project URL:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START completeness_evaluator]
            :end-before: [END completeness_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    """
@@ -64,7 +64,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = "response_completeness.prompty"
    _RESULT_KEY = "response_completeness"

-    id = "
+    id = "azureai://built-in/evaluators/response_completeness"

    _MIN_COMPLETENESS_SCORE = 1
    _MAX_COMPLETENESS_SCORE = 5
@@ -73,23 +73,18 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *,
-                 threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
-                 **kwargs):
+    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self.threshold = threshold
-        super().__init__(model_config=model_config,
-                         prompty_file=prompty_path,
-                         result_key=self._RESULT_KEY,
-                         **kwargs)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

    @overload
    def __call__(
-
-
-
-
+        self,
+        *,
+        ground_truth: str,
+        response: str,
    ) -> Dict[str, Union[str, float]]:
        """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
        Example usage:
@@ -111,9 +106,9 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

    @overload
    def __call__(
-
-
-
+        self,
+        *,
+        conversation: Conversation,
    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
        """Evaluate completeness for a conversation
        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
@@ -126,9 +121,9 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

    @override
    def __call__(  # pylint: disable=docstring-missing-param
-
-
-
+        self,
+        *args,
+        **kwargs,
    ):
        """
        Invokes the instance using the overloaded __call__ signature.
@@ -151,7 +146,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            raise EvaluationException(
                message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
                internal_message=f"Both ground_truth and response must be provided as input to the completeness"
-
+                f" evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.MISSING_FIELD,
                target=ErrorTarget.COMPLETENESS_EVALUATOR,
@@ -163,7 +158,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        if llm_output:
            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-            score_result =
+            score_result = "pass" if score >= self.threshold else "fail"

            # updating the result key and threshold to int based on the schema
            return {
@@ -172,5 +167,5 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                f"{self._result_key}_threshold": int(self.threshold),
                f"{self._result_key}_reason": reason,
            }
-
+
        return {self._result_key: math.nan}
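
A minimal sketch of the completeness evaluator after these changes, assuming it is imported from the package root like the other built-in evaluators; the model configuration is a placeholder, and the output keys follow the return dictionary above, with the pass/fail string computed against `self.threshold`.

    from azure.ai.evaluation import ResponseCompletenessEvaluator

    # Placeholder model configuration; substitute real values.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    completeness = ResponseCompletenessEvaluator(model_config)
    result = completeness(
        response="The premium membership includes priority customer support.",
        ground_truth="The premium membership includes priority support, exclusive content, and early releases.",
    )

    # Expected keys include "response_completeness", "response_completeness_result" ("pass"/"fail"),
    # "response_completeness_threshold", and "response_completeness_reason".
    print(result)
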
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -46,17 +46,17 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :caption: Initialize and call a RetrievalEvaluator.

    .. admonition:: Example using Azure AI Project URL:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START retrieval_evaluator]
            :end-before: [END retrieval_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
-
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_retrieval_evaluator]
            :end-before: [END threshold_retrieval_evaluator]
@@ -74,11 +74,11 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = "retrieval.prompty"
    _RESULT_KEY = "retrieval"

-    id = "
+    id = "azureai://built-in/evaluators/retrieval"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, threshold: float=3):  # pylint: disable=super-init-not-called
+    def __init__(self, model_config, *, threshold: float = 3):  # pylint: disable=super-init-not-called
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -71,13 +71,13 @@ class RougeScoreEvaluator(EvaluatorBase):
            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

    .. admonition:: Example using Azure AI Project URL:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START rouge_score_evaluator]
            :end-before: [END rouge_score_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with threshold:
@@ -90,22 +90,22 @@ class RougeScoreEvaluator(EvaluatorBase):
            :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
    """

-    id = "
+    id = "azureai://built-in/evaluators/rouge_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(
-        self,
+        self,
        rouge_type: RougeType,
        *,
        precision_threshold: float = 0.5,
        recall_threshold: float = 0.5,
-        f1_score_threshold: float = 0.5
+        f1_score_threshold: float = 0.5,
    ):
        self._rouge_type = rouge_type
        self._higher_is_better = True
        super().__init__()
-
+
        # Type checking for threshold parameters
        for name, value in [
            ("precision_threshold", precision_threshold),
@@ -114,7 +114,7 @@ class RougeScoreEvaluator(EvaluatorBase):
        ]:
            if not isinstance(value, float):
                raise TypeError(f"{name} must be a float, got {type(value)}")
-
+
        self._threshold = {
            "precision": precision_threshold,
            "recall": recall_threshold,
@@ -122,10 +122,10 @@ class RougeScoreEvaluator(EvaluatorBase):
        }

    def _get_binary_result(
-
-
-
-
+        self,
+        rouge_precision: float,
+        rouge_recall: float,
+        rouge_f1_score: float,
    ) -> Dict[str, bool]:
        """
        Get binary result based on the threshold.
@@ -150,22 +150,22 @@ class RougeScoreEvaluator(EvaluatorBase):
        precision_valid = not math.isnan(rouge_precision)
        recall_valid = not math.isnan(rouge_recall)
        f1_valid = not math.isnan(rouge_f1_score)
-
+
        if self._higher_is_better:
            if precision_valid:
-                results["rouge_precision_result"] =
+                results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
            if recall_valid:
-                results["rouge_recall_result"] =
+                results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
            if f1_valid:
-                results["rouge_f1_score_result"] =
+                results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
        else:
            if precision_valid:
-                results["rouge_precision_result"] =
+                results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
            if recall_valid:
-                results["rouge_recall_result"] =
+                results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
            if f1_valid:
-                results["rouge_f1_score_result"] =
-
+                results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
        return results

    @override
@@ -187,9 +187,9 @@ class RougeScoreEvaluator(EvaluatorBase):
            "rouge_f1_score_result": False,
        }
        # Convert metrics to floats, using nan for None or non-convertible values
-        rouge_precision = float(metrics.precision) if metrics.precision is not None else float(
-        rouge_recall = float(metrics.recall) if metrics.recall is not None else float(
-        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float(
+        rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+        rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
        binary_results = self._get_binary_result(
            rouge_precision=rouge_precision,
            rouge_recall=rouge_recall,
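
A short sketch of the per-metric thresholding these hunks finalize; the response and ground truth strings are arbitrary examples, and the four-gram rouge type mirrors the docstring caption above.

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    rouge = RougeScoreEvaluator(
        rouge_type=RougeType.ROUGE_4,
        precision_threshold=0.5,
        recall_threshold=0.5,
        f1_score_threshold=0.5,
    )

    result = rouge(
        response="The capital of France is Paris.",
        ground_truth="Paris is the capital of France.",
    )

    # Alongside rouge_precision / rouge_recall / rouge_f1_score, the result carries boolean
    # rouge_precision_result / rouge_recall_result / rouge_f1_score_result flags computed
    # against the thresholds, as in _get_binary_result above.
    print(result)
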
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -24,9 +24,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

    :param credential: The credential for connecting to Azure AI project. Required
    :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
    :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
    :type threshold: int
    :param kwargs: Additional arguments to pass to the evaluator.
@@ -42,13 +42,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
            :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

    .. admonition:: Example using Azure AI Project URL:
-
+
        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
            :start-after: [START groundedness_pro_evaluator]
            :end-before: [END groundedness_pro_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+           :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with threshold:
@@ -66,8 +66,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
        for the groundedness pro label will be "groundedness_pro_passing_rate".
    """

-    id = "
+    id = "azureai://built-in/evaluators/groundedness_pro"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(