azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
- azure/ai/evaluation/_aoai/label_grader.py +2 -2
- azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
- azure/ai/evaluation/_common/__init__.py +3 -1
- azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
- azure/ai/evaluation/_common/onedp/operations/_operations.py +4 -2
- azure/ai/evaluation/_common/rai_service.py +7 -6
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +15 -17
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +14 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +264 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +503 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +69 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +237 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -0
- azure/ai/evaluation/red_team/_red_team.py +572 -207
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +570 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +5 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +2 -2
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +3 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +15 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/METADATA +35 -3
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/RECORD +69 -61
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py

@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
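The new import block above tolerates the agents models moving between SDK packages. A minimal standalone sketch of the same fallback pattern; the sentinel and the final guard are illustrative additions, not part of the package:

    RunStepFunctionToolCall = None  # illustrative sentinel for "no agents SDK installed"
    try:
        from azure.ai.projects.models import RunStepFunctionToolCall  # older location
    except ImportError:
        pass
    try:
        from azure.ai.agents.models import RunStepFunctionToolCall  # newer location
    except ImportError:
        pass

    if RunStepFunctionToolCall is None:
        # Neither package is available; callers should degrade gracefully here.
        print("agents models unavailable; tool-call conversion will be limited")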
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
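The two dictionaries above give built-in tools a description and a parameter schema even though the service hides them. A hedged sketch of how such metadata could be folded into a tool-definition dict for evaluation; the helper name and the trimmed one-entry dictionaries are invented for illustration:

    # Illustration only: trimmed stand-ins for the _BUILT_IN_DESCRIPTIONS / _BUILT_IN_PARAMS dicts above.
    DESCRIPTIONS = {"bing_grounding": "Enhance model output with web data."}
    PARAMS = {
        "bing_grounding": {
            "type": "object",
            "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
        }
    }

    def describe_built_in_tool(tool_type: str) -> dict:
        # Hypothetical helper: the shape mirrors the ToolDefinition fields
        # (name, type, description, parameters) used elsewhere in this module.
        return {
            "name": tool_type,
            "type": tool_type,
            "description": DESCRIPTIONS.get(tool_type, ""),
            "parameters": PARAMS.get(tool_type, {}),
        }

    print(describe_built_in_tool("bing_grounding")["description"])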
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.

@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
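ToolDefinition is a pydantic model, so the added `type` field becomes a required string on construction. A small self-contained sketch using a stand-in model with the same fields; all values are illustrative:

    from typing import Optional
    from pydantic import BaseModel

    class ToolDefinitionSketch(BaseModel):
        # Mirrors the fields shown in the hunk above: name, type, description, parameters.
        name: str
        type: str
        description: Optional[str] = None
        parameters: dict

    tool = ToolDefinitionSketch(
        name="fetch_weather",  # illustrative values throughout
        type="function",
        description="Fetch the weather for a location.",
        parameters={"type": "object", "properties": {"location": {"type": "string"}}},
    )
    print(tool.type)  # -> function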
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
         arguments = {
             "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
         }
+    elif tool_call.details["type"] == "azure_ai_search":
+        arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+    elif tool_call.details["type"] == "fabric_dataagent":
+        arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
     else:
         # unsupported tool type, skip
         return messages
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type ==
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
@@ -16,6 +16,7 @@ from azure.ai.evaluation import (
|
|
|
16
16
|
CodeVulnerabilityEvaluator,
|
|
17
17
|
CoherenceEvaluator,
|
|
18
18
|
ContentSafetyEvaluator,
|
|
19
|
+
DocumentRetrievalEvaluator,
|
|
19
20
|
F1ScoreEvaluator,
|
|
20
21
|
FluencyEvaluator,
|
|
21
22
|
GleuScoreEvaluator,
|
|
@@ -45,6 +46,7 @@ EVAL_CLASS_MAP = {
     CodeVulnerabilityEvaluator: "code_vulnerability",
     CoherenceEvaluator: "coherence",
     ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
     ECIEvaluator: "eci",
     F1ScoreEvaluator: "f1_score",
     FluencyEvaluator: "fluency",
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -141,7 +141,6 @@ def _aggregate_content_safety_metrics(
     module = inspect.getmodule(evaluators[evaluator_name])
     if (
         module
-        and module.__name__.startswith("azure.ai.evaluation.")
         and metric_name.endswith("_score")
         and metric_name.replace("_score", "") in content_safety_metrics
     ):
@@ -739,7 +738,17 @@ def evaluate(
            :end-before: [END evaluate_method]
            :language: python
            :dedent: 8
-           :caption: Run an evaluation on local data with
+           :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
         return _evaluate(
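The added docstring example documents passing the project endpoint URL in place of an AzureAIProject object. A hedged sketch of what such a call might look like; the dataset path, model configuration, and URL placeholders are illustrative, and the model config is incomplete without credentials:

    from azure.ai.evaluation import evaluate, CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<aoai-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<chat-deployment>",                       # placeholder
        # api_key or Entra ID credentials omitted here
    }

    result = evaluate(
        data="data.jsonl",  # JSONL rows with the columns the evaluators expect
        evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
        # Either an AzureAIProject dict or, per the new docs, a project URL of the form
        # https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )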
@@ -978,17 +987,6 @@ def _preprocess_data(
     # Split normal evaluators and OAI graders
     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
 
-    input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
-    )
-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
     target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
     batch_run_client: BatchClient
@@ -1135,8 +1133,8 @@ def _map_names_to_builtins(
 ) -> Dict[str, str]:
     """
     Construct a mapping from user-supplied evaluator names to which known, built-in
-    evaluator or grader they refer to. Custom
-
+    evaluator or grader they refer to. Custom evaluators are excluded from the mapping
+    as we only want to track built-in evaluators and graders.
 
     :param evaluators: The dictionary of evaluators.
     :type evaluators: Dict[str, Callable]
@@ -1158,8 +1156,8 @@ def _map_names_to_builtins(
             found_eval = True
             break
         if not found_eval:
-            #
-
+            # Skip custom evaluators - we only want to track built-in evaluators
+            pass
 
     for name, grader in graders.items():
         name_map[name] = grader.id
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -208,7 +208,7 @@ def _get_single_run_results(
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-            + " failed with status {run_results.status}.",
+            + f" failed with status {run_results.status}.",
             blame=ErrorBlame.UNKNOWN,
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
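The one-character change matters: without the `f` prefix the braces in the second fragment are emitted literally instead of interpolating the run status. A quick standalone illustration:

    from types import SimpleNamespace

    run_results = SimpleNamespace(status="failed")

    # Before the fix: plain string, braces survive verbatim.
    print("Run 1/2" + " failed with status {run_results.status}.")
    # Run 1/2 failed with status {run_results.status}.

    # After the fix: the f-string interpolates the actual status.
    print("Run 1/2" + f" failed with status {run_results.status}.")
    # Run 1/2 failed with status failed.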
@@ -217,6 +217,16 @@ def _get_single_run_results(
         + " completed successfully. Gathering results...")
     # Convert run results into a dictionary of metrics
     run_metrics = {}
+    if run_results.per_testing_criteria_results is None:
+        msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
+            " occur when invalid or conflicting models are selected in the model and grader configs."
+            f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.AOAI_GRADER,
+        )
     for criteria_result in run_results.per_testing_criteria_results:
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
@@ -240,8 +250,12 @@ def _get_single_run_results(
         eval_id=run_info["eval_group_id"],
         run_id=run_info["eval_run_id"]
     )
-    listed_results = {}
+    listed_results = {"index": []}
+    # raw data has no order guarantees, we need to sort them by their
+    # datasource_item_id
     for row_result in raw_list_results.data:
+        # Add the datasource_item_id for later sorting
+        listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
|
|
|
251
265
|
# create a `_result` column for each grader
|
|
252
266
|
result_column_name = f"outputs.{grader_name}.{grader_name}_result"
|
|
253
267
|
if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
|
|
254
|
-
|
|
268
|
+
if (result_column_name not in listed_results):
|
|
269
|
+
listed_results[result_column_name] = []
|
|
270
|
+
listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
|
|
255
271
|
|
|
256
272
|
formatted_column_name = f"outputs.{grader_name}.{name}"
|
|
257
273
|
if (formatted_column_name not in listed_results):
|
|
258
274
|
listed_results[formatted_column_name] = []
|
|
259
|
-
listed_results[
|
|
275
|
+
listed_results[formatted_column_name].append(value)
|
|
260
276
|
output_df = pd.DataFrame(listed_results)
|
|
261
|
-
|
|
277
|
+
# sort by index
|
|
278
|
+
output_df = output_df.sort_values('index', ascending=[True])
|
|
279
|
+
# remove index column
|
|
280
|
+
output_df.drop(columns=["index"], inplace=True)
|
|
262
281
|
return output_df, run_metrics
|
|
263
282
|
|
|
264
283
|
|
|
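Because the listed AOAI row results come back in no guaranteed order, the new code tags each row with its datasource_item_id, sorts on that helper column, and then drops it. A small self-contained sketch of that pattern with toy data:

    import pandas as pd

    # Toy stand-in for rows returned out of order by the results API.
    listed_results = {"index": [2, 0, 1], "outputs.grader.score": [0.2, 0.9, 0.5]}

    output_df = pd.DataFrame(listed_results)
    output_df = output_df.sort_values("index", ascending=True)  # restore dataset order
    output_df.drop(columns=["index"], inplace=True)             # remove the helper column
    print(output_df.reset_index(drop=True))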
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
            :end-before: [END bleu_score_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call an BleuScoreEvaluator.
+           :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
            :end-before: [END code_vulnerability_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START code_vulnerability_evaluator]
+            :end-before: [END code_vulnerability_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :end-before: [END coherence_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :end-before: [END threshold_coherence_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and
+           :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :end-before: [END content_safety_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
     .. admonition:: Example with Threshold:
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :end-before: [END threshold_content_safety_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and call a ContentSafetyEvaluator.
+           :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
     id = "content_safety"
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :end-before: [END hate_unfairness_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
-
-    .. admonition:: Example
+           :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START hate_unfairness_evaluator]
+            :end-before: [END hate_unfairness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
+    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_hate_unfairness_evaluator]
            :end-before: [END threshold_hate_unfairness_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
+           :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
     id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :end-before: [END self_harm_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/
-            :start-after: [START
-            :end-before: [END
+           :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START self_harm_evaluator]
+            :end-before: [END self_harm_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize
+           :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
     id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize and call a SexualEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START sexual_evaluator]
+            :end-before: [END sexual_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize and call a ViolenceEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START violence_evaluator]
+            :end-before: [END violence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -4,7 +4,7 @@
 import math
 import operator
 from itertools import starmap
-from typing import Dict, List, TypedDict, Tuple, Optional
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._exceptions import EvaluationException
 from typing_extensions import override, overload
@@ -30,8 +30,18 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
            :end-before: [END document_retrieval_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call a
+           :caption: Initialize and call a DocumentRetrievalEvaluator
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_document_retrieval_evaluator]
@@ -46,7 +56,13 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         *,
         ground_truth_label_min: int = 0,
         ground_truth_label_max: int = 4,
-
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50
     ):
         super().__init__()
         self.k = 3
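The constructor now takes one keyword threshold per metric rather than a single threshold dict. A hedged, construction-only sketch with the new arguments; the threshold values are illustrative, not recommended settings:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_retrieval_eval = DocumentRetrievalEvaluator(
        ground_truth_label_min=0,
        ground_truth_label_max=4,
        ndcg_threshold=0.6,                        # illustrative values throughout
        xdcg_threshold=60.0,
        fidelity_threshold=0.6,
        top1_relevance_threshold=60.0,
        top3_max_relevance_threshold=60.0,
        total_retrieved_documents_threshold=40,
        total_ground_truth_documents_threshold=40,
    )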
@@ -71,27 +87,19 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         self.ground_truth_label_max = ground_truth_label_max
 
         # The default threshold for metrics where higher numbers are better.
-        self._threshold_metrics = {
-            "ndcg@3":
-            "xdcg@3":
-            "fidelity":
-            "top1_relevance":
-            "top3_max_relevance":
-            "total_retrieved_documents":
-            "total_ground_truth_documents":
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
         }
 
         # Ideally, the number of holes should be zero.
         self._threshold_holes = {"holes": 0, "holes_ratio": 0}
 
-        if threshold and not isinstance(threshold, dict):
-            raise EvaluationException(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-
-        elif isinstance(threshold, dict):
-            self._threshold_metrics.update(threshold)
-
     def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
         """
         The number of documents retrieved from a search query which have no provided ground-truth label.
@@ -214,22 +222,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
 
     def _get_binary_result(self, **metrics) -> Dict[str, float]:
-        result = {}
+        result: Dict[str, Any] = {}
 
         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] =
-
-                )
-                result[f"{metric_name}_threshold"] = self._threshold_metrics[
-                    metric_name
-                ]
+                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True
 
             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] =
-                    metric_value <= self._threshold_holes[metric_name]
-                )
+                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False
 
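With this rewrite each metric yields a string "pass"/"fail" result next to its threshold and direction. A detached, self-contained sketch of the same comparison logic, with illustrative thresholds:

    from typing import Any, Dict

    threshold_metrics = {"ndcg@3": 0.5, "fidelity": 0.5}  # higher is better
    threshold_holes = {"holes": 0, "holes_ratio": 0}       # lower is better

    def get_binary_result(metrics: Dict[str, float]) -> Dict[str, Any]:
        result: Dict[str, Any] = {}
        for name, value in metrics.items():
            if name in threshold_metrics:
                result[f"{name}_result"] = "pass" if value >= threshold_metrics[name] else "fail"
                result[f"{name}_threshold"] = threshold_metrics[name]
                result[f"{name}_higher_is_better"] = True
            elif name in threshold_holes:
                result[f"{name}_result"] = "pass" if value <= threshold_holes[name] else "fail"
                result[f"{name}_threshold"] = threshold_holes[name]
                result[f"{name}_higher_is_better"] = False
        return result

    print(get_binary_result({"ndcg@3": 0.72, "holes": 1}))
    # {'ndcg@3_result': 'pass', ..., 'holes_result': 'fail', ...}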
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -38,6 +38,16 @@ class F1ScoreEvaluator(EvaluatorBase):
            :dedent: 8
            :caption: Initialize and call an F1ScoreEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -44,6 +44,16 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize with threshold and call a FluencyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -43,6 +43,16 @@ class GleuScoreEvaluator(EvaluatorBase):
            :language: python
            :dedent: 8
            :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"