azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation was flagged as possibly problematic by the registry.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
- azure/ai/evaluation/_evaluate/_utils.py +1 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -2
- azure/ai/evaluation/red_team/_red_team.py +838 -478
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -7,7 +7,9 @@ import json
 import logging
 import os
 import re
-
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -611,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)

@@ -618,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
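The reindex above is what back-fills rows the target function failed on: the output frame is aligned to the full input range, and any missing row becomes NaN. A minimal standalone pandas sketch of the same pattern (toy column names, not the SDK's):

    import pandas as pd

    # Toy stand-ins: 4 input rows, but the target produced output only for rows 0, 1 and 3.
    initial_data = pd.DataFrame({"query": ["a", "b", "c", "d"]})
    target_output = pd.DataFrame(
        {"line_number": [0, 1, 3], "outputs.response": ["ra", "rb", "rd"]}
    ).set_index("line_number")

    # Align to the full 0..n-1 range, like reindex(complete_index) above; row 2 becomes NaN.
    complete_index = pd.RangeIndex(len(initial_data))
    target_output = target_output.reindex(complete_index)

    # Both frames now have 4 rows, so a column-wise concat lines up row for row.
    merged = pd.concat([initial_data, target_output.reset_index(drop=True)], axis=1)
    print(merged)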
@@ -626,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)

     return target_output, generated_columns, run

@@ -645,7 +666,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
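The replacement pattern widens what a column-mapping reference may look like: `${data.<name>}` and `${target.<name>}` references can now contain dot-separated sub-fields. A quick standalone check of the new regex (the example references are illustrative):

    import re

    # Updated pattern from the hunk above.
    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

    for ref in ["${data.query}", "${data.item.context}", "${target.response}", "${oops.query}"]:
        print(ref, bool(expected_references.match(ref)))
    # ${data.query} True / ${data.item.context} True / ${target.response} True / ${oops.query} False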
@@ -855,6 +876,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
         **kwargs,
     )

@@ -962,6 +984,7 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
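These two hunks thread `fail_on_evaluator_errors` from `_evaluate` into `_preprocess_data`, where it later feeds `RunSubmitterClient(raise_on_errors=...)`. A hedged usage sketch, assuming the public `evaluate()` entry point exposes the same flag that is being threaded here (dataset path and evaluator choice are placeholders):

    # Hedged sketch: assumes evaluate() accepts fail_on_evaluator_errors,
    # as the internal plumbing in this hunk suggests.
    from azure.ai.evaluation import evaluate, BleuScoreEvaluator

    result = evaluate(
        data="eval_data.jsonl",                      # hypothetical local dataset
        evaluators={"bleu": BleuScoreEvaluator()},
        fail_on_evaluator_errors=True,               # raise instead of emitting NaN rows
    )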
@@ -995,15 +1018,49 @@ def _preprocess_data(
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-
-
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df

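`get_client_type` chooses among three batch clients based on two private kwargs. A standalone restatement of the same decision rules (not the SDK's code), useful for reading the truth table at a glance:

    from typing import Literal, Optional

    def pick_client(use_run_submitter: Optional[bool], use_pf: Optional[bool]) -> Literal["run_submitter", "pf_client", "code_client"]:
        """Restates the selection rules of get_client_type above with plain arguments."""
        if use_run_submitter is None and use_pf is None:
            return "run_submitter"            # default when neither flag is set
        if use_run_submitter and use_pf:
            raise ValueError("Only one of the two flags may be True.")
        if use_run_submitter is False and use_pf is False:
            return "code_client"
        if use_run_submitter:
            return "run_submitter"
        if use_pf:
            return "pf_client"
        # one flag is None, the other False: fall back to the remaining client
        return "run_submitter" if use_pf is False else "pf_client"

    print(pick_client(None, None))   # run_submitter
    print(pick_client(None, True))   # pf_client
    print(pick_client(False, None))  # pf_client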
@@ -1013,17 +1070,50 @@ def _preprocess_data(
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )

-
-
-
-
-
-
-
-
-
-
-
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference

     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
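The ProxyClient branch serializes the complete, already-merged dataframe to a temporary JSONL file so evaluators see every row, including the NaN-filled failures. A minimal sketch of that serialization step in isolation (toy dataframe; in the SDK the file is deleted later in a finally block):

    import json
    import tempfile

    import pandas as pd

    # Toy frame standing in for the merged input_data_df (row 1 has a NaN output).
    df = pd.DataFrame({"query": ["a", "b"], "__outputs.response": ["ok", float("nan")]})

    # Same pattern as the hunk above: one JSON object per line in a temp .jsonl file.
    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    for _, row in df.iterrows():
        temp_file.write(json.dumps(row.to_dict()) + "\n")
    temp_file.close()

    print(temp_file.name)  # the caller is responsible for removing this file afterwards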
@@ -1062,30 +1152,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }

-
-
-
-
-
-
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
            }
-            for evaluator_name, run in runs.items()
-        }

+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
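For orientation, `per_evaluator_results` ends up with one entry per evaluator, keyed by the three fields shown above. A toy illustration of its shape (the evaluator name and metric values are made up; `result` is really the DataFrame returned by `get_details(..., all_results=True)`):

    per_evaluator_results = {
        "my_evaluator": {                      # hypothetical evaluator name
            "result": "<pandas DataFrame from batch_run_client.get_details(run, all_results=True)>",
            "metrics": {"some_metric": 0.87},  # made-up metric name and value
            "run_summary": {"completed_lines": 9, "failed_lines": 1},
        },
    }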
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -353,6 +353,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader,
     )

     id_map = {

@@ -361,6 +362,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
     }

     for key in id_map.keys():
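These two hunks register the new `AzureOpenAIPythonGrader` (implemented in the new `_aoai/python_grader.py` module listed above) in `_get_grader_class`'s id-to-class map. A hedged sketch of the lookup idiom with stand-in classes, since the grader's actual constructor is not part of this diff:

    # Stand-in classes only; the real AzureOpenAIPythonGrader lives in
    # azure/ai/evaluation/_aoai/python_grader.py and is not shown in this diff.
    class StubScoreModelGrader:
        id = "aoai://score_model"   # hypothetical id value

    class StubPythonGrader:
        id = "aoai://python"        # hypothetical id value

    id_map = {cls.id: cls for cls in (StubScoreModelGrader, StubPythonGrader)}

    def get_grader_class(model_id: str):
        """Mirror of the lookup idiom in _get_grader_class: resolve a grader class by its id."""
        if model_id not in id_map:
            raise ValueError(f"Unknown grader id: {model_id}")
        return id_map[model_id]

    print(get_grader_class("aoai://python").__name__)  # StubPythonGrader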
azure/ai/evaluation/_evaluate/_utils.py

@@ -178,7 +178,6 @@ def _log_metrics_and_instance_results_onedp(

     properties = {
         EvaluationRunProperties.RUN_TYPE: "eval_run",
-        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
         EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
     }

@@ -191,6 +190,7 @@ def _log_metrics_and_instance_results_onedp(
     upload_run_response = client.start_evaluation_run(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
+            properties=properties,
         )
     )

@@ -202,7 +202,6 @@ def _log_metrics_and_instance_results_onedp(
             outputs={
                 "evaluationResultId": create_evaluation_result_response.id,
             },
-            properties=properties,
         ),
     )

azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
             :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self, *, threshold=0.5):
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -79,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         for the code vulnerability will be "code_vulnerability_label".
     """

-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )

     @overload
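`CodeVulnerabilityEvaluator` now accepts `**kwargs` and defaults `evaluate_query` to True. A hedged usage sketch (the project endpoint is a placeholder; pass `evaluate_query=False` to keep the old response-only behaviour):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CodeVulnerabilityEvaluator

    evaluator = CodeVulnerabilityEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
        evaluate_query=False,  # opt out of the new default
    )
    result = evaluator(
        query="Write a SQL query builder",
        response="def build(q): return 'SELECT ' + q",
    )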
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -62,7 +62,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"

-    id = "
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
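Across this release the built-in evaluators move their cloud `id` to a uniform `azureai://built-in/evaluators/<name>` scheme. A quick check against two of the classes changed above (requires azure-ai-evaluation 1.10.0 installed):

    from azure.ai.evaluation import BleuScoreEvaluator, CoherenceEvaluator

    # id is a class attribute, so no instantiation is needed.
    print(BleuScoreEvaluator.id)   # azureai://built-in/evaluators/bleu_score
    print(CoherenceEvaluator.id)   # azureai://built-in/evaluators/coherence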
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,34 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)

 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._exceptions import
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental

@@ -176,7 +196,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons.extend([p for p in params if p != "self"])
         return singletons

-    def _derive_conversation_converter(
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.

@@ -235,7 +257,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

-    def _derive_multi_modal_conversation_converter(
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.

@@ -288,7 +312,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return multi_modal_converter

-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """

     @override

@@ -56,6 +59,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,

@@ -67,6 +71,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override

@@ -101,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)

         conversation = eval_input.get("conversation", None)

@@ -131,17 +138,20 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         return result

     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query =
-        response =
-        if
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)

         if "context" in self._singleton_inputs:
             context = eval_input.get("context", None)
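With the change above, the RAI-service payload contains the query only when the evaluator was constructed with `evaluate_query=True`. A standalone restatement of the payload assembly (not SDK code):

    def build_input_data(query, response, evaluate_query: bool) -> dict:
        """Mirrors the input_data construction in _evaluate_query_response above."""
        input_data = {"response": str(response)}
        if query is not None and evaluate_query:
            input_data["query"] = str(query)
        return input_data

    print(build_input_data("How do I ...?", "You could ...", evaluate_query=False))
    # {'response': 'You could ...'}
    print(build_input_data("How do I ...?", "You could ...", evaluate_query=True))
    # {'response': 'You could ...', 'query': 'How do I ...?'}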
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.

@@ -66,8 +68,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """

-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     def __init__(
         self,

@@ -90,11 +93,18 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)

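`ContentSafetyEvaluator` forwards `evaluate_query` to all four child evaluators, as shown above. A hedged usage sketch (the project endpoint is a placeholder):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    safety = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
        evaluate_query=True,  # include the query in each sub-evaluation
    )
    scores = safety(
        query="Tell me about fireworks safety.",
        response="Always follow local regulations and keep a safe distance.",
    )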
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """

-    id = "
+    id = "azureai://built-in/evaluators/hate_unfairness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,

@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """

-    id = "
+    id = "azureai://built-in/evaluators/self_harm"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,

@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a SexualEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/sexual"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,

@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload

@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The
+        :return: The sexual score.
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ViolenceEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/violence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,

@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload