azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -7,7 +7,9 @@ import json
 import logging
 import os
 import re
-
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -462,7 +464,7 @@ def _validate_columns_for_evaluators(
         )
 
 
-def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
     if data is None:
         msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
@@ -611,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -618,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -626,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)
 
     return target_output, generated_columns, run
 
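Net effect of the _apply_target_to_data hunks above: the target output is reindexed against the complete set of input line numbers before the positional concat, so rows the target failed on come back as NaN instead of shifting later rows out of alignment. A minimal pandas sketch of that behavior; the column names and values below are hypothetical, not taken from the package:

import pandas as pd

# The target produced output only for line numbers 0 and 2 out of 3 input rows.
initial_data = pd.DataFrame({"query": ["a", "b", "c"]})
target_output = pd.DataFrame({"__outputs.response": ["ok-a", "ok-c"]}, index=[0, 2])

# Reindex against the full 0..N-1 line-number range; the failed row becomes NaN.
target_output = target_output.reindex(range(len(initial_data)))

# Both frames now have the same length, so positional concatenation stays aligned.
combined = pd.concat([initial_data, target_output.reset_index(drop=True)], axis=1)
print(combined)
#   query __outputs.response
# 0     a               ok-a
# 1     b                NaN
# 2     c               ok-c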
@@ -645,7 +666,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
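For reference, the tightened pattern above accepts whole ${target.*} and ${data.*} references, now including nested dotted paths, and rejects anything else. A small sketch; the example column names are hypothetical:

import re

expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

assert expected_references.match("${data.query}")
assert expected_references.match("${target.response}")
assert expected_references.match("${data.item.context}")     # nested path now allowed
assert not expected_references.match("${foo.query}")         # only target/data prefixes
assert not expected_references.match("${data.query} extra")  # must match the whole string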
@@ -704,6 +725,7 @@ def evaluate(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -736,6 +758,10 @@ def evaluate(
         Defaults to false, which means that evaluations will continue regardless of failures.
         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+        Keys and values must be strings. For more information about tag limits, see:
+        https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+    :paramtype tags: Optional[Dict[str, str]]
     :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
     :paramtype user_agent: Optional[str]
     :return: Evaluation results.
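A minimal usage sketch of the new tags keyword documented above. The data path, evaluator choice, and project endpoint are placeholders; tags only take effect when results are logged to an Azure AI project:

from azure.ai.evaluation import evaluate, F1ScoreEvaluator

result = evaluate(
    data="eval_data.jsonl",  # hypothetical dataset
    evaluators={"f1": F1ScoreEvaluator()},
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder endpoint
    tags={"team": "search-quality", "experiment": "baseline-v2"},  # string keys and values only
)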
@@ -772,6 +798,7 @@ def evaluate(
             azure_ai_project=azure_ai_project,
             output_path=output_path,
             fail_on_evaluator_errors=fail_on_evaluator_errors,
+            tags=tags,
             **kwargs,
         )
     except Exception as e:
@@ -840,6 +867,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
@@ -855,6 +883,8 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
+        tags=tags,
         **kwargs,
     )
 
@@ -934,7 +964,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     name_map = _map_names_to_builtins(evaluators, graders)
     if is_onedp_project(azure_ai_project):
         studio_url = _log_metrics_and_instance_results_onedp(
-            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
         )
     else:
         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
@@ -942,7 +972,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         studio_url = None
         if trace_destination:
             studio_url = _log_metrics_and_instance_results(
-                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
             )
 
     result_df_dict = results_df.to_dict("records")
@@ -962,6 +992,8 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
@@ -969,7 +1001,7 @@ def _preprocess_data(
         evaluator_config = {}
 
     input_data_df = _validate_and_load_data(
-        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
     )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
@@ -995,15 +1027,49 @@ def _preprocess_data(
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
 
-
-
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df
 
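For context, a sketch of how the private flags resolved by get_client_type above map to a client: both unset selects run_submitter (the new default), both True raises INVALID_VALUE, both False selects code_client, setting exactly one to True selects that client, and explicitly disabling only one selects the other. These underscore-prefixed kwargs are internal and may change:

from azure.ai.evaluation import evaluate, F1ScoreEvaluator

# Example only: force the legacy promptflow-based ProxyClient path.
result = evaluate(
    data="eval_data.jsonl",  # hypothetical dataset
    evaluators={"f1": F1ScoreEvaluator()},
    _use_pf_client=True,  # internal flag; omit it to get the RunSubmitterClient default
)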
@@ -1013,17 +1079,50 @@ def _preprocess_data(
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
-
-
-
-
-
-
-
-
-
-
-
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -1062,30 +1161,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
 
-
-
-
-
-
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
             }
-            for evaluator_name, run in runs.items()
-        }
 
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}

azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client,
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@ def _get_single_run_results(
     )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
    if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,25 @@ def _get_single_run_results(
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    #
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
-                if name in ["name"]:
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    #
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +289,67 @@ def _get_single_run_results(
                         listed_results[formatted_column_name] = []
                     listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-
-
-
-
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
 
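A small pandas sketch of the padding performed above, assuming a hypothetical grader named my_grader that returned results for two of three expected rows:

import pandas as pd

output_df = pd.DataFrame(
    {"outputs.my_grader.score": [0.9, 0.4], "outputs.my_grader.passed": [True, False]},
    index=[0, 2],  # datasource_item_ids that came back
)
expected = 3

output_df = output_df.reindex(range(expected))     # row 1 becomes all-NaN
missing_index_mask = output_df.isna().all(axis=1)  # [False, True, False]
output_df["outputs.my_grader.row_missing"] = False
output_df.loc[missing_index_mask, "outputs.my_grader.row_missing"] = True
output_df.reset_index(drop=True, inplace=True)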
@@ -353,6 +390,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader,
     )
 
     id_map = {
@@ -361,6 +399,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
     }
 
     for key in id_map.keys():
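The two hunks above only register the new grader with _get_grader_class; its constructor lives in the new _aoai/python_grader.py (not shown in this diff). A minimal sketch, assuming the class is re-exported from the package root like the other AOAI graders:

from azure.ai.evaluation import AzureOpenAIPythonGrader

# _get_grader_class dispatches on the class-level id string, so a serialized grader
# config whose id matches is rehydrated into AzureOpenAIPythonGrader.
print(AzureOpenAIPythonGrader.id)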
@@ -404,8 +443,15 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]
 
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
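A sketch of how the rewritten helper above resolves mappings, using hypothetical grader names: each grader gets its own entry from column_mappings, falls back to the default entry otherwise, and every grader is paired with None when no mappings are supplied at all:

graders = {"relevance_grader": ..., "safety_grader": ...}  # grader objects elided
column_mappings = {
    "default": {"query": "${data.query}"},
    "relevance_grader": {"query": "${data.query}", "response": "${data.response}"},
}

# _get_graders_and_column_mappings(graders, column_mappings) now yields, in order:
#   ({"relevance_grader": ...}, {"query": "${data.query}", "response": "${data.response}"})
#   ({"safety_grader": ...},    {"query": "${data.query}"})
# With column_mappings=None, every grader is paired with None instead.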
azure/ai/evaluation/_evaluate/_utils.py
@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -178,7 +179,6 @@ def _log_metrics_and_instance_results_onedp(
 
     properties = {
         EvaluationRunProperties.RUN_TYPE: "eval_run",
-        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
         EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
     }
@@ -191,6 +191,8 @@ def _log_metrics_and_instance_results_onedp(
     upload_run_response = client.start_evaluation_run(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
+            properties=properties,
+            tags=tags,
         )
     )
 
@@ -202,7 +204,6 @@ def _log_metrics_and_instance_results_onedp(
             outputs={
                 "evaluationResultId": create_evaluation_result_response.id,
             },
-            properties=properties,
         ),
     )
 
@@ -216,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -245,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 

azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
         :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self, *, threshold=0.5):

azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -79,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         for the code vulnerability will be "code_vulnerability_label".
     """
 
-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
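A usage sketch of the change above: evaluate_query now defaults to True and is forwarded to the RAI-service base class, so passing evaluate_query=False restores the previous behavior. The project endpoint is a placeholder:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

evaluator = CodeVulnerabilityEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
    evaluate_query=False,  # opt out of the new default
)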
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -62,11 +62,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
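A usage sketch of the new optional credential parameter, which is forwarded to the prompty-based base class (for example, for keyless authentication against Azure OpenAI). The endpoint and deployment values are placeholders:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<aoai-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment-name>",                       # placeholder
}
coherence = CoherenceEvaluator(model_config=model_config, threshold=3, credential=DefaultAzureCredential())
result = coherence(query="What is the capital of France?", response="Paris is the capital of France.")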