azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (64)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/rai_service.py +3 -3
  5. azure/ai/evaluation/_common/utils.py +74 -17
  6. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  7. azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
  8. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
  9. azure/ai/evaluation/_evaluate/_utils.py +1 -2
  10. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  11. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  12. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
  13. azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
  14. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
  15. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  16. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  17. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  18. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  19. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  20. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  21. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  22. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  23. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  24. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  25. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
  26. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
  27. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
  31. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  32. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
  33. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
  34. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  35. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  36. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  37. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
  38. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  39. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
  40. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
  41. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  42. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  43. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  44. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  45. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  46. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  47. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  48. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  49. azure/ai/evaluation/_version.py +1 -1
  50. azure/ai/evaluation/red_team/__init__.py +2 -2
  51. azure/ai/evaluation/red_team/_red_team.py +838 -478
  52. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  53. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
  54. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  55. azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
  56. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  57. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
  58. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
  59. azure/ai/evaluation/simulator/_simulator.py +12 -0
  60. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
  61. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
  62. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  63. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,9 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -611,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -618,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -626,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([target_output, initial_data], axis=1)
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)
 
     return target_output, generated_columns, run
 
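For illustration, the reindex-plus-concat behavior above on a toy frame (plain pandas; the "__outputs." column name mirrors the Prefixes.TSG_OUTPUTS rename and is illustrative only):

import pandas as pd

# Suppose the target ran over 3 input rows but only line numbers 0 and 2 completed.
initial_data = pd.DataFrame({"query": ["q0", "q1", "q2"]})
target_output = pd.DataFrame({"__outputs.response": ["r0", "r2"]}, index=[0, 2])

# Reindexing against the full 0..N-1 line-number range inserts a NaN row for the
# failed line, so both frames are the same length before the axis=1 concat.
target_output = target_output.reindex(range(len(initial_data)))
combined = pd.concat([initial_data, target_output], axis=1)
print(combined)
#   query __outputs.response
# 0    q0                 r0
# 1    q1                NaN
# 2    q2                 r2
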
@@ -645,7 +666,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
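
A quick check of what the widened pattern newly accepts (mapping strings are hypothetical): single-segment references still match, and dotted references such as nested item fields now match as well.

import re

old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
new_pattern = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

for ref in ("${data.response}", "${data.item.context.text}"):
    print(ref, bool(old_pattern.match(ref)), bool(new_pattern.match(ref)))
# ${data.response} True True
# ${data.item.context.text} False True
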
@@ -855,6 +876,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
         **kwargs,
     )
 
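Because fail_on_evaluator_errors is now threaded into _preprocess_data (and from there into RunSubmitterClient(raise_on_errors=...)), the public flag aborts a batch run on evaluator failure instead of leaving NaN rows. A hedged usage sketch; the data path is a placeholder:

from azure.ai.evaluation import evaluate, BleuScoreEvaluator

result = evaluate(
    data="samples.jsonl",  # placeholder JSONL dataset with "response" and "ground_truth" columns
    evaluators={"bleu": BleuScoreEvaluator()},
    fail_on_evaluator_errors=True,  # raise on the first evaluator error rather than emitting NaN rows
)
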
@@ -962,6 +984,7 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
@@ -995,15 +1018,49 @@ def _preprocess_data(
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
 
-    if kwargs.pop("_use_run_submitter_client", False):
-        batch_run_client = RunSubmitterClient()
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif kwargs.pop("_use_pf_client", True):
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-    else:
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df
 
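For reference, how the private client-selection kwargs map to a batch client under get_client_type (these kwargs are internal and subject to change):

# _use_run_submitter_client | _use_pf_client | selected client
# ------------------------- | -------------- | --------------------------------
# unset (None)              | unset (None)   | "run_submitter"  (new default)
# True                      | None or False  | "run_submitter"
# None or False             | True           | "pf_client"
# False                     | None           | "pf_client"
# None                      | False          | "run_submitter"
# False                     | False          | "code_client"
# True                      | True           | raises EvaluationException
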
@@ -1013,17 +1070,50 @@ def _preprocess_data(
         target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
     )
 
-    for evaluator_name, mapping in column_mapping.items():
-        mapped_to_values = set(mapping.values())
-        for col in target_generated_columns:
-            # If user defined mapping differently, do not change it.
-            # If it was mapped to target, we have already changed it
-            # in _process_column_mappings
-            run_output = f"${{run.outputs.{col}}}"
-            # We will add our mapping only if
-            # customer did not mapped target output.
-            if col not in mapping and run_output not in mapped_to_values:
-                column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+    # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+    # This ensures that evaluators get all rows (including failed ones with NaN values)
+    if isinstance(batch_run_client, ProxyClient):
+        # Create a temporary JSONL file with the complete dataframe
+        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+        try:
+            for _, row in input_data_df.iterrows():
+                row_dict = row.to_dict()
+                temp_file.write(json.dumps(row_dict) + "\n")
+            temp_file.close()
+            batch_run_data = temp_file.name
+
+            # Update column mappings to use data references instead of run outputs
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    # Use data reference instead of run output to ensure we get all rows
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference
+
+            # Don't pass the target_run since we're now using the complete dataframe
+            target_run = None
+
+        except Exception as e:
+            # Clean up the temp file if something goes wrong
+            if os.path.exists(temp_file.name):
+                os.unlink(temp_file.name)
+            raise e
+    else:
+        # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+        batch_run_data = input_data_df
+
+        # Update column mappings for DataFrame clients
+        for evaluator_name, mapping in column_mapping.items():
+            mapped_to_values = set(mapping.values())
+            for col in target_generated_columns:
+                target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                # We will add our mapping only if customer did not map target output.
+                if col not in mapping and target_reference not in mapped_to_values:
+                    column_mapping[evaluator_name][col] = target_reference
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
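
The temp-file step itself is plain tempfile/json plumbing; a self-contained sketch of the same pattern outside the SDK (column names illustrative):

import json
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"query": ["q0", "q1"], "__outputs.response": ["r0", None]})

# Write every row, including None/NaN-filled ones, to a temporary JSONL file;
# this is the shape of the file the ProxyClient branch above hands to the batch run.
temp = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
try:
    for _, row in df.iterrows():
        temp.write(json.dumps(row.to_dict()) + "\n")
finally:
    temp.close()

print(open(temp.name).read())
os.unlink(temp.name)  # the SDK defers this cleanup to _run_callable_evaluators (next hunk)
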
@@ -1062,30 +1152,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
 
-        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
-            evaluator_name: {
-                "result": batch_run_client.get_details(run, all_results=True),
-                "metrics": batch_run_client.get_metrics(run),
-                "run_summary": batch_run_client.get_run_summary(run),
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
             }
-            for evaluator_name, run in runs.items()
-        }
 
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
@@ -353,6 +353,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader,
     )
 
     id_map = {
@@ -361,6 +362,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
     }
 
     for key in id_map.keys():
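
With the registration above, the new Python grader resolves through the same id lookup as the existing AOAI graders. A hedged sketch, assuming both classes are exported from the package root like the other graders:

from azure.ai.evaluation import AzureOpenAIPythonGrader, AzureOpenAIScoreModelGrader

# Each grader class carries a registry-style `id`; id_map lets cloud evaluation
# map that id string back to the grader class.
for grader_cls in (AzureOpenAIScoreModelGrader, AzureOpenAIPythonGrader):
    print(grader_cls.__name__, "->", grader_cls.id)
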
@@ -178,7 +178,6 @@ def _log_metrics_and_instance_results_onedp(
 
     properties = {
         EvaluationRunProperties.RUN_TYPE: "eval_run",
-        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
         EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
     }
@@ -191,6 +190,7 @@ def _log_metrics_and_instance_results_onedp(
     upload_run_response = client.start_evaluation_run(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
+            properties=properties,
         )
     )
 
@@ -202,7 +202,6 @@ def _log_metrics_and_instance_results_onedp(
             outputs={
                 "evaluationResultId": create_evaluation_result_response.id,
             },
-            properties=properties,
         ),
     )
 
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
             :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
-    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self, *, threshold=0.5):
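
This is part of a sweep in 1.10.0 that moves the built-in evaluator ids from azureml registry asset URIs to the shorter azureai://built-in/evaluators/<name> form (the same pattern repeats in the hunks below). To see which id your installed version reports:

from azure.ai.evaluation import BleuScoreEvaluator

# 1.9.0:  azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3
# 1.10.0: azureai://built-in/evaluators/bleu_score
print(BleuScoreEvaluator.id)
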
@@ -79,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     for the code vulnerability will be "code_vulnerability_label".
     """
 
-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
    ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
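
Per this hunk, CodeVulnerabilityEvaluator now forwards **kwargs to the RAI base class and turns evaluate_query on by default. A hedged usage sketch; the credential and project endpoint are placeholders:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

code_vuln = CodeVulnerabilityEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
    # evaluate_query defaults to True here; pass evaluate_query=False to score the response alone
)
result = code_vuln(
    query="Build a SQL query that looks up a user by name.",
    response="cursor.execute(\"SELECT * FROM users WHERE name = '\" + name + \"'\")",
)
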
@@ -62,7 +62,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
@@ -4,14 +4,34 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
 
 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -176,7 +196,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons.extend([p for p in params if p != "self"])
         return singletons
 
-    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -235,7 +257,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
-    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,7 +312,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return multi_modal_converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """
 
     @override
@@ -56,6 +59,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -67,6 +71,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better
 
     @override
@@ -101,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "query" in eval_input and "response" in eval_input:
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)
 
         conversation = eval_input.get("conversation", None)
@@ -131,17 +138,20 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         return result
 
     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query = str(eval_input.get("query", None))
-        response = str(eval_input.get("response", None))
-        if query is None or response is None:
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying query or response."
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"query": query, "response": response}
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)
 
         if "context" in self._singleton_inputs:
             context = eval_input.get("context", None)
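
Net effect of the two hunks above: response is the only required field, and the query is sent to the RAI service only when the evaluator was constructed with evaluate_query=True. A minimal paraphrase of that gating (not the SDK code itself):

def build_input_data(eval_input: dict, evaluate_query: bool) -> dict:
    """Mirror the new payload rule: response is mandatory, query is opt-in."""
    response = eval_input.get("response")
    if response is None:
        raise ValueError("response is required")
    input_data = {"response": str(response)}
    if evaluate_query and eval_input.get("query") is not None:
        input_data["query"] = str(eval_input["query"])
    return input_data

print(build_input_data({"query": "q", "response": "r"}, evaluate_query=False))  # {'response': 'r'}
print(build_input_data({"query": "q", "response": "r"}, evaluate_query=True))   # {'response': 'r', 'query': 'q'}
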
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -66,8 +68,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     def __init__(
         self,
@@ -90,11 +93,18 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")
 
+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
 
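A hedged usage sketch of the passthrough: evaluate_query set on the composite evaluator reaches all four child evaluators. Credential and project endpoint are placeholders.

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
    evaluate_query=True,  # forwarded to Violence/Sexual/SelfHarm/HateUnfairness above
)
scores = safety(
    query="How do I harden this endpoint against abuse?",
    response="Rate-limit requests and validate all inputs server-side.",
)
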
@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
-    id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/hate_unfairness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload
@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
-    id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/self_harm"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload
@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a SexualEvaluator.
     """
 
-    id = "azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/sexual"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload
@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-       :return: The fluency score.
+       :return: The sexual score.
        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
        """
        return super().__call__(*args, **kwargs)
@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ViolenceEvaluator.
     """
 
-    id = "azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/violence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )
 
     @overload