azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import contextlib
 import json
 import logging
 import os
@@ -27,10 +28,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
     BINARY_AGGREGATE_SUFFIX,
-    DEFAULT_OAI_EVAL_RUN_NAME
+    DEFAULT_OAI_EVAL_RUN_NAME,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
-from .._user_agent import USER_AGENT
+from .._user_agent import UserAgentSingleton
 from ._batch_run import (
     EvalRunContext,
     CodeClient,
@@ -43,7 +44,8 @@ from ._utils import (
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
+    DataLoaderFactory,
+    _log_metrics_and_instance_results_onedp,
 )
 from ._batch_run.batch_clients import BatchClient, BatchClientRun

@@ -51,8 +53,9 @@ from ._evaluate_aoai import (
     _begin_aoai_evaluation,
     _split_evaluators_and_grader_configs,
     _get_evaluation_run_results,
-    OAIEvalRunCreationInfo
+    OAIEvalRunCreationInfo,
 )
+
 LOGGER = logging.getLogger(__name__)

 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -69,11 +72,13 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]

+
 class __ValidatedData(TypedDict):
-    '''
+    """
     Simple dictionary that contains ALL pre-processed data and
     the resultant objects that are needed for downstream evaluation.
-    '''
+    """
+
     evaluators: Dict[str, Callable]
     graders: Dict[str, AzureOpenAIGrader]
     input_data_df: pd.DataFrame
@@ -255,7 +260,9 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
         if len(parts) >= 3:
             evaluator_name = parts[1]
         else:
-            LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+            LOGGER.warning(
+                "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+            )
             continue
         if evaluator_name:
             # Count the occurrences of each unique value (pass/fail)
@@ -721,13 +728,16 @@ def evaluate(
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
-    :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
-    :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
     :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
         if ANY evaluator fails during their evaluation.
         Defaults to false, which means that evaluations will continue regardless of failures.
         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+    :paramtype user_agent: Optional[str]
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -739,29 +749,31 @@ def evaluate(
             :language: python
             :dedent: 8
             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
-
+
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START evaluate_method]
             :end-before: [END evaluate_method]
             :language: python
             :dedent: 8
-            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
-        return _evaluate(
-            evaluation_name=evaluation_name,
-            target=target,
-            data=data,
-            evaluators_and_graders=evaluators,
-            evaluator_config=evaluator_config,
-            azure_ai_project=azure_ai_project,
-            output_path=output_path,
-            fail_on_evaluator_errors=fail_on_evaluator_errors,
-            **kwargs,
-        )
+        user_agent: Optional[str] = kwargs.get("user_agent")
+        with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+            return _evaluate(
+                evaluation_name=evaluation_name,
+                target=target,
+                data=data,
+                evaluators_and_graders=evaluators,
+                evaluator_config=evaluator_config,
+                azure_ai_project=azure_ai_project,
+                output_path=output_path,
+                fail_on_evaluator_errors=fail_on_evaluator_errors,
+                **kwargs,
+            )
     except Exception as e:
         # Handle multiprocess bootstrap error
         bootstrap_error = (
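The rewritten evaluate() body above only enters the UserAgentSingleton().add_useragent_product(...) context when a user_agent keyword was actually passed, using contextlib.nullcontext() as the no-op fallback. A minimal, self-contained sketch of that stdlib idiom follows; tag_requests and do_work are illustrative names, not part of the package:

    import contextlib
    from typing import Optional

    @contextlib.contextmanager
    def tag_requests(product: str):
        # Illustrative stand-in for a context manager that appends a product
        # token to outgoing user-agent headers for the duration of the block.
        print(f"user-agent extended with {product!r}")
        try:
            yield
        finally:
            print("user-agent restored")

    def do_work(user_agent: Optional[str] = None) -> None:
        # Enter the real context manager only when a value was supplied;
        # contextlib.nullcontext() is a no-op placeholder otherwise.
        with tag_requests(user_agent) if user_agent else contextlib.nullcontext():
            print("running the evaluation")

    do_work()                # no extra product token, nullcontext is used
    do_work("my-app/1.0.0")  # token applied only for this call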
@@ -832,7 +844,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-
+
     # Turn inputted mess of data into a dataframe, apply targets if needed
     # split graders and evaluators, and verify that column mappings are sensible.
     validated_data = _preprocess_data(
@@ -845,7 +857,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         evaluation_name=evaluation_name,
         **kwargs,
     )
-
+
     # extract relevant info from validated data
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
@@ -863,29 +875,25 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     if need_oai_run:
         try:
             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
-            eval_run_info_list = _begin_aoai_evaluation(
-                graders,
-                column_mapping,
-                input_data_df,
-                aoi_name
-            )
+            eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name)
             need_get_oai_results = len(eval_run_info_list) > 0
         except EvaluationException as e:
             if need_local_run:
                 # If there are normal evaluators, don't stop execution and try to run
                 # those.
-                LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
-                               " Continuing with local evaluators.")
+                LOGGER.warning(
+                    "Remote Azure Open AI grader evaluations failed during run creation."
+                    + " Continuing with local evaluators."
+                )
                 LOGGER.warning(e)
             else:
                 raise e
-
+
     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
     if need_local_run:
         try:
-            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
-                validated_data=validated_data,
-                fail_on_evaluator_errors=fail_on_evaluator_errors
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
             )
             results_df = eval_result_df
             metrics = eval_metrics
@@ -903,7 +911,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Retrieve OAI eval run results if needed.
     if need_get_oai_results:
         try:
-            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here

             # Combine results if both evaluators and graders are present
@@ -955,22 +963,17 @@ def _preprocess_data(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
     **kwargs,
-    ) -> __ValidatedData:
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}

     input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
     )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
-
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -996,7 +999,7 @@ def _preprocess_data(
         batch_run_client = RunSubmitterClient()
         batch_run_data = input_data_df
     elif kwargs.pop("_use_pf_client", True):
-        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
@@ -1127,14 +1130,15 @@ def _run_callable_evaluators(

     return eval_result_df, eval_metrics, per_evaluator_results

+
 def _map_names_to_builtins(
-        evaluators: Dict[str, Callable],
-        graders: Dict[str, AzureOpenAIGrader],
-    ) -> Dict[str, str]:
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
     """
     Construct a mapping from user-supplied evaluator names to which known, built-in
-    evaluator or grader they refer to. Custom or otherwise unknown evaluators are
-    mapped to the "unknown" value.
+    evaluator or grader they refer to. Custom evaluators are excluded from the mapping
+    as we only want to track built-in evaluators and graders.

     :param evaluators: The dictionary of evaluators.
     :type evaluators: Dict[str, Callable]
@@ -1142,9 +1146,10 @@ def _map_names_to_builtins(
     :type graders: Dict[str, AzureOpenAIGrader]
     :param evaluator_config: The configuration for evaluators.
     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
-
+
     """
     from .._eval_mapping import EVAL_CLASS_MAP
+
     name_map = {}

     for name, evaluator in evaluators.items():
@@ -1156,14 +1161,15 @@ def _map_names_to_builtins(
                 found_eval = True
                 break
         if not found_eval:
-            # If not found, map to "unknown"
-            name_map[name] = "unknown"
-
-    for name, grader in graders.items():
+            # Skip custom evaluators - we only want to track built-in evaluators
+            pass
+
+    for name, grader in graders.items():
         name_map[name] = grader.id

     return name_map

+
 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
     file as the error message.
@@ -1178,4 +1184,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
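Taken together, the docstring and implementation changes above mean evaluate() in this version range accepts a project endpoint string for azure_ai_project (in the https://{resource_name}.services.ai.azure.com/api/projects/{project_name} format shown in the caption) and an optional user_agent keyword that is appended to the default user-agent. A hedged usage sketch, where the endpoint, data file, and the answer_length callable are placeholders rather than anything shipped with the package:

    from azure.ai.evaluation import evaluate

    def answer_length(*, response: str, **kwargs):
        # Custom callable evaluator; evaluate() runs these alongside built-in
        # evaluators and Azure OpenAI graders (see _split_evaluators_and_grader_configs above).
        return {"answer_length": len(response)}

    result = evaluate(
        data="eval_data.jsonl",  # placeholder JSONL dataset with a "response" column
        evaluators={"answer_length": answer_length},
        # New: a project endpoint string is accepted in addition to an AzureAIProject dict.
        azure_ai_project="https://my-resource.services.ai.azure.com/api/projects/my-project",
        # New: appended to the default user-agent on evaluation HTTP requests.
        user_agent="my-app/1.0.0",
    )
    print(result["metrics"])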