azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation may be problematic.

Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -2,11 +2,14 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import inspect
+ import contextlib
  import json
  import logging
  import os
  import re
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
+ import tempfile
+ import json
+ from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

  from openai import OpenAI, AzureOpenAI
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -27,10 +30,10 @@ from .._constants import (
  Prefixes,
  _InternalEvaluationMetrics,
  BINARY_AGGREGATE_SUFFIX,
- DEFAULT_OAI_EVAL_RUN_NAME
+ DEFAULT_OAI_EVAL_RUN_NAME,
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
- from .._user_agent import USER_AGENT
+ from .._user_agent import UserAgentSingleton
  from ._batch_run import (
  EvalRunContext,
  CodeClient,
@@ -43,7 +46,8 @@ from ._utils import (
  _log_metrics_and_instance_results,
  _trace_destination_from_project_scope,
  _write_output,
- DataLoaderFactory, _log_metrics_and_instance_results_onedp,
+ DataLoaderFactory,
+ _log_metrics_and_instance_results_onedp,
  )
  from ._batch_run.batch_clients import BatchClient, BatchClientRun

@@ -51,8 +55,9 @@ from ._evaluate_aoai import (
  _begin_aoai_evaluation,
  _split_evaluators_and_grader_configs,
  _get_evaluation_run_results,
- OAIEvalRunCreationInfo
+ OAIEvalRunCreationInfo,
  )
+
  LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
@@ -69,11 +74,13 @@ class __EvaluatorInfo(TypedDict):
  metrics: Dict[str, Any]
  run_summary: Dict[str, Any]

+
  class __ValidatedData(TypedDict):
- '''
+ """
  Simple dictionary that contains ALL pre-processed data and
  the resultant objects that are needed for downstream evaluation.
- '''
+ """
+
  evaluators: Dict[str, Callable]
  graders: Dict[str, AzureOpenAIGrader]
  input_data_df: pd.DataFrame
@@ -255,7 +262,9 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
  if len(parts) >= 3:
  evaluator_name = parts[1]
  else:
- LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+ LOGGER.warning(
+ "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+ )
  continue
  if evaluator_name:
  # Count the occurrences of each unique value (pass/fail)
@@ -604,6 +613,18 @@ def _apply_target_to_data(
  category=ErrorCategory.FAILED_EXECUTION,
  blame=ErrorBlame.USER_ERROR,
  )
+
+ # Log a warning if some rows failed
+ failed_lines = run_summary.get("failed_lines", 0)
+ completed_lines = run_summary["completed_lines"]
+ total_lines = failed_lines + completed_lines
+
+ if failed_lines > 0:
+ LOGGER.warning(
+ f"Target function completed {completed_lines} out of {total_lines} rows. "
+ f"{failed_lines} rows failed and will be filled with NaN values."
+ )
+
  # Remove input and output prefix
  generated_columns = {
  col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -611,6 +632,13 @@ def _apply_target_to_data(
  # Sort output by line numbers
  target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
  target_output.sort_index(inplace=True)
+
+ initial_data_with_line_numbers = initial_data.copy()
+ initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+ complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+ target_output = target_output.reindex(complete_index)
+
  target_output.reset_index(inplace=True, drop=False)
  # target_output contains only input columns, taken by function,
  # so we need to concatenate it to the input data frame.
@@ -619,8 +647,8 @@ def _apply_target_to_data(
  # Rename outputs columns to __outputs
  rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
  target_output.rename(columns=rename_dict, inplace=True)
- # Concatenate output to input
- target_output = pd.concat([target_output, initial_data], axis=1)
+ # Concatenate output to input - now both dataframes have the same number of rows
+ target_output = pd.concat([initial_data, target_output], axis=1)

  return target_output, generated_columns, run

@@ -638,7 +666,7 @@ def _process_column_mappings(

  processed_config: Dict[str, Dict[str, str]] = {}

- expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
+ expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

  if column_mapping:
  for evaluator, mapping_config in column_mapping.items():
@@ -721,13 +749,16 @@ def evaluate(
  :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
  the results will be saved to a file named `evaluation_results.json` in the folder.
  :paramtype output_path: Optional[str]
- :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
- :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+ :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
  :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
  if ANY evaluator fails during their evaluation.
  Defaults to false, which means that evaluations will continue regardless of failures.
  If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
  :paramtype fail_on_evaluator_errors: bool
+ :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+ :paramtype user_agent: Optional[str]
  :return: Evaluation results.
  :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -739,29 +770,31 @@ def evaluate(
  :language: python
  :dedent: 8
  :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START evaluate_method]
  :end-before: [END evaluate_method]
  :language: python
  :dedent: 8
- :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+ :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
  """
  try:
- return _evaluate(
- evaluation_name=evaluation_name,
- target=target,
- data=data,
- evaluators_and_graders=evaluators,
- evaluator_config=evaluator_config,
- azure_ai_project=azure_ai_project,
- output_path=output_path,
- fail_on_evaluator_errors=fail_on_evaluator_errors,
- **kwargs,
- )
+ user_agent: Optional[str] = kwargs.get("user_agent")
+ with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+ return _evaluate(
+ evaluation_name=evaluation_name,
+ target=target,
+ data=data,
+ evaluators_and_graders=evaluators,
+ evaluator_config=evaluator_config,
+ azure_ai_project=azure_ai_project,
+ output_path=output_path,
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
+ **kwargs,
+ )
  except Exception as e:
  # Handle multiprocess bootstrap error
  bootstrap_error = (
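
Note: per the docstring and body changes above, evaluate() in 1.10.0 accepts a project endpoint string for azure_ai_project (in addition to an AzureAIProject dict) and an optional user_agent keyword that is appended to the default user agent on evaluation HTTP requests. A minimal usage sketch follows; the data file, evaluator choice, endpoint, and user-agent string are placeholders, not values taken from this diff.

    # Sketch only: placeholder data file, endpoint, and user-agent string.
    from azure.ai.evaluation import evaluate, F1ScoreEvaluator

    results = evaluate(
        data="eval_data.jsonl",  # local JSONL with the columns the chosen evaluator expects
        evaluators={"f1": F1ScoreEvaluator()},
        # New: a project endpoint string is accepted as well as an AzureAIProject dict
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
        # New: appended to the default user agent on evaluation HTTP requests
        user_agent="my-app/1.0",
    )
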
@@ -832,7 +865,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  ) -> EvaluationResult:
  if fail_on_evaluator_errors:
  _print_fail_flag_warning()
-
+
  # Turn inputted mess of data into a dataframe, apply targets if needed
  # split graders and evaluators, and verify that column mappings are sensible.
  validated_data = _preprocess_data(
@@ -843,9 +876,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  output_path=output_path,
  azure_ai_project=azure_ai_project,
  evaluation_name=evaluation_name,
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
  **kwargs,
  )
-
+
  # extract relevant info from validated data
  column_mapping = validated_data["column_mapping"]
  evaluators = validated_data["evaluators"]
@@ -863,29 +897,25 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  if need_oai_run:
  try:
  aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
- eval_run_info_list = _begin_aoai_evaluation(
- graders,
- column_mapping,
- input_data_df,
- aoi_name
- )
+ eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name)
  need_get_oai_results = len(eval_run_info_list) > 0
  except EvaluationException as e:
  if need_local_run:
  # If there are normal evaluators, don't stop execution and try to run
  # those.
- LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
- " Continuing with local evaluators.")
+ LOGGER.warning(
+ "Remote Azure Open AI grader evaluations failed during run creation."
+ + " Continuing with local evaluators."
+ )
  LOGGER.warning(e)
  else:
  raise e
-
+
  # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
  if need_local_run:
  try:
- eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
- validated_data=validated_data,
- fail_on_evaluator_errors=fail_on_evaluator_errors
+ eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+ validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
  )
  results_df = eval_result_df
  metrics = eval_metrics
@@ -903,7 +933,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  # Retrieve OAI eval run results if needed.
  if need_get_oai_results:
  try:
- aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
+ aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
  # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here

  # Combine results if both evaluators and graders are present
@@ -954,23 +984,19 @@ def _preprocess_data(
  output_path: Optional[Union[str, os.PathLike]] = None,
  azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
  evaluation_name: Optional[str] = None,
+ fail_on_evaluator_errors: bool = False,
  **kwargs,
- ) -> __ValidatedData:
+ ) -> __ValidatedData:
  # Process evaluator config to replace ${target.} with ${data.}
  if evaluator_config is None:
  evaluator_config = {}

  input_data_df = _validate_and_load_data(
- target,
- data,
- evaluators_and_graders,
- output_path,
- azure_ai_project,
- evaluation_name
+ target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
  )
  if target is not None:
  _validate_columns_for_target(input_data_df, target)
-
+
  # extract column mapping dicts into dictionary mapping evaluator name to column mapping
  column_mapping = _process_column_mappings(
  {
@@ -992,15 +1018,49 @@ def _preprocess_data(
  batch_run_client: BatchClient
  batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

- if kwargs.pop("_use_run_submitter_client", False):
- batch_run_client = RunSubmitterClient()
+ def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+ """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+ _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+ _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+ if _use_run_submitter_client is None and _use_pf_client is None:
+ # If both are unset, return default
+ return "run_submitter"
+
+ if _use_run_submitter_client and _use_pf_client:
+ raise EvaluationException(
+ message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+
+ if _use_run_submitter_client == False and _use_pf_client == False:
+ return "code_client"
+
+ if _use_run_submitter_client:
+ return "run_submitter"
+ if _use_pf_client:
+ return "pf_client"
+
+ if _use_run_submitter_client is None and _use_pf_client == False:
+ return "run_submitter"
+ if _use_run_submitter_client == False and _use_pf_client is None:
+ return "pf_client"
+
+ assert False, "This should be impossible"
+
+ client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+ if client_type == "run_submitter":
+ batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
  batch_run_data = input_data_df
- elif kwargs.pop("_use_pf_client", True):
- batch_run_client = ProxyClient(user_agent=USER_AGENT)
+ elif client_type == "pf_client":
+ batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
  # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
  # multiple evaluators. If the path is already absolute, abspath will return the original path.
  batch_run_data = os.path.abspath(data)
- else:
+ elif client_type == "code_client":
  batch_run_client = CodeClient()
  batch_run_data = input_data_df

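Note: the get_client_type() helper added above resolves the private _use_run_submitter_client and _use_pf_client kwargs into one of three batch clients, with the run submitter now the default when neither flag is set. The standalone sketch below paraphrases that decision table for illustration; resolve_client is a hypothetical name, not part of the package.

    # Illustrative paraphrase of the selection logic above; not the library function.
    from typing import Literal, Optional

    def resolve_client(
        use_run_submitter: Optional[bool], use_pf: Optional[bool]
    ) -> Literal["run_submitter", "pf_client", "code_client"]:
        if use_run_submitter and use_pf:
            raise ValueError("Only one of the two flags may be True.")
        if use_run_submitter is None and use_pf is None:
            return "run_submitter"  # default in 1.10.0
        if use_run_submitter is False and use_pf is False:
            return "code_client"
        if use_run_submitter:
            return "run_submitter"
        if use_pf:
            return "pf_client"
        # exactly one flag is an explicit False and the other is unset
        return "run_submitter" if use_pf is False else "pf_client"

    assert resolve_client(None, None) == "run_submitter"
    assert resolve_client(None, False) == "run_submitter"
    assert resolve_client(False, None) == "pf_client"
    assert resolve_client(False, False) == "code_client"
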
@@ -1010,17 +1070,50 @@ def _preprocess_data(
  target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
  )

- for evaluator_name, mapping in column_mapping.items():
- mapped_to_values = set(mapping.values())
- for col in target_generated_columns:
- # If user defined mapping differently, do not change it.
- # If it was mapped to target, we have already changed it
- # in _process_column_mappings
- run_output = f"${{run.outputs.{col}}}"
- # We will add our mapping only if
- # customer did not mapped target output.
- if col not in mapping and run_output not in mapped_to_values:
- column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
+ # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+ # This ensures that evaluators get all rows (including failed ones with NaN values)
+ if isinstance(batch_run_client, ProxyClient):
+ # Create a temporary JSONL file with the complete dataframe
+ temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+ try:
+ for _, row in input_data_df.iterrows():
+ row_dict = row.to_dict()
+ temp_file.write(json.dumps(row_dict) + "\n")
+ temp_file.close()
+ batch_run_data = temp_file.name
+
+ # Update column mappings to use data references instead of run outputs
+ for evaluator_name, mapping in column_mapping.items():
+ mapped_to_values = set(mapping.values())
+ for col in target_generated_columns:
+ # Use data reference instead of run output to ensure we get all rows
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+ # We will add our mapping only if customer did not map target output.
+ if col not in mapping and target_reference not in mapped_to_values:
+ column_mapping[evaluator_name][col] = target_reference
+
+ # Don't pass the target_run since we're now using the complete dataframe
+ target_run = None
+
+ except Exception as e:
+ # Clean up the temp file if something goes wrong
+ if os.path.exists(temp_file.name):
+ os.unlink(temp_file.name)
+ raise e
+ else:
+ # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+ batch_run_data = input_data_df
+
+ # Update column mappings for DataFrame clients
+ for evaluator_name, mapping in column_mapping.items():
+ mapped_to_values = set(mapping.values())
+ for col in target_generated_columns:
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+ # We will add our mapping only if customer did not map target output.
+ if col not in mapping and target_reference not in mapped_to_values:
+ column_mapping[evaluator_name][col] = target_reference

  # After we have generated all columns, we can check if we have everything we need for evaluators.
  _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
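
Note: the effect of the ProxyClient branch above is that target outputs are re-read from the temporary JSONL file rather than from the target run, so the auto-generated column mappings switch from run references to data references. An illustrative before/after, assuming Prefixes.TSG_OUTPUTS renders as the "__outputs." prefix used in the rename step earlier in this diff; the evaluator and column names are hypothetical:

    # before (1.8.0):  column_mapping["my_eval"]["response"] = "${run.outputs.response}"
    # after  (1.10.0): column_mapping["my_eval"]["response"] = "${data.__outputs.response}"
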
@@ -1059,30 +1152,50 @@ def _run_callable_evaluators(
  batch_run_data = validated_data["batch_run_data"]
  column_mapping = validated_data["column_mapping"]
  evaluators = validated_data["evaluators"]
- with EvalRunContext(batch_run_client):
- runs = {
- evaluator_name: batch_run_client.run(
- flow=evaluator,
- data=batch_run_data,
- run=target_run,
- evaluator_name=evaluator_name,
- column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
- stream=True,
- name=kwargs.get("_run_name"),
- )
- for evaluator_name, evaluator in evaluators.items()
- }

- # get_details needs to be called within EvalRunContext scope in order to have user agent populated
- per_evaluator_results: Dict[str, __EvaluatorInfo] = {
- evaluator_name: {
- "result": batch_run_client.get_details(run, all_results=True),
- "metrics": batch_run_client.get_metrics(run),
- "run_summary": batch_run_client.get_run_summary(run),
+ # Clean up temporary file after evaluation if it was created
+ temp_file_to_cleanup = None
+ if (
+ isinstance(batch_run_client, ProxyClient)
+ and isinstance(batch_run_data, str)
+ and batch_run_data.endswith(".jsonl")
+ ):
+ # Check if it's a temporary file (contains temp directory path)
+ if tempfile.gettempdir() in batch_run_data:
+ temp_file_to_cleanup = batch_run_data
+
+ try:
+ with EvalRunContext(batch_run_client):
+ runs = {
+ evaluator_name: batch_run_client.run(
+ flow=evaluator,
+ data=batch_run_data,
+ # Don't pass target_run when using complete dataframe
+ run=target_run,
+ evaluator_name=evaluator_name,
+ column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+ stream=True,
+ name=kwargs.get("_run_name"),
+ )
+ for evaluator_name, evaluator in evaluators.items()
  }
- for evaluator_name, run in runs.items()
- }

+ # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+ per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+ evaluator_name: {
+ "result": batch_run_client.get_details(run, all_results=True),
+ "metrics": batch_run_client.get_metrics(run),
+ "run_summary": batch_run_client.get_run_summary(run),
+ }
+ for evaluator_name, run in runs.items()
+ }
+ finally:
+ # Clean up temporary file if it was created
+ if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+ try:
+ os.unlink(temp_file_to_cleanup)
+ except Exception as e:
+ LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
  # Concatenate all results
  evaluators_result_df = pd.DataFrame()
  evaluators_metric = {}
@@ -1127,10 +1240,11 @@ def _run_callable_evaluators(

  return eval_result_df, eval_metrics, per_evaluator_results

+
  def _map_names_to_builtins(
- evaluators: Dict[str, Callable],
- graders: Dict[str, AzureOpenAIGrader],
- ) -> Dict[str, str]:
+ evaluators: Dict[str, Callable],
+ graders: Dict[str, AzureOpenAIGrader],
+ ) -> Dict[str, str]:
  """
  Construct a mapping from user-supplied evaluator names to which known, built-in
  evaluator or grader they refer to. Custom evaluators are excluded from the mapping
@@ -1142,9 +1256,10 @@ def _map_names_to_builtins(
  :type graders: Dict[str, AzureOpenAIGrader]
  :param evaluator_config: The configuration for evaluators.
  :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
-
+
  """
  from .._eval_mapping import EVAL_CLASS_MAP
+
  name_map = {}

  for name, evaluator in evaluators.items():
@@ -1158,12 +1273,13 @@ def _map_names_to_builtins(
  if not found_eval:
  # Skip custom evaluators - we only want to track built-in evaluators
  pass
-
- for name, grader in graders.items():
+
+ for name, grader in graders.items():
  name_map[name] = grader.id

  return name_map

+
  def _turn_error_logs_into_exception(log_path: str) -> None:
  """Produce an EvaluationException using the contents of the inputted
  file as the error message.
@@ -1178,4 +1294,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.FAILED_EXECUTION,
  blame=ErrorBlame.UNKNOWN,
- )
+ )