azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (123)
  1. azure/ai/evaluation/__init__.py +9 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +5 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +158 -28
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_eval_mapping.py +71 -0
  59. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  60. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  62. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  63. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  64. azure/ai/evaluation/_evaluate/_evaluate.py +325 -74
  65. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
  66. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  67. azure/ai/evaluation/_evaluators/_common/_base_eval.py +8 -3
  68. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  69. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  70. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  71. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
  72. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  73. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
  74. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
  76. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +7 -2
  77. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  78. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  79. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
  80. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
  81. azure/ai/evaluation/_exceptions.py +2 -0
  82. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  83. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  84. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  85. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  86. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  87. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  90. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  92. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  93. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  94. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  95. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  96. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  97. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  98. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  99. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
  100. azure/ai/evaluation/_version.py +1 -1
  101. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  102. azure/ai/evaluation/red_team/_red_team.py +825 -450
  103. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  104. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  105. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  106. azure/ai/evaluation/simulator/_constants.py +1 -0
  107. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  108. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  109. azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
  110. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  111. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
  112. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  113. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
  114. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  115. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  116. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
  117. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  118. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +25 -2
  119. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +123 -65
  120. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  121. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
  122. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
  123. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
@@ -8,20 +8,26 @@ import os
  import re
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+ from openai import OpenAI, AzureOpenAI
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
  from azure.ai.evaluation._legacy._adapters.entities import Run
  import pandas as pd

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+ from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
  from .._constants import (
      CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+     EVALUATION_PASS_FAIL_MAPPING,
      EvaluationMetrics,
      DefaultOpenEncoding,
      Prefixes,
      _InternalEvaluationMetrics,
+     BINARY_AGGREGATE_SUFFIX,
+     DEFAULT_OAI_EVAL_RUN_NAME
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
@@ -29,7 +35,6 @@ from ._batch_run import (
      EvalRunContext,
      CodeClient,
      ProxyClient,
-     ProxyRun,
      TargetRunContext,
      RunSubmitterClient,
  )
@@ -38,16 +43,22 @@ from ._utils import (
      _log_metrics_and_instance_results,
      _trace_destination_from_project_scope,
      _write_output,
-     DataLoaderFactory,
+     DataLoaderFactory, _log_metrics_and_instance_results_onedp,
  )
- from ._batch_run.batch_clients import BatchClient
+ from ._batch_run.batch_clients import BatchClient, BatchClientRun

+ from ._evaluate_aoai import (
+     _begin_aoai_evaluation,
+     _split_evaluators_and_grader_configs,
+     _get_evaluation_run_results,
+     OAIEvalRunCreationInfo
+ )
  LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
  # originating column name, usually because the aggregation of the original value
  # means something sufficiently different.
- # Note that content safety metrics are handled seprately.
+ # Note that content safety metrics are handled separately.
  METRIC_COLUMN_NAME_REPLACEMENTS = {
      "groundedness_pro_label": "groundedness_pro_passing_rate",
  }
@@ -58,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
      metrics: Dict[str, Any]
      run_summary: Dict[str, Any]

+ class __ValidatedData(TypedDict):
+     '''
+     Simple dictionary that contains ALL pre-processed data and
+     the resultant objects that are needed for downstream evaluation.
+     '''
+     evaluators: Dict[str, Callable]
+     graders: Dict[str, AzureOpenAIGrader]
+     input_data_df: pd.DataFrame
+     column_mapping: Dict[str, Dict[str, str]]
+     target_run: Optional[BatchClientRun]
+     batch_run_client: BatchClient
+     batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

  def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
      """Identify and average various metrics that need to have the metric name be replaced,
@@ -208,6 +232,48 @@ def _process_rows(row, detail_defect_rates):
      return detail_defect_rates


+ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+     """
+     Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+     For each evaluator, calculates the proportion of "pass" results.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: A dictionary mapping evaluator names to the proportion of pass results.
+     :rtype: Dict[str, float]
+     """
+     results = {}
+
+     # Find all columns that end with "_result"
+     result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+     for col in result_columns:
+         # Extract the evaluator name from the column name
+         # (outputs.<evaluator>.<metric>_result)
+         parts = col.split(".")
+         evaluator_name = None
+         if len(parts) >= 3:
+             evaluator_name = parts[1]
+         else:
+             LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+             continue
+         if evaluator_name:
+             # Count the occurrences of each unique value (pass/fail)
+             value_counts = df[col].value_counts().to_dict()
+
+             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+             total_rows = len(df)
+             pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+             proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+             # Set the result with the evaluator name as the key
+             result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+             results[result_key] = round(proportion, 2)
+
+     return results
+
+
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
      On top of naively calculating the mean of most metrics, this function also identifies certain columns
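A minimal sketch of what the new _aggregation_binary_output helper computes on a toy dataframe; the column name and both constant values below are assumptions for illustration, not the package's real definitions:

import pandas as pd

# Assumed stand-ins for the constants referenced above (the real ones live in
# azure/ai/evaluation/_constants.py).
EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}
BINARY_AGGREGATE_SUFFIX = "binary_aggregate"

df = pd.DataFrame({"outputs.fluency.fluency_result": ["pass", "fail", "pass", "pass"]})

col = "outputs.fluency.fluency_result"
evaluator_name = col.split(".")[1]                        # -> "fluency"
value_counts = df[col].value_counts().to_dict()           # {'pass': 3, 'fail': 1}
proportion = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0) / len(df)
print({f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}": round(proportion, 2)})
# -> {'fluency.binary_aggregate': 0.75}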
@@ -221,6 +287,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      :return: The aggregated metrics.
      :rtype: Dict[str, float]
      """
+     binary_metrics = _aggregation_binary_output(df)
+
      df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

      handled_columns = []
@@ -248,6 +316,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      metrics = mean_value.to_dict()
      # Add defect rates back into metrics
      metrics.update(defect_rates)
+
+     # Add binary threshold metrics based on pass/fail results
+     metrics.update(binary_metrics)
+
      return metrics


@@ -486,12 +558,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

  def _apply_target_to_data(
      target: Callable,
-     data: Union[str, os.PathLike],
+     data: Union[str, os.PathLike, pd.DataFrame],
      batch_client: BatchClient,
      initial_data: pd.DataFrame,
      evaluation_name: Optional[str] = None,
      **kwargs,
- ) -> Tuple[pd.DataFrame, Set[str], Run]:
+ ) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
      """
      Apply the target function to the data set and return updated data and generated columns.

@@ -509,24 +581,18 @@ def _apply_target_to_data(
      :rtype: Tuple[pandas.DataFrame, List[str]]
      """

-     if not isinstance(batch_client, ProxyClient):
-         raise ValueError("Only ProxyClient supports target runs for now.")
-
      _run_name = kwargs.get("_run_name")
-     with TargetRunContext():
-         run = cast(
-             ProxyRun,
-             batch_client.run(
-                 flow=target,
-                 display_name=evaluation_name,
-                 data=data,
-                 stream=True,
-                 name=_run_name,
-             ),
+     with TargetRunContext(batch_client):
+         run: BatchClientRun = batch_client.run(
+             flow=target,
+             display_name=evaluation_name,
+             data=data,
+             stream=True,
+             name=_run_name,
+             evaluator_name=getattr(target, "__qualname__", "TARGET"),
          )
-
-         target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
-         run_summary = batch_client.get_run_summary(run)
+     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+     run_summary = batch_client.get_run_summary(run)

      if run_summary["completed_lines"] == 0:
          msg = (
@@ -557,7 +623,7 @@ def _apply_target_to_data(
      # Concatenate output to input
      target_output = pd.concat([target_output, initial_data], axis=1)

-     return target_output, generated_columns, run.run.result()
+     return target_output, generated_columns, run


  def _process_column_mappings(
@@ -573,7 +639,7 @@ def _process_column_mappings(

      processed_config: Dict[str, Dict[str, str]] = {}

-     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
+     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

      if column_mapping:
          for evaluator, mapping_config in column_mapping.items():
@@ -625,11 +691,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
  def evaluate(
      *,
      data: Union[str, os.PathLike],
-     evaluators: Dict[str, Callable],
+     evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
@@ -641,8 +707,9 @@ def evaluate(
          JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
      :paramtype data: str
      :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-         and value as the evaluator function. Required.
-     :paramtype evaluators: Dict[str, Callable]
+         and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+         Required.
+     :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
      :keyword evaluation_name: Display name of the evaluation.
      :paramtype evaluation_name: Optional[str]
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
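A hedged usage sketch of the widened evaluate() signature described above, mixing a local callable with an Azure OpenAI grader; the grader construction, data path, alias names, and project string are placeholders rather than values from the package:

from azure.ai.evaluation import evaluate

def answer_length(*, response: str, **kwargs):
    # Plain callable evaluator: receives mapped columns as keyword arguments.
    return {"answer_length": len(response)}

my_grader = ...  # an AzureOpenAIGrader (or subclass) instance; construction details omitted here

result = evaluate(
    data="eval_data.jsonl",                         # placeholder path; JSONL/CSV per the docstring above
    evaluators={
        "answer_length": answer_length,             # runs locally
        "label_check": my_grader,                   # routed to an Azure OpenAI eval run
    },
    azure_ai_project="https://<project-endpoint>",  # 1.6.0 also accepts a plain string
)
print(result["metrics"])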
@@ -679,7 +746,7 @@ def evaluate(
          evaluation_name=evaluation_name,
          target=target,
          data=data,
-         evaluators=evaluators,
+         evaluators_and_graders=evaluators,
          evaluator_config=evaluator_config,
          azure_ai_project=azure_ai_project,
          output_path=output_path,
@@ -744,23 +811,157 @@ def _print_fail_flag_warning() -> None:

  def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
      *,
-     evaluators: Dict[str, Callable],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      data: Union[str, os.PathLike],
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
  ) -> EvaluationResult:
      if fail_on_evaluator_errors:
          _print_fail_flag_warning()
-     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+     # Turn inputted mess of data into a dataframe, apply targets if needed
+     # split graders and evaluators, and verify that column mappings are sensible.
+     validated_data = _preprocess_data(
+         data=data,
+         evaluators_and_graders=evaluators_and_graders,
+         evaluator_config=evaluator_config,
+         target=target,
+         output_path=output_path,
+         azure_ai_project=azure_ai_project,
+         evaluation_name=evaluation_name,
+         **kwargs,
+     )
+
+     # extract relevant info from validated data
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
+     graders = validated_data["graders"]
+     input_data_df = validated_data["input_data_df"]
+     results_df = pd.DataFrame()
+     metrics: Dict[str, float] = {}
+     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+     # Start OAI eval runs if any graders are present.
+     need_oai_run = len(graders) > 0
+     need_local_run = len(evaluators) > 0
+     need_get_oai_results = False
+     got_local_results = False
+     if need_oai_run:
+         try:
+             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+             eval_run_info_list = _begin_aoai_evaluation(
+                 graders,
+                 column_mapping,
+                 input_data_df,
+                 aoi_name
+             )
+             need_get_oai_results = len(eval_run_info_list) > 0
+         except EvaluationException as e:
+             if need_local_run:
+                 # If there are normal evaluators, don't stop execution and try to run
+                 # those.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                     " Continuing with local evaluators.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+     if need_local_run:
+         try:
+             eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                 validated_data=validated_data,
+                 fail_on_evaluator_errors=fail_on_evaluator_errors
+             )
+             results_df = eval_result_df
+             metrics = eval_metrics
+             got_local_results = True
+             # TODO figure out how to update this printing to include OAI results?
+             _print_summary(per_evaluator_results)
+         except EvaluationException as e:
+             if need_get_oai_results:
+                 # If there are OAI graders, we only print a warning on local failures.
+                 LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Retrieve OAI eval run results if needed.
+     if need_get_oai_results:
+         try:
+             aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore

+             # Combine results if both evaluators and graders are present
+             if len(evaluators) > 0:
+                 results_df = pd.concat([results_df, aoai_results], axis=1)
+                 metrics.update(aoai_metrics)
+             else:
+                 # Otherwise combine aoai results with input data df to include input columns in outputs.
+                 results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                 metrics = aoai_metrics
+         except EvaluationException as e:
+             if got_local_results:
+                 # If there are local eval results, we only print a warning on OAI failure.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Done with all evaluations, message outputs into final forms, and log results if needed.
+     name_map = _map_names_to_builtins(evaluators, graders)
+     if is_onedp_project(azure_ai_project):
+         studio_url = _log_metrics_and_instance_results_onedp(
+             metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+         )
+     else:
+         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+         trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+         studio_url = None
+         if trace_destination:
+             studio_url = _log_metrics_and_instance_results(
+                 metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+             )
+
+     result_df_dict = results_df.to_dict("records")
+     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
+
+     if output_path:
+         _write_output(output_path, result)
+
+     return result
+
+
+ def _preprocess_data(
+     data: Union[str, os.PathLike],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+     target: Optional[Callable] = None,
+     output_path: Optional[Union[str, os.PathLike]] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+     evaluation_name: Optional[str] = None,
+     **kwargs,
+ ) -> __ValidatedData:
      # Process evaluator config to replace ${target.} with ${data.}
      if evaluator_config is None:
          evaluator_config = {}
+
+     input_data_df = _validate_and_load_data(
+         target,
+         data,
+         evaluators_and_graders,
+         output_path,
+         azure_ai_project,
+         evaluation_name
+     )
+     if target is not None:
+         _validate_columns_for_target(input_data_df, target)
+
      # extract column mapping dicts into dictionary mapping evaluator name to column mapping
      column_mapping = _process_column_mappings(
          {
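As the new _evaluate body above shows, the returned EvaluationResult is a plain dictionary with "rows", "metrics", and "studio_url" keys; a small sketch of consuming it, with invented row and metric values:

# The values below are invented; only the dictionary shape mirrors the code above.
result = {
    "rows": [{"inputs.query": "hi", "outputs.answer_length.answer_length": 2}],
    "metrics": {"answer_length.binary_aggregate": 1.0},
    "studio_url": None,
}

for row in result["rows"]:                        # per-line outputs as dicts
    print(row)
for metric, value in result["metrics"].items():   # aggregated metrics
    print(metric, value)
if result["studio_url"]:                          # set only when results were logged to a project
    print("View in studio:", result["studio_url"])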
@@ -769,27 +970,46 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          }
      )

-     if target is not None:
-         _validate_columns_for_target(input_data_df, target)
-
      # Create default configuration for evaluators that directly maps
      # input data names to keyword inputs of the same name in the evaluators.
      column_mapping = column_mapping or {}
      column_mapping.setdefault("default", {})

-     target_run: Optional[Run] = None
+     # Split normal evaluators and OAI graders
+     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+     input_data_df = _validate_and_load_data(
+         target,
+         data,
+         evaluators_and_graders,
+         output_path,
+         azure_ai_project,
+         evaluation_name
+     )
+     if target is not None:
+         _validate_columns_for_target(input_data_df, target)
+
+     target_run: Optional[BatchClientRun] = None
      target_generated_columns: Set[str] = set()
      batch_run_client: BatchClient
      batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
-     if data is not None and target is not None:
-         # Right now, only the ProxyClient that uses Promptflow supports a target function
+     if kwargs.pop("_use_run_submitter_client", False):
+         batch_run_client = RunSubmitterClient()
+         batch_run_data = input_data_df
+     elif kwargs.pop("_use_pf_client", True):
          batch_run_client = ProxyClient(user_agent=USER_AGENT)
+         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+         # multiple evaluators. If the path is already absolute, abspath will return the original path.
          batch_run_data = os.path.abspath(data)
+     else:
+         batch_run_client = CodeClient()
+         batch_run_data = input_data_df

+     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+     if data is not None and target is not None:
          input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-             target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
+             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
          )

          for evaluator_name, mapping in column_mapping.items():
@@ -803,17 +1023,6 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
              # customer did not mapped target output.
              if col not in mapping and run_output not in mapped_to_values:
                  column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
-     elif kwargs.pop("_use_run_submitter_client", False):
-         batch_run_client = RunSubmitterClient()
-         batch_run_data = input_data_df
-     elif kwargs.pop("_use_pf_client", True):
-         batch_run_client = ProxyClient(user_agent=USER_AGENT)
-         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-         # multiple evaluators. If the path is already absolute, abspath will return the original path.
-         batch_run_data = os.path.abspath(data)
-     else:
-         batch_run_client = CodeClient()
-         batch_run_data = input_data_df

      # After we have generated all columns, we can check if we have everything we need for evaluators.
      _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -829,6 +1038,29 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
              column_mapping["default"][col] = f"${{data.{col}}}"

+     return __ValidatedData(
+         evaluators=evaluators,
+         graders=graders,
+         input_data_df=input_data_df,
+         column_mapping=column_mapping,
+         target_run=target_run,
+         batch_run_client=batch_run_client,
+         batch_run_data=batch_run_data,
+     )
+
+
+ def _run_callable_evaluators(
+     validated_data: __ValidatedData,
+     fail_on_evaluator_errors: bool = False,
+     **kwargs,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+     # Extract needed values
+     batch_run_client = validated_data["batch_run_client"]
+     target_run = validated_data["target_run"]
+     batch_run_data = validated_data["batch_run_data"]
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
      with EvalRunContext(batch_run_client):
          runs = {
              evaluator_name: batch_run_client.run(
@@ -889,31 +1121,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      # Rename columns, generated by target function to outputs instead of inputs.
      # If target generates columns, already present in the input data, these columns
      # will be marked as outputs already so we do not need to rename them.
-     input_data_df = _rename_columns_conditionally(input_data_df)
-
-     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-     metrics.update(evaluators_metric)
-
-     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-     target_run: Optional[Run] = None
-     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-     studio_url = None
-     if trace_destination:
-         studio_url = _log_metrics_and_instance_results(
-             metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-         )

-     result_df_dict = result_df.to_dict("records")
-     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
+     input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+     eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+     eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+     eval_metrics.update(evaluators_metric)

-     _print_summary(per_evaluator_results)
-
-     if output_path:
-         _write_output(output_path, result)
+     return eval_result_df, eval_metrics, per_evaluator_results

-     return result
+ def _map_names_to_builtins(
+     evaluators: Dict[str, Callable],
+     graders: Dict[str, AzureOpenAIGrader],
+ ) -> Dict[str, str]:
+     """
+     Construct a mapping from user-supplied evaluator names to which known, built-in
+     evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+     mapped to the "unknown" value.

+     :param evaluators: The dictionary of evaluators.
+     :type evaluators: Dict[str, Callable]
+     :param graders: The dictionary of graders.
+     :type graders: Dict[str, AzureOpenAIGrader]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+     """
+     from .._eval_mapping import EVAL_CLASS_MAP
+     name_map = {}
+
+     for name, evaluator in evaluators.items():
+         # Check if the evaluator is a known built-in evaluator
+         found_eval = False
+         for eval_class, eval_id in EVAL_CLASS_MAP.items():
+             if isinstance(evaluator, eval_class):
+                 name_map[name] = eval_id
+                 found_eval = True
+                 break
+         if not found_eval:
+             # If not found, map to "unknown"
+             name_map[name] = "unknown"
+
+     for name, grader in graders.items():
+         name_map[name] = grader.id
+
+     return name_map


  def _turn_error_logs_into_exception(log_path: str) -> None:
@@ -929,4 +1180,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
          target=ErrorTarget.EVALUATE,
          category=ErrorCategory.FAILED_EXECUTION,
          blame=ErrorBlame.UNKNOWN,
-     )
+     )