azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff covers the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Note: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -8,20 +8,26 @@ import os
  import re
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+ from openai import OpenAI, AzureOpenAI
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
  from azure.ai.evaluation._legacy._adapters.entities import Run
  import pandas as pd

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+ from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
  from .._constants import (
      CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+     EVALUATION_PASS_FAIL_MAPPING,
      EvaluationMetrics,
      DefaultOpenEncoding,
      Prefixes,
      _InternalEvaluationMetrics,
+     BINARY_AGGREGATE_SUFFIX,
+     DEFAULT_OAI_EVAL_RUN_NAME
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
@@ -29,7 +35,6 @@ from ._batch_run import (
      EvalRunContext,
      CodeClient,
      ProxyClient,
-     ProxyRun,
      TargetRunContext,
      RunSubmitterClient,
  )
@@ -38,16 +43,22 @@ from ._utils import (
      _log_metrics_and_instance_results,
      _trace_destination_from_project_scope,
      _write_output,
-     DataLoaderFactory,
+     DataLoaderFactory, _log_metrics_and_instance_results_onedp,
  )
- from ._batch_run.batch_clients import BatchClient
+ from ._batch_run.batch_clients import BatchClient, BatchClientRun

+ from ._evaluate_aoai import (
+     _begin_aoai_evaluation,
+     _split_evaluators_and_grader_configs,
+     _get_evaluation_run_results,
+     OAIEvalRunCreationInfo
+ )
  LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
  # originating column name, usually because the aggregation of the original value
  # means something sufficiently different.
- # Note that content safety metrics are handled seprately.
+ # Note that content safety metrics are handled separately.
  METRIC_COLUMN_NAME_REPLACEMENTS = {
      "groundedness_pro_label": "groundedness_pro_passing_rate",
  }
@@ -58,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
      metrics: Dict[str, Any]
      run_summary: Dict[str, Any]

+ class __ValidatedData(TypedDict):
+     '''
+     Simple dictionary that contains ALL pre-processed data and
+     the resultant objects that are needed for downstream evaluation.
+     '''
+     evaluators: Dict[str, Callable]
+     graders: Dict[str, AzureOpenAIGrader]
+     input_data_df: pd.DataFrame
+     column_mapping: Dict[str, Dict[str, str]]
+     target_run: Optional[BatchClientRun]
+     batch_run_client: BatchClient
+     batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

  def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
      """Identify and average various metrics that need to have the metric name be replaced,
@@ -117,7 +141,6 @@ def _aggregate_content_safety_metrics(
          module = inspect.getmodule(evaluators[evaluator_name])
          if (
              module
-             and module.__name__.startswith("azure.ai.evaluation.")
              and metric_name.endswith("_score")
              and metric_name.replace("_score", "") in content_safety_metrics
          ):
@@ -208,6 +231,48 @@ def _process_rows(row, detail_defect_rates):
      return detail_defect_rates


+ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+     """
+     Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+     For each evaluator, calculates the proportion of "pass" results.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: A dictionary mapping evaluator names to the proportion of pass results.
+     :rtype: Dict[str, float]
+     """
+     results = {}
+
+     # Find all columns that end with "_result"
+     result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+     for col in result_columns:
+         # Extract the evaluator name from the column name
+         # (outputs.<evaluator>.<metric>_result)
+         parts = col.split(".")
+         evaluator_name = None
+         if len(parts) >= 3:
+             evaluator_name = parts[1]
+         else:
+             LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+             continue
+         if evaluator_name:
+             # Count the occurrences of each unique value (pass/fail)
+             value_counts = df[col].value_counts().to_dict()
+
+             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+             total_rows = len(df)
+             pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+             proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+             # Set the result with the evaluator name as the key
+             result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+             results[result_key] = round(proportion, 2)
+
+     return results
+
+
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
      On top of naively calculating the mean of most metrics, this function also identifies certain columns
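
The _aggregation_binary_output helper added in the hunk above is what produces the new per-evaluator pass-rate metrics. A minimal standalone sketch of the same computation, assuming EVALUATION_PASS_FAIL_MAPPING[True] resolves to the string "pass" and BINARY_AGGREGATE_SUFFIX to "binary_aggregate" (both constants are introduced in _constants.py in this release and their values are not shown in this diff):

    import pandas as pd

    df = pd.DataFrame({
        "outputs.relevance.relevance_result": ["pass", "fail", "pass", "pass"],
        "outputs.fluency.fluency_result": ["fail", "fail", "pass", "pass"],
    })

    results = {}
    for col in [c for c in df.columns if c.startswith("outputs.") and c.endswith("_result")]:
        evaluator_name = col.split(".")[1]
        pass_count = df[col].value_counts().to_dict().get("pass", 0)  # assumed pass label
        results[f"{evaluator_name}.binary_aggregate"] = round(pass_count / len(df), 2)

    print(results)  # {'relevance.binary_aggregate': 0.75, 'fluency.binary_aggregate': 0.5}
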
@@ -221,6 +286,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      :return: The aggregated metrics.
      :rtype: Dict[str, float]
      """
+     binary_metrics = _aggregation_binary_output(df)
+
      df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

      handled_columns = []
@@ -248,6 +315,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      metrics = mean_value.to_dict()
      # Add defect rates back into metrics
      metrics.update(defect_rates)
+
+     # Add binary threshold metrics based on pass/fail results
+     metrics.update(binary_metrics)
+
      return metrics

@@ -486,12 +557,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj


  def _apply_target_to_data(
      target: Callable,
-     data: Union[str, os.PathLike],
+     data: Union[str, os.PathLike, pd.DataFrame],
      batch_client: BatchClient,
      initial_data: pd.DataFrame,
      evaluation_name: Optional[str] = None,
      **kwargs,
- ) -> Tuple[pd.DataFrame, Set[str], Run]:
+ ) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
      """
      Apply the target function to the data set and return updated data and generated columns.

@@ -509,24 +580,18 @@ def _apply_target_to_data(
      :rtype: Tuple[pandas.DataFrame, List[str]]
      """

-     if not isinstance(batch_client, ProxyClient):
-         raise ValueError("Only ProxyClient supports target runs for now.")
-
      _run_name = kwargs.get("_run_name")
-     with TargetRunContext():
-         run = cast(
-             ProxyRun,
-             batch_client.run(
-                 flow=target,
-                 display_name=evaluation_name,
-                 data=data,
-                 stream=True,
-                 name=_run_name,
-             ),
+     with TargetRunContext(batch_client):
+         run: BatchClientRun = batch_client.run(
+             flow=target,
+             display_name=evaluation_name,
+             data=data,
+             stream=True,
+             name=_run_name,
+             evaluator_name=getattr(target, "__qualname__", "TARGET"),
          )
-
-     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
-     run_summary = batch_client.get_run_summary(run)
+         target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+         run_summary = batch_client.get_run_summary(run)

      if run_summary["completed_lines"] == 0:
          msg = (
@@ -557,7 +622,7 @@ def _apply_target_to_data(
      # Concatenate output to input
      target_output = pd.concat([target_output, initial_data], axis=1)

-     return target_output, generated_columns, run.run.result()
+     return target_output, generated_columns, run


  def _process_column_mappings(
@@ -573,7 +638,7 @@ def _process_column_mappings(

      processed_config: Dict[str, Dict[str, str]] = {}

-     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
+     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

      if column_mapping:
          for evaluator, mapping_config in column_mapping.items():
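
The widened expected_references pattern above now accepts digits in referenced column names, which the 1.5.0 pattern rejected. A quick check of the before/after behaviour:

    import re

    old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
    new_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

    reference = "${data.context_1}"
    print(bool(old_pattern.match(reference)))  # False: digits were rejected in 1.5.0
    print(bool(new_pattern.match(reference)))  # True: accepted in 1.7.0
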
@@ -625,11 +690,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
  def evaluate(
      *,
      data: Union[str, os.PathLike],
-     evaluators: Dict[str, Callable],
+     evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
@@ -641,8 +706,9 @@ def evaluate(
          JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
      :paramtype data: str
      :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-         and value as the evaluator function. Required.
-     :paramtype evaluators: Dict[str, Callable]
+         and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+         Required.
+     :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
      :keyword evaluation_name: Display name of the evaluation.
      :paramtype evaluation_name: Optional[str]
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
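
As the updated docstring notes, the evaluators mapping may now mix plain callables with AzureOpenAIGrader instances, and azure_ai_project may be a project URL string. A rough sketch of such a call; answer_length is a hypothetical custom evaluator, and my_grader stands in for an AzureOpenAIGrader built with the grader classes added under azure/ai/evaluation/_aoai in this release (their constructors are not shown in this hunk):

    from azure.ai.evaluation import evaluate

    def answer_length(*, response: str, **kwargs):
        # Hypothetical callable evaluator; callables are handled locally as before.
        return {"answer_length": len(response)}

    my_grader = ...  # assumed: an AzureOpenAIGrader instance constructed elsewhere

    result = evaluate(
        data="data.jsonl",
        evaluators={
            "length": answer_length,  # plain callable, runs through the local batch clients
            "checker": my_grader,     # AzureOpenAIGrader, routed to a remote Azure OpenAI eval run
        },
        # The URL form of azure_ai_project described in the docstring (placeholders to be substituted):
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )
    print(result["metrics"])
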
@@ -672,14 +738,24 @@ def evaluate(
              :end-before: [END evaluate_method]
              :language: python
              :dedent: 8
-             :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
+             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START evaluate_method]
+             :end-before: [END evaluate_method]
+             :language: python
+             :dedent: 8
+             :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
      """
      try:
          return _evaluate(
              evaluation_name=evaluation_name,
              target=target,
              data=data,
-             evaluators=evaluators,
+             evaluators_and_graders=evaluators,
              evaluator_config=evaluator_config,
              azure_ai_project=azure_ai_project,
              output_path=output_path,
@@ -744,23 +820,157 @@ def _print_fail_flag_warning() -> None:

  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      *,
-     evaluators: Dict[str, Callable],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      data: Union[str, os.PathLike],
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
  ) -> EvaluationResult:
      if fail_on_evaluator_errors:
          _print_fail_flag_warning()
-     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+     # Turn inputted mess of data into a dataframe, apply targets if needed
+     # split graders and evaluators, and verify that column mappings are sensible.
+     validated_data = _preprocess_data(
+         data=data,
+         evaluators_and_graders=evaluators_and_graders,
+         evaluator_config=evaluator_config,
+         target=target,
+         output_path=output_path,
+         azure_ai_project=azure_ai_project,
+         evaluation_name=evaluation_name,
+         **kwargs,
+     )
+
+     # extract relevant info from validated data
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
+     graders = validated_data["graders"]
+     input_data_df = validated_data["input_data_df"]
+     results_df = pd.DataFrame()
+     metrics: Dict[str, float] = {}
+     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+     # Start OAI eval runs if any graders are present.
+     need_oai_run = len(graders) > 0
+     need_local_run = len(evaluators) > 0
+     need_get_oai_results = False
+     got_local_results = False
+     if need_oai_run:
+         try:
+             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+             eval_run_info_list = _begin_aoai_evaluation(
+                 graders,
+                 column_mapping,
+                 input_data_df,
+                 aoi_name
+             )
+             need_get_oai_results = len(eval_run_info_list) > 0
+         except EvaluationException as e:
+             if need_local_run:
+                 # If there are normal evaluators, don't stop execution and try to run
+                 # those.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                     " Continuing with local evaluators.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+     if need_local_run:
+         try:
+             eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                 validated_data=validated_data,
+                 fail_on_evaluator_errors=fail_on_evaluator_errors
+             )
+             results_df = eval_result_df
+             metrics = eval_metrics
+             got_local_results = True
+             # TODO figure out how to update this printing to include OAI results?
+             _print_summary(per_evaluator_results)
+         except EvaluationException as e:
+             if need_get_oai_results:
+                 # If there are OAI graders, we only print a warning on local failures.
+                 LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Retrieve OAI eval run results if needed.
+     if need_get_oai_results:
+         try:
+             aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+             # Combine results if both evaluators and graders are present
+             if len(evaluators) > 0:
+                 results_df = pd.concat([results_df, aoai_results], axis=1)
+                 metrics.update(aoai_metrics)
+             else:
+                 # Otherwise combine aoai results with input data df to include input columns in outputs.
+                 results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                 metrics = aoai_metrics
+         except EvaluationException as e:
+             if got_local_results:
+                 # If there are local eval results, we only print a warning on OAI failure.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Done with all evaluations, message outputs into final forms, and log results if needed.
+     name_map = _map_names_to_builtins(evaluators, graders)
+     if is_onedp_project(azure_ai_project):
+         studio_url = _log_metrics_and_instance_results_onedp(
+             metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+         )
+     else:
+         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+         trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+         studio_url = None
+         if trace_destination:
+             studio_url = _log_metrics_and_instance_results(
+                 metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+             )
+
+     result_df_dict = results_df.to_dict("records")
+     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+     if output_path:
+         _write_output(output_path, result)
+
+     return result
+

+ def _preprocess_data(
+     data: Union[str, os.PathLike],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+     target: Optional[Callable] = None,
+     output_path: Optional[Union[str, os.PathLike]] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+     evaluation_name: Optional[str] = None,
+     **kwargs,
+ ) -> __ValidatedData:
      # Process evaluator config to replace ${target.} with ${data.}
      if evaluator_config is None:
          evaluator_config = {}
+
+     input_data_df = _validate_and_load_data(
+         target,
+         data,
+         evaluators_and_graders,
+         output_path,
+         azure_ai_project,
+         evaluation_name
+     )
+     if target is not None:
+         _validate_columns_for_target(input_data_df, target)
+
      # extract column mapping dicts into dictionary mapping evaluator name to column mapping
      column_mapping = _process_column_mappings(
          {
@@ -769,27 +979,35 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          }
      )

-     if target is not None:
-         _validate_columns_for_target(input_data_df, target)
-
      # Create default configuration for evaluators that directly maps
      # input data names to keyword inputs of the same name in the evaluators.
      column_mapping = column_mapping or {}
      column_mapping.setdefault("default", {})

-     target_run: Optional[Run] = None
+     # Split normal evaluators and OAI graders
+     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+     target_run: Optional[BatchClientRun] = None
      target_generated_columns: Set[str] = set()
      batch_run_client: BatchClient
      batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
-     if data is not None and target is not None:
-         # Right now, only the ProxyClient that uses Promptflow supports a target function
+     if kwargs.pop("_use_run_submitter_client", False):
+         batch_run_client = RunSubmitterClient()
+         batch_run_data = input_data_df
+     elif kwargs.pop("_use_pf_client", True):
          batch_run_client = ProxyClient(user_agent=USER_AGENT)
+         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+         # multiple evaluators. If the path is already absolute, abspath will return the original path.
          batch_run_data = os.path.abspath(data)
+     else:
+         batch_run_client = CodeClient()
+         batch_run_data = input_data_df

+     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+     if data is not None and target is not None:
          input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-             target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
+             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
          )

      for evaluator_name, mapping in column_mapping.items():
@@ -803,17 +1021,6 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                  # customer did not mapped target output.
                  if col not in mapping and run_output not in mapped_to_values:
                      column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
-     elif kwargs.pop("_use_run_submitter_client", False):
-         batch_run_client = RunSubmitterClient()
-         batch_run_data = input_data_df
-     elif kwargs.pop("_use_pf_client", True):
-         batch_run_client = ProxyClient(user_agent=USER_AGENT)
-         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-         # multiple evaluators. If the path is already absolute, abspath will return the original path.
-         batch_run_data = os.path.abspath(data)
-     else:
-         batch_run_client = CodeClient()
-         batch_run_data = input_data_df

      # After we have generated all columns, we can check if we have everything we need for evaluators.
      _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -829,6 +1036,29 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
              column_mapping["default"][col] = f"${{data.{col}}}"

+     return __ValidatedData(
+         evaluators=evaluators,
+         graders=graders,
+         input_data_df=input_data_df,
+         column_mapping=column_mapping,
+         target_run=target_run,
+         batch_run_client=batch_run_client,
+         batch_run_data=batch_run_data,
+     )
+
+
+ def _run_callable_evaluators(
+     validated_data: __ValidatedData,
+     fail_on_evaluator_errors: bool = False,
+     **kwargs,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+     # Extract needed values
+     batch_run_client = validated_data["batch_run_client"]
+     target_run = validated_data["target_run"]
+     batch_run_data = validated_data["batch_run_data"]
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
      with EvalRunContext(batch_run_client):
          runs = {
              evaluator_name: batch_run_client.run(
@@ -889,31 +1119,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      # Rename columns, generated by target function to outputs instead of inputs.
      # If target generates columns, already present in the input data, these columns
      # will be marked as outputs already so we do not need to rename them.
-     input_data_df = _rename_columns_conditionally(input_data_df)
-
-     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-     metrics.update(evaluators_metric)
-
-     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-     target_run: Optional[Run] = None
-     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-     studio_url = None
-     if trace_destination:
-         studio_url = _log_metrics_and_instance_results(
-             metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-         )

-     result_df_dict = result_df.to_dict("records")
-     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+     input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+     eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+     eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+     eval_metrics.update(evaluators_metric)

-     _print_summary(per_evaluator_results)
-
-     if output_path:
-         _write_output(output_path, result)
+     return eval_result_df, eval_metrics, per_evaluator_results

-     return result
+
+ def _map_names_to_builtins(
+     evaluators: Dict[str, Callable],
+     graders: Dict[str, AzureOpenAIGrader],
+ ) -> Dict[str, str]:
+     """
+     Construct a mapping from user-supplied evaluator names to which known, built-in
+     evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+     mapped to the "unknown" value.

+     :param evaluators: The dictionary of evaluators.
+     :type evaluators: Dict[str, Callable]
+     :param graders: The dictionary of graders.
+     :type graders: Dict[str, AzureOpenAIGrader]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+     """
+     from .._eval_mapping import EVAL_CLASS_MAP
+     name_map = {}
+
+     for name, evaluator in evaluators.items():
+         # Check if the evaluator is a known built-in evaluator
+         found_eval = False
+         for eval_class, eval_id in EVAL_CLASS_MAP.items():
+             if isinstance(evaluator, eval_class):
+                 name_map[name] = eval_id
+                 found_eval = True
+                 break
+         if not found_eval:
+             # If not found, map to "unknown"
+             name_map[name] = "unknown"
+
+     for name, grader in graders.items():
+         name_map[name] = grader.id
+
+     return name_map
+

  def _turn_error_logs_into_exception(log_path: str) -> None:
      """Produce an EvaluationException using the contents of the inputted
@@ -929,4 +1178,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
          target=ErrorTarget.EVALUATE,
          category=ErrorCategory.FAILED_EXECUTION,
          blame=ErrorBlame.UNKNOWN,
-     )
+     )
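
Stepping back from the final hunk: the _map_names_to_builtins helper added earlier in this file produces the name_map that the new logging calls receive. A toy illustration of the mapping rule with stand-in classes (EVAL_CLASS_MAP lives in the new _eval_mapping.py, whose contents are not shown inline here, so the entries below are hypothetical):

    class FakeRelevanceEvaluator:  # stands in for a built-in evaluator class
        pass

    class FakeGrader:              # stands in for an AzureOpenAIGrader
        id = "some_grader_id"      # the real id format is not shown in this diff

    EVAL_CLASS_MAP = {FakeRelevanceEvaluator: "builtin.relevance"}  # hypothetical entry

    def map_names(evaluators, graders):
        name_map = {}
        for name, ev in evaluators.items():
            name_map[name] = next(
                (eval_id for eval_class, eval_id in EVAL_CLASS_MAP.items() if isinstance(ev, eval_class)),
                "unknown",
            )
        for name, grader in graders.items():
            name_map[name] = grader.id
        return name_map

    print(map_names(
        {"rel": FakeRelevanceEvaluator(), "custom": lambda **kw: {}},
        {"check": FakeGrader()},
    ))
    # {'rel': 'builtin.relevance', 'custom': 'unknown', 'check': 'some_grader_id'}
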