azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (150)
  1. azure/ai/evaluation/__init__.py +9 -16
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +5 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +159 -29
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +80 -2
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/__init__.py +1 -1
  59. azure/ai/evaluation/_converters/_ai_services.py +4 -4
  60. azure/ai/evaluation/_eval_mapping.py +71 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  63. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
  64. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  65. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  66. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  67. azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
  68. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
  69. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  70. azure/ai/evaluation/_evaluate/_utils.py +120 -7
  71. azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
  72. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  76. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
  77. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  78. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
  79. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
  80. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
  81. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
  82. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  83. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  84. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
  86. azure/ai/evaluation/_exceptions.py +2 -0
  87. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  88. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  89. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  90. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  91. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  92. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  94. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  95. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  96. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  97. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  98. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  104. azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  106. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  107. azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
  108. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  109. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  114. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
  115. azure/ai/evaluation/_version.py +1 -1
  116. azure/ai/evaluation/red_team/__init__.py +19 -0
  117. azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
  118. azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
  119. azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
  120. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  121. azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
  122. azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
  123. azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
  124. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  125. azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  127. azure/ai/evaluation/simulator/_constants.py +1 -0
  128. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  129. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. azure/ai/evaluation/simulator/_simulator.py +1 -1
  140. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
  141. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
  142. azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
  143. azure/ai/evaluation/simulator/_tracing.py +0 -89
  144. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  145. /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
  146. /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
  147. /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
  148. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
  149. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
  150. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
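
Of the files listed above, the diff reproduced below is for azure/ai/evaluation/_evaluate/_evaluate.py (entry 67, +372 -105). It reworks evaluate() so that the evaluators dictionary may mix ordinary callables with AzureOpenAIGrader instances, splits the two groups, submits the graders as remote Azure OpenAI eval runs, and merges both result sets. The sketch that follows is a minimal illustration of how the updated signature might be called; the data file name is hypothetical, and the commented-out grader entry assumes an AzureOpenAIGrader subclass instance constructed elsewhere, since the grader constructors themselves are not part of this diff.

# Minimal sketch of the 1.6.0 evaluate() call shape; "eval.jsonl" and the
# "response" column it is assumed to contain are illustrative only.
from azure.ai.evaluation import evaluate

def answer_length(*, response: str, **kwargs):
    # Any callable keyed by an alias can still be used as an evaluator.
    return {"length": len(response)}

result = evaluate(
    data="eval.jsonl",                  # JSONL/CSV path, as before
    evaluators={
        "answer_length": answer_length,  # local callable evaluator
        # "string_check": grader,        # AzureOpenAIGrader instance (new in this release)
    },
    evaluation_name="local-and-grader-run",
)
print(result["metrics"])  # EvaluationResult exposes "rows", "metrics", and "studio_url"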
@@ -6,43 +6,59 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+from openai import OpenAI, AzureOpenAI
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
+    BINARY_AGGREGATE_SUFFIX,
+    DEFAULT_OAI_EVAL_RUN_NAME
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
 )
+from ._batch_run.batch_clients import BatchClient, BatchClientRun

-TClient = TypeVar("TClient", ProxyClient, CodeClient)
+from ._evaluate_aoai import (
+    _begin_aoai_evaluation,
+    _split_evaluators_and_grader_configs,
+    _get_evaluation_run_results,
+    OAIEvalRunCreationInfo
+)

 LOGGER = logging.getLogger(__name__)

 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
 # means something sufficiently different.
-# Note that content safety metrics are handled seprately.
+# Note that content safety metrics are handled separately.
 METRIC_COLUMN_NAME_REPLACEMENTS = {
     "groundedness_pro_label": "groundedness_pro_passing_rate",
 }
@@ -53,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]

+class __ValidatedData(TypedDict):
+    '''
+    Simple dictionary that contains ALL pre-processed data and
+    the resultant objects that are needed for downstream evaluation.
+    '''
+    evaluators: Dict[str, Callable]
+    graders: Dict[str, AzureOpenAIGrader]
+    input_data_df: pd.DataFrame
+    column_mapping: Dict[str, Dict[str, str]]
+    target_run: Optional[BatchClientRun]
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

 def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     """Identify and average various metrics that need to have the metric name be replaced,
@@ -71,7 +100,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +151,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -161,37 +190,40 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
-        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
             details_cols = col

     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
-
+
     if details_cols:
         details_df = df[details_cols]
         detail_defect_rates = {}
-
+
         for key, value in details_df.items():
             _process_rows(value, detail_defect_rates)
-
+
         for key, value in detail_defect_rates.items():
             col_with_boolean_values = pd.to_numeric(value, errors="coerce")
             try:
-                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
             except EvaluationException:  # only exception that can be cause is all NaN values
                 msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
                 LOGGER.warning(msg)
-
+
     return label_cols, defect_rates

+
 def _process_rows(row, detail_defect_rates):
     for key, value in row.items():
         if key not in detail_defect_rates:
@@ -199,6 +231,49 @@ def _process_rows(row, detail_defect_rates):
         detail_defect_rates[key].append(value)
     return detail_defect_rates

+
+def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+    """
+    Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+    For each evaluator, calculates the proportion of "pass" results.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A dictionary mapping evaluator names to the proportion of pass results.
+    :rtype: Dict[str, float]
+    """
+    results = {}
+
+    # Find all columns that end with "_result"
+    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+    for col in result_columns:
+        # Extract the evaluator name from the column name
+        # (outputs.<evaluator>.<metric>_result)
+        parts = col.split(".")
+        evaluator_name = None
+        if len(parts) >= 3:
+            evaluator_name = parts[1]
+        else:
+            LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+            continue
+        if evaluator_name:
+            # Count the occurrences of each unique value (pass/fail)
+            value_counts = df[col].value_counts().to_dict()
+
+            # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+            total_rows = len(df)
+            pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+            proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+            # Set the result with the evaluator name as the key
+            result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+            results[result_key] = round(proportion, 2)
+
+    return results
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -212,6 +287,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
+    binary_metrics = _aggregation_binary_output(df)
+
     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

     handled_columns = []
@@ -239,6 +316,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
     metrics.update(defect_rates)
+
+    # Add binary threshold metrics based on pass/fail results
+    metrics.update(binary_metrics)
+
     return metrics


@@ -330,7 +411,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -477,12 +558,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _apply_target_to_data(
     target: Callable,
-    data: Union[str, os.PathLike],
-    batch_client: TClient,
+    data: Union[str, os.PathLike, pd.DataFrame],
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
-) -> Tuple[pd.DataFrame, Set[str], Run]:
+) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -499,18 +580,19 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
     _run_name = kwargs.get("_run_name")
-    with TargetRunContext():
-        run: ProxyRun = batch_client.run(
+    with TargetRunContext(batch_client):
+        run: BatchClientRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
             stream=True,
             name=_run_name,
+            evaluator_name=getattr(target, "__qualname__", "TARGET"),
         )
-
-    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
-    run_summary = batch_client.get_run_summary(run)
+        target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+        run_summary = batch_client.get_run_summary(run)

     if run_summary["completed_lines"] == 0:
         msg = (
@@ -541,7 +623,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run.run.result()
+    return target_output, generated_columns, run


 def _process_column_mappings(
@@ -557,7 +639,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -606,15 +688,14 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df


-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
-    evaluators: Dict[str, Callable],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
@@ -626,8 +707,9 @@ def evaluate(
         JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function. Required.
-    :paramtype evaluators: Dict[str, Callable]
+        and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+        Required.
+    :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -664,7 +746,7 @@ def evaluate(
         evaluation_name=evaluation_name,
         target=target,
         data=data,
-        evaluators=evaluators,
+        evaluators_and_graders=evaluators,
        evaluator_config=evaluator_config,
        azure_ai_project=azure_ai_project,
        output_path=output_path,
@@ -729,23 +811,157 @@ def _print_fail_flag_warning() -> None:

 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
-    evaluators: Dict[str, Callable],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-    input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+    # Turn inputted mess of data into a dataframe, apply targets if needed
+    # split graders and evaluators, and verify that column mappings are sensible.
+    validated_data = _preprocess_data(
+        data=data,
+        evaluators_and_graders=evaluators_and_graders,
+        evaluator_config=evaluator_config,
+        target=target,
+        output_path=output_path,
+        azure_ai_project=azure_ai_project,
+        evaluation_name=evaluation_name,
+        **kwargs,
+    )
+
+    # extract relevant info from validated data
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    graders = validated_data["graders"]
+    input_data_df = validated_data["input_data_df"]
+    results_df = pd.DataFrame()
+    metrics: Dict[str, float] = {}
+    eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+    # Start OAI eval runs if any graders are present.
+    need_oai_run = len(graders) > 0
+    need_local_run = len(evaluators) > 0
+    need_get_oai_results = False
+    got_local_results = False
+    if need_oai_run:
+        try:
+            aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+            eval_run_info_list = _begin_aoai_evaluation(
+                graders,
+                column_mapping,
+                input_data_df,
+                aoi_name
+            )
+            need_get_oai_results = len(eval_run_info_list) > 0
+        except EvaluationException as e:
+            if need_local_run:
+                # If there are normal evaluators, don't stop execution and try to run
+                # those.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                               " Continuing with local evaluators.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+    if need_local_run:
+        try:
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data,
+                fail_on_evaluator_errors=fail_on_evaluator_errors
+            )
+            results_df = eval_result_df
+            metrics = eval_metrics
+            got_local_results = True
+            # TODO figure out how to update this printing to include OAI results?
+            _print_summary(per_evaluator_results)
+        except EvaluationException as e:
+            if need_get_oai_results:
+                # If there are OAI graders, we only print a warning on local failures.
+                LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Retrieve OAI eval run results if needed.
+    if need_get_oai_results:
+        try:
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+            # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+            # Combine results if both evaluators and graders are present
+            if len(evaluators) > 0:
+                results_df = pd.concat([results_df, aoai_results], axis=1)
+                metrics.update(aoai_metrics)
+            else:
+                # Otherwise combine aoai results with input data df to include input columns in outputs.
+                results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                metrics = aoai_metrics
+        except EvaluationException as e:
+            if got_local_results:
+                # If there are local eval results, we only print a warning on OAI failure.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Done with all evaluations, message outputs into final forms, and log results if needed.
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if is_onedp_project(azure_ai_project):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+        )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+            )
+
+    result_df_dict = results_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    if output_path:
+        _write_output(output_path, result)
+
+    return result

+
+def _preprocess_data(
+    data: Union[str, os.PathLike],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    target: Optional[Callable] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+    evaluation_name: Optional[str] = None,
+    **kwargs,
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -754,23 +970,46 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         }
     )

-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})

-    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    # Split normal evaluators and OAI graders
+    evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
+    target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    if kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
+            target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )

     for evaluator_name, mapping in column_mapping.items():
@@ -799,46 +1038,55 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"

-    def eval_batch_run(
-        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
-    ) -> Dict[str, __EvaluatorInfo]:
-        with EvalRunContext(batch_run_client):
-            runs = {
-                evaluator_name: batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-                for evaluator_name, evaluator in evaluators.items()
-            }
+    return __ValidatedData(
+        evaluators=evaluators,
+        graders=graders,
+        input_data_df=input_data_df,
+        column_mapping=column_mapping,
+        target_run=target_run,
+        batch_run_client=batch_run_client,
+        batch_run_data=batch_run_data,
+    )

-            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-            return {
-                evaluator_name: {
-                    "result": batch_run_client.get_details(run, all_results=True),
-                    "metrics": batch_run_client.get_metrics(run),
-                    "run_summary": batch_run_client.get_run_summary(run),
-                }
-                for evaluator_name, run in runs.items()
-            }

-    # Batch Run
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+def _run_callable_evaluators(
+    validated_data: __ValidatedData,
+    fail_on_evaluator_errors: bool = False,
+    **kwargs,
+) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+    # Extract needed values
+    batch_run_client = validated_data["batch_run_client"]
+    target_run = validated_data["target_run"]
+    batch_run_data = validated_data["batch_run_data"]
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
+
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
+            }
+            for evaluator_name, run in runs.items()
+        }

     # Concatenate all results
-    evaluators_result_df = None
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -873,31 +1121,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
     # will be marked as outputs already so we do not need to rename them.
-    input_data_df = _rename_columns_conditionally(input_data_df)
-
-    result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-    metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-    metrics.update(evaluators_metric)
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-    studio_url = None
-    if trace_destination:
-        studio_url = _log_metrics_and_instance_results(
-            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-        )

-    result_df_dict = result_df.to_dict("records")
-    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+    eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+    eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+    eval_metrics.update(evaluators_metric)

-    _print_summary(per_evaluator_results)
+    return eval_result_df, eval_metrics, per_evaluator_results

-    if output_path:
-        _write_output(output_path, result)
+def _map_names_to_builtins(
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
+    """
+    Construct a mapping from user-supplied evaluator names to which known, built-in
+    evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+    mapped to the "unknown" value.

-    return result
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param graders: The dictionary of graders.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+    """
+    from .._eval_mapping import EVAL_CLASS_MAP
+    name_map = {}
+
+    for name, evaluator in evaluators.items():
+        # Check if the evaluator is a known built-in evaluator
+        found_eval = False
+        for eval_class, eval_id in EVAL_CLASS_MAP.items():
+            if isinstance(evaluator, eval_class):
+                name_map[name] = eval_id
+                found_eval = True
+                break
+        if not found_eval:
+            # If not found, map to "unknown"
+            name_map[name] = "unknown"
+
+    for name, grader in graders.items():
+        name_map[name] = grader.id

+    return name_map

 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
@@ -913,4 +1180,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
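
Two smaller behavior changes in this diff are worth noting. First, _aggregation_binary_output adds a per-evaluator pass rate computed from outputs.<evaluator>.<metric>_result columns and keyed as <evaluator>.<BINARY_AGGREGATE_SUFFIX>. Below is a standalone sketch of that aggregation, assuming the pass label is the string "pass" and using "binary_aggregate" as a stand-in for the suffix; neither constant's actual value appears in this diff.

# Sketch only: mirrors the pass-rate logic, with assumed label and suffix values.
import pandas as pd

df = pd.DataFrame({
    "outputs.relevance.relevance_result": ["pass", "fail", "pass", "pass"],
})

metrics = {}
result_columns = [c for c in df.columns if c.startswith("outputs.") and c.endswith("_result")]
for col in result_columns:
    evaluator_name = col.split(".")[1]  # outputs.<evaluator>.<metric>_result
    # Proportion of rows whose result equals the assumed pass label.
    pass_rate = float((df[col] == "pass").mean()) if len(df) else 0.0
    metrics[f"{evaluator_name}.binary_aggregate"] = round(pass_rate, 2)

print(metrics)  # {'relevance.binary_aggregate': 0.75}

Second, the column-reference pattern in _process_column_mappings now accepts digits, so mappings such as ${data.context2} validate where they previously did not. A small check of the two patterns taken from the diff:

import re

old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
new_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

ref = "${data.context2}"
print(bool(old_pattern.match(ref)))  # False: digits were rejected in 1.4.0
print(bool(new_pattern.match(ref)))  # True: accepted after this change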