azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
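The largest single addition is the new azure/ai/evaluation/_evaluate/_evaluate_aoai.py module (entry 67 above, +553 lines), shown in full in the diff below. Its helpers translate evaluate()-style column mappings such as "${data.query}" and "${run.outputs.response}" into the payloads the OpenAI evals API expects. As a rough, illustrative sketch of the shapes involved (not code from the package; the column names and dataframe here are hypothetical), _generate_data_source_config and _get_data_source behave roughly like this:

import pandas as pd

from azure.ai.evaluation._evaluate import _evaluate_aoai as aoai

# Hypothetical evaluate()-style column mapping and a pre-loaded dataframe. "__outputs.<name>"
# is the dataframe column the module expects for target-run outputs.
column_mapping = {"query": "${data.query}", "response": "${run.outputs.response}"}
df = pd.DataFrame({"query": ["What is 2 + 2?"], "__outputs.response": ["4"]})

# Item schema for eval group creation: one required string property per mapped input.
aoai._generate_data_source_config(df, column_mapping)
# -> {"type": "custom",
#     "item_schema": {"type": "object",
#                     "properties": {"query": {"type": "string"},
#                                    "response": {"type": "string"}},
#                     "required": ["query", "response"]}}

# Inline jsonl data source for the eval run: "${data.x}" reads column x, while
# "${run.outputs.y}" reads the "__outputs.y" column produced by a target run.
aoai._get_data_source(df, column_mapping)
# -> {"type": "jsonl",
#     "source": {"type": "file_content",
#                "content": [{"item": {"query": "What is 2 + 2?", "response": "4"}}]}}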
azure/ai/evaluation/_evaluate/_evaluate_aoai.py (new file)
@@ -0,0 +1,553 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import logging
+
+from openai import AzureOpenAI, OpenAI
+import pandas as pd
+from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
+from time import sleep
+
+from ._batch_run import CodeClient, ProxyClient
+
+#import aoai_mapping
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
+
+
+class OAIEvalRunCreationInfo(TypedDict, total=True):
+    """Configuration for an evaluator"""
+
+    client: Union[AzureOpenAI, OpenAI]
+    eval_group_id: str
+    eval_run_id: str
+    grader_name_map: Dict[str, str]
+
+def _split_evaluators_and_grader_configs(
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
+) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
+    """
+    Given a dictionary mapping aliases to evaluators and AOAI graders, identify which is which and return two
+    dictionaries that each contain one subset, the first containing the evaluators and the second containing
+    the AOAI graders. AOAI graders are defined as anything that is an instance of the AzureOpenAIGrader class,
+    including child class instances.
+
+    :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function or AOAI grader.
+    :type evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
+    :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
+    :rtype: Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]
+    """
+    true_evaluators = {}
+    aoai_graders = {}
+    for key, value in evaluators.items():
+        if isinstance(value, AzureOpenAIGrader):
+            aoai_graders[key] = value
+        else:
+            true_evaluators[key] = value
+    return true_evaluators, aoai_graders
+
+@experimental
+def _begin_aoai_evaluation(
+    graders: Dict[str, AzureOpenAIGrader],
+    column_mappings: Optional[Dict[str, Dict[str, str]]],
+    data: pd.DataFrame,
+    run_name: str
+) -> List[OAIEvalRunCreationInfo]:
+    """
+    Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
+    AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
+    results and map those results to the user-supplied names of the graders.
+
+    If any of the graders require unique column mappings, this function will
+    create a separate evaluation run for each grader. Otherwise, all graders
+    will be evaluated in a single run.
+
+    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param column_mappings: The column mappings to use for the evaluation.
+    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
+    :param data: The data to evaluate, preprocessed by the `_validate_and_load_data` method.
+    :type data: pd.DataFrame
+    :param run_name: The name of the evaluation run.
+    :type run_name: str
+    :return: A list of evaluation run info that can be used to retrieve the results of the evaluation later.
+    :rtype: List[OAIEvalRunCreationInfo]
+    """
+
+    LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
+    all_eval_run_info: List[OAIEvalRunCreationInfo] = []
+
+    for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
+        all_eval_run_info.append(_begin_single_aoai_evaluation(
+            selected_graders,
+            data,
+            selected_column_mapping,
+            run_name
+        ))
+
+    return all_eval_run_info
+
+def _begin_single_aoai_evaluation(
+    graders: Dict[str, AzureOpenAIGrader],
+    data: pd.DataFrame,
+    column_mapping: Dict[str, str],
+    run_name: str
+) -> OAIEvalRunCreationInfo:
+    """
+    Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
+    AOAI evaluation runs must be queried for completion, so this returns the identifiers needed to
+    poll for and retrieve the results at a later time.
+
+    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param data: The data to evaluate, preprocessed by the `_validate_and_load_data` method.
+    :type data: pd.DataFrame
+    :param column_mapping: The column mapping to apply to the data before sending it to the AOAI service.
+    :type column_mapping: Dict[str, str]
+    :param run_name: The name of the evaluation run.
+    :type run_name: str
+    :return: The eval group ID and eval run ID of the resultant eval run, as well as a dictionary
+        that maps the OAI service's generated grader names back to the user-supplied evaluator names.
+    :rtype: OAIEvalRunCreationInfo
+    """
+
+    # Format data for eval group creation
+    grader_name_list = []
+    grader_list = []
+    # It's expected that all graders supplied for a single eval run use the same credentials,
+    # so grab a client from the first grader.
+    client = list(graders.values())[0].get_client()
+
+    for name, grader in graders.items():
+        grader_name_list.append(name)
+        grader_list.append(grader._grader_config)
+    data_source_config = _generate_data_source_config(data, column_mapping)
+
+    # Create eval group
+    # import pdb; pdb.set_trace()
+    eval_group_info = client.evals.create(
+        data_source_config=data_source_config,
+        testing_criteria=grader_list,
+        metadata={"is_foundry_eval": "true"}
+    )
+
+    LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
+    # Use eval group info to map grader IDs back to user-assigned names.
+    grader_name_map = {}
+    num_criteria = len(eval_group_info.testing_criteria)
+    if num_criteria != len(grader_name_list):
+        raise EvaluationException(
+            message=f"Number of testing criteria ({num_criteria})" +
+                f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.AOAI_GRADER,
+        )
+    for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
+        grader_name_map[criteria.id] = name
+
+    # Create eval run
+    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
+    LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
+        " Results will be retrieved after normal evaluation is complete...")
+
+    return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
+
+def _get_evaluation_run_results(
+    all_run_info: List[OAIEvalRunCreationInfo]
+) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+    """
+    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
+    pipeline to consume. This method accepts a list of eval run information, and will combine the
+    results into a single dataframe and metrics dictionary.
+
+    :param all_run_info: A list of evaluation run information that contains the needed values
+        to retrieve the results of the evaluation run.
+    :type all_run_info: List[OAIEvalRunCreationInfo]
+    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
+        calculated from the evaluation run.
+    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
+    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
+    """
+
+    run_metrics = {}
+    output_df = pd.DataFrame()
+    for run_info in all_run_info:
+        cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
+        output_df = pd.concat([output_df, cur_output_df], axis=1)
+        run_metrics.update(cur_run_metrics)
+
+    return output_df, run_metrics
+
+def _get_single_run_results(
+    run_info: OAIEvalRunCreationInfo,
+) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+    """
+    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
+    pipeline to consume.
+
+    :param run_info: The evaluation run information that contains the needed values
+        to retrieve the results of the evaluation run.
+    :type run_info: OAIEvalRunCreationInfo
+    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
+        calculated from the evaluation run.
+    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
+    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
+    """
+    # Wait for evaluation run to complete
+    run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+    if run_results.status != "completed":
+        raise EvaluationException(
+            message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
+            + f" failed with status {run_results.status}.",
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.AOAI_GRADER,
+        )
+    LOGGER.info(f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
+        + " completed successfully. Gathering results...")
+    # Convert run results into a dictionary of metrics
+    run_metrics = {}
+    if run_results.per_testing_criteria_results is None:
+        msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
+            " occur when invalid or conflicting models are selected in the model and grader configs."
+            f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.AOAI_GRADER,
+        )
+    for criteria_result in run_results.per_testing_criteria_results:
+        grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
+        passed = criteria_result.passed
+        failed = criteria_result.failed
+        ratio = passed / (passed + failed)
+        formatted_column_name = f"{grader_name}.pass_rate"
+        run_metrics[formatted_column_name] = ratio
+
+    # Get full results and convert them into a dataframe.
+    # Notes on raw full data output from OAI eval runs:
+    # Each row in the full results list is itself a list.
+    # Each entry corresponds to one grader's results from the criteria list
+    # that was passed to the eval group.
+    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
+    # The name is used to figure out which grader the entry refers to; the sample is ignored.
+    # The passed and score values are then added to the results dictionary, prepended with the grader's name
+    # as entered by the user in the inputted dictionary.
+    # Other values, if they exist, are also added to the results dictionary.
+    raw_list_results = run_info["client"].evals.runs.output_items.list(
+        eval_id=run_info["eval_group_id"],
+        run_id=run_info["eval_run_id"]
+    )
+    listed_results = {"index": []}
+    # Raw data has no order guarantees; we need to sort rows by their
+    # datasource_item_id.
+    for row_result in raw_list_results.data:
+        # Add the datasource_item_id for later sorting
+        listed_results["index"].append(row_result.datasource_item_id)
+        for single_grader_row_result in row_result.results:
+            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
+            for name, value in single_grader_row_result.items():
+                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+                    continue
+                if name.lower() == "passed":
+                    # create a `_result` column for each grader
+                    result_column_name = f"outputs.{grader_name}.{grader_name}_result"
+                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                        if (result_column_name not in listed_results):
+                            listed_results[result_column_name] = []
+                        listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
+
+                formatted_column_name = f"outputs.{grader_name}.{name}"
+                if (formatted_column_name not in listed_results):
+                    listed_results[formatted_column_name] = []
+                listed_results[formatted_column_name].append(value)
+    output_df = pd.DataFrame(listed_results)
+    # sort by index
+    output_df = output_df.sort_values('index', ascending=[True])
+    # remove index column
+    output_df.drop(columns=["index"], inplace=True)
+    return output_df, run_metrics
+
+
+def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str, Any]) -> AzureOpenAIGrader:
+    """
+    Helper function for the remote evaluation service.
+    Given a model ID that refers to a specific AOAI grader wrapper class, return an instance of that class
+    using the provided initialization parameters.
+
+    :param grader_id: The model ID that refers to a specific AOAI grader wrapper class.
+    :type grader_id: str
+    :param init_params: The initialization parameters to be used for the AOAI grader wrapper class.
+        Requires that it contain a model_config and grader_config as top-level keys.
+    :type init_params: Dict[str, Any]
+    """
+
+    model_config = init_params.get("model_config", None)
+    if model_config is None:
+        raise EvaluationException(
+            message="Grader converter needs a valid 'model_config' key in init_params.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.AOAI_GRADER,
+        )
+
+    grader_class = _get_grader_class(grader_id)
+    return grader_class(**init_params)
+
+def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
+    """
+    Given a model ID, return the class of the corresponding grader wrapper.
+    """
+
+    from azure.ai.evaluation import (
+        AzureOpenAIGrader,
+        AzureOpenAILabelGrader,
+        AzureOpenAIStringCheckGrader,
+        AzureOpenAITextSimilarityGrader,
+    )
+    id_map = {
+        AzureOpenAIGrader.id: AzureOpenAIGrader,
+        AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
+        AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
+        AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+    }
+
+    for key in id_map.keys():
+        if model_id == key:
+            return id_map[key]
+    raise EvaluationException(
+        message=f"Model ID {model_id} not recognized as an AOAI grader ID",
+        blame=ErrorBlame.USER_ERROR,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.AOAI_GRADER,
+    )
+
+
+def _get_graders_and_column_mappings(
+    graders: Dict[str, AzureOpenAIGrader],
+    column_mappings: Optional[Dict[str, Dict[str, str]]],
+) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
+    """
+    Given a dictionary of column mappings and a dictionary of AOAI graders,
+    split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
+    that must be performed to evaluate the entire dataset.
+
+    Currently this function is fairly naive; it always splits the data if there are multiple
+    graders present and any of them have a unique column mapping.
+
+    This odd separation of data is necessary because our system allows different evaluators
+    to have different dataset columns mapped to the same input name for each evaluator, while
+    the OAI API can't. So, if there's a possibility that such a conflict might arise,
+    we need to split the incoming data up.
+
+    Currently this splits each grader into its own eval group/run to ensure they each use
+    their own credentials later on. A planned fast-follow is to group graders with
+    matching credentials into shared runs.
+
+    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param column_mappings: The column mappings to use for the evaluation.
+    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
+    :return: A list of tuples, each containing a dictionary of AOAI graders
+        and the column mapping they should use.
+    :rtype: List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]
+    """
+
+    default_mapping = column_mappings.get("default", None)
+    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+
+def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
+    """Produce a data source config that maps the columns referenced by the supplied column mapping into
+    the OAI API. Each key in the column mapping becomes a required string property in the item schema.
+
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
+        helper function.
+    :type input_data_df: pd.DataFrame
+    :param column_mapping: The column mapping to use for the evaluation.
+    :type column_mapping: Dict[str, str]
+    :return: A dictionary that can act as data source config for OAI evaluation group creation.
+    :rtype: Dict[str, Any]
+    """
+
+    data_source_config = {
+        "type": "custom",
+        "item_schema": {
+            "type": "object",
+            "properties": {},
+            "required": [],
+        }
+    }
+    properties = data_source_config["item_schema"]["properties"]
+    required = data_source_config["item_schema"]["required"]
+    for key in column_mapping.keys():
+        properties[key] = {
+            "type": "string",
+        }
+        required.append(key)
+    return data_source_config
+
+def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
+    """Produce a data source config that naively maps all columns from the supplied data source into
+    the OAI API.
+
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
+        helper function.
+    :type input_data_df: pd.DataFrame
+    :return: A dictionary that can act as data source config for OAI evaluation group creation.
+    :rtype: Dict[str, Any]
+    """
+
+    properties = {}
+    required = []
+
+    for column in input_data_df.columns:
+        properties[column] = {
+            "type": "string",
+        }
+        required.append(column)
+    data_source_config = {
+        "type": "custom",
+        "item_schema": {
+            "type": "object",
+            "properties": properties,
+            "required": required,
+        }
+    }
+    return data_source_config
+
+def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
+    """
+    Given a dataframe of data to be evaluated and a column mapping,
+    produce a dictionary that can be used as the data source input for an OAI evaluation run.
+
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
+        helper function.
+    :type input_data_df: pd.DataFrame
+    :param column_mapping: The column mapping to use for the evaluation.
+    :type column_mapping: Dict[str, str]
+    :return: A dictionary that can be used as the data source input for an OAI evaluation run.
+    :rtype: Dict[str, Any]
+    """
+    content = []
+    column_to_source_map = {}
+    # Convert from the column mapping's format to figure out the actual column names in the
+    # input dataframe, and map those to the appropriate OAI input names.
+    for name, formatted_entry in column_mapping.items():
+        # Strip "${" from the start and "}" from the end before splitting.
+        entry_pieces = formatted_entry[2:-1].split(".")
+        if len(entry_pieces) == 2 and entry_pieces[0] == "data":
+            column_to_source_map[name] = entry_pieces[1]
+        elif len(entry_pieces) == 3 and entry_pieces[0] == "run" and entry_pieces[1] == "outputs":
+            column_to_source_map[name] = f"__outputs.{entry_pieces[2]}"
+
+    # Using the above mapping, transform the input dataframe into a content
+    # dictionary that'll work in an OAI data source.
+    for row in input_data_df.iterrows():
+        row_dict = {}
+        for oai_key, dataframe_key in column_to_source_map.items():
+            row_dict[oai_key] = str(row[1][dataframe_key])
+        content.append({"item": row_dict})
+
+    return {
+        "type": "jsonl",
+        "source": {
+            "type": "file_content",
+            "content": content,
+        }
+    }
+
+def _begin_eval_run(
+    client: Union[OpenAI, AzureOpenAI],
+    eval_group_id: str,
+    run_name: str,
+    input_data_df: pd.DataFrame,
+    column_mapping: Dict[str, str]
+) -> str:
+    """
+    Given an eval group ID and an input dataframe, use the AOAI API to
+    start an evaluation run with the given name. Returns the ID of the created run,
+    which is later used to poll for its results.
+
+    :param client: The AOAI client to use for the evaluation.
+    :type client: Union[OpenAI, AzureOpenAI]
+    :param eval_group_id: The ID of the evaluation group to use for the evaluation run.
+    :type eval_group_id: str
+    :param run_name: The name of the evaluation run.
+    :type run_name: str
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
+        helper function.
+    :type input_data_df: pd.DataFrame
+    :param column_mapping: The column mapping to apply to the input data.
+    :type column_mapping: Dict[str, str]
+    :return: The ID of the evaluation run.
+    :rtype: str
+    """
+
+    data_source = _get_data_source(input_data_df, column_mapping)
+    eval_run = client.evals.runs.create(
+        eval_id=eval_group_id,
+        data_source=data_source,
+        name=run_name,
+        metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"}
+        # TODO decide if we want to add our own timeout value?
+    )
+    return eval_run.id
+
+# Post built TODO: replace with _red_team.py's retry logic?
+def _wait_for_run_conclusion(
+    client: Union[OpenAI, AzureOpenAI],
+    eval_group_id: str,
+    eval_run_id: str,
+    max_wait_seconds = 21600
+) -> Any:
+    """
+    Perform exponential backoff polling to get the results of an AOAI evaluation run.
+    Raises an EvaluationException if the maximum wait time is reached without receiving a concluding status.
+
+    :param client: The AOAI client to use for the evaluation.
+    :type client: Union[OpenAI, AzureOpenAI]
+    :param eval_group_id: The ID of the evaluation group that contains the evaluation run of interest.
+    :type eval_group_id: str
+    :param eval_run_id: The evaluation run ID to get the results of.
+    :type eval_run_id: str
+    :param max_wait_seconds: The maximum amount of time to wait for the evaluation run to complete.
+    :type max_wait_seconds: int
+    :return: The results of the evaluation run.
+    :rtype: Any
+    """
+
+    LOGGER.info(f"AOAI: Getting OAI eval run results from group/run {eval_group_id}/{eval_run_id}...")
+    total_wait = 0
+    iters = 0
+    # Exponential backoff: the polling interval starts at 3 seconds and grows by 1.5x each round,
+    # with the final sleep trimmed so the total wait never exceeds max_wait_seconds.
+    wait_interval = 3  # Seconds.
+    while(True):
+        wait_interval *= 1.5
+        total_wait += wait_interval
+        # Reduce last wait interval if total wait time exceeds max wait time
+        if total_wait > max_wait_seconds:
+            wait_interval -= total_wait - max_wait_seconds
+        sleep(wait_interval)
+        response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        if response.status not in ["queued", "in_progress"]:
+            return response
+        if total_wait > max_wait_seconds:
+            raise EvaluationException(
+                message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
+                + f" rounds of polling. Final status was {response.status}",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.FAILED_EXECUTION,
+                target=ErrorTarget.AOAI_GRADER,
+            )
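
End-to-end, _evaluate.py is expected to drive this module roughly as follows. This is an illustrative sketch based only on the signatures above, not code from the package; the evaluator dictionary, dataframe, column mappings, and run name are assumed inputs supplied by the caller.

from typing import Any, Callable, Dict, Optional, Tuple, Union

import pandas as pd

from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
from azure.ai.evaluation._evaluate import _evaluate_aoai as aoai


def run_aoai_graders(
    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
    df: pd.DataFrame,
    column_mappings: Optional[Dict[str, Dict[str, str]]],
    run_name: str,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Sketch of the calling sequence around the private helpers above."""
    # Separate plain callables from AzureOpenAIGrader instances.
    callables, graders = aoai._split_evaluators_and_grader_configs(evaluators)

    # One OAI eval group/run is created per grader (see _get_graders_and_column_mappings);
    # the returned info carries the client, eval group ID, run ID, and grader-name map.
    run_infos = aoai._begin_aoai_evaluation(graders, column_mappings, df, run_name)

    # ...the evaluators in `callables` would run through the normal local path here...

    # Poll until each run concludes, then merge the per-grader "outputs.<name>.<field>"
    # columns and "<name>.pass_rate" metrics into a single dataframe and metrics dict.
    return aoai._get_evaluation_run_results(run_infos)

Whether this exact sequence matches _evaluate.py's internals is not shown in this diff; the sketch only ties together the helper signatures introduced above.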