azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,553 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import logging

from openai import AzureOpenAI, OpenAI
import pandas as pd
from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
from time import sleep

from ._batch_run import CodeClient, ProxyClient

#import aoai_mapping
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
from azure.ai.evaluation._common._experimental import experimental


TClient = TypeVar("TClient", ProxyClient, CodeClient)
LOGGER = logging.getLogger(__name__)


class OAIEvalRunCreationInfo(TypedDict, total=True):
    """Configuration for an evaluator"""

    client: Union[AzureOpenAI, OpenAI]
    eval_group_id: str
    eval_run_id: str
    grader_name_map: Dict[str, str]

def _split_evaluators_and_grader_configs(
    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
    """
    Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
    dictionaries that each contain one subset, the first containing the evaluators and the second containing
    the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
    including child class instances.

    :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
        and value as the evaluator function or AOAI grader.
    :type evaluators: Dict[str, Union[Callable, ]]
    :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
    :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
    """
    true_evaluators = {}
    aoai_graders = {}
    for key, value in evaluators.items():
        if isinstance(value, AzureOpenAIGrader):
            aoai_graders[key] = value
        else:
            true_evaluators[key] = value
    return true_evaluators, aoai_graders
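Aside: a minimal standalone sketch of the routing rule above. Anything that is an instance of AzureOpenAIGrader (or a subclass) is treated as an AOAI grader; everything else is kept as a regular callable evaluator. FakeGrader and split below are made-up stand-ins for illustration, not part of the package.

# Illustrative stand-ins only; the real function checks isinstance(value, AzureOpenAIGrader).
class FakeGrader:
    """Stand-in for an AzureOpenAIGrader subclass in this sketch."""

def split(evaluators, grader_type=FakeGrader):
    callables, graders = {}, {}
    for alias, value in evaluators.items():
        (graders if isinstance(value, grader_type) else callables)[alias] = value
    return callables, graders

callables, graders = split({"f1": lambda response, ground_truth: 1.0, "label_check": FakeGrader()})
print(sorted(callables), sorted(graders))  # ['f1'] ['label_check']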
@experimental
def _begin_aoai_evaluation(
    graders: Dict[str, AzureOpenAIGrader],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
    data: pd.DataFrame,
    run_name: str
) -> List[OAIEvalRunCreationInfo]:
    """
    Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
    AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
    results, and map those results to the user-supplied names of the graders.

    If any of the graders require unique column mappings, this function will
    create a separate evaluation run for each grader. Otherwise, all graders
    will be evaluated in a single run.

    :param client: The AOAI client to use for the evaluation.
    :type client: Union[OpenAI, AzureOpenAI]
    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
    :type graders: Dict[str, AoaiGrader]
    :param column_mappings: The column mappings to use for the evaluation.
    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
    :param data: The data to evaluate, preprocessed by the `_validate_and_load_data` method.
    :type data: pd.DataFrame
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :return: A list of evaluation run info that can be used to retrieve the results of the evaluation later
    :rtype: List[OAIEvalRunCreationInfo]
    """

    LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
    all_eval_run_info: List[OAIEvalRunCreationInfo] = []

    for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
        all_eval_run_info.append(_begin_single_aoai_evaluation(
            selected_graders,
            data,
            selected_column_mapping,
            run_name
        ))

    return all_eval_run_info

def _begin_single_aoai_evaluation(
    graders: Dict[str, AzureOpenAIGrader],
    data: pd.DataFrame,
    column_mapping: Dict[str, str],
    run_name: str
) -> OAIEvalRunCreationInfo:
    """
    Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
    AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
    at a later time.

    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
    :type graders: Dict[str, AoaiGrader]
    :param data_source_config: The data source configuration to apply to the
    :type data_source_config: pd.DataFrame
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :return: A tuple containing the eval group ID and eval run ID of the resultant eval run, as well as a dictionary
        that maps the user-supplied evaluators to the names of the graders as generated by the OAI service.
    :rtype: Tuple[str, str, Dict[str, str]]
    """

    # Format data for eval group creation
    grader_name_list = []
    grader_list = []
    # It's expected that all graders supplied for a single eval run use the same credentials
    # so grab a client from the first grader.
    client = list(graders.values())[0].get_client()

    for name, grader in graders.items():
        grader_name_list.append(name)
        grader_list.append(grader._grader_config)
    data_source_config = _generate_data_source_config(data, column_mapping)

    # Create eval group
    # import pdb; pdb.set_trace()
    eval_group_info = client.evals.create(
        data_source_config=data_source_config,
        testing_criteria=grader_list,
        metadata={"is_foundry_eval": "true"}
    )

    LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
    # Use eval group info to map grader IDs back to user-assigned names.
    grader_name_map = {}
    num_criteria = len(eval_group_info.testing_criteria)
    if num_criteria != len(grader_name_list):
        raise EvaluationException(
            message=f"Number of testing criteria ({num_criteria})" +
                f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.AOAI_GRADER,
        )
    for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
        grader_name_map[criteria.id] = name

    # Create eval run
    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
    LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
        " Results will be retrieved after normal evaluation is complete...")

    return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)

def _get_evaluation_run_results(
    all_run_info: List[OAIEvalRunCreationInfo]
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
    pipeline to consume. This method accepts a list of eval run information, and will combine the
    results into a single dataframe and metrics dictionary.

    :param all_run_info: A list of evaluation run information that contains the needed values
        to retrieve the results of the evaluation run.
    :type all_run_info: List[OAIEvalRunCreationInfo]
    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
        calculated from the evaluation run.
    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
    """

    run_metrics = {}
    output_df = pd.DataFrame()
    for run_info in all_run_info:
        cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
        output_df = pd.concat([output_df, cur_output_df], axis=1)
        run_metrics.update(cur_run_metrics)

    return output_df, run_metrics

def _get_single_run_results(
    run_info: OAIEvalRunCreationInfo,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
    pipeline to consume.

    :param run_info: The evaluation run information that contains the needed values
        to retrieve the results of the evaluation run.
    :type run_info: OAIEvalRunCreationInfo
    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
        calculated from the evaluation run.
    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
    """
    # Wait for evaluation run to complete
    run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
    if run_results.status != "completed":
        raise EvaluationException(
            message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
            + f" failed with status {run_results.status}.",
            blame=ErrorBlame.UNKNOWN,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.AOAI_GRADER,
        )
    LOGGER.info(f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
        + " completed successfully. Gathering results...")
    # Convert run results into a dictionary of metrics
    run_metrics = {}
    if run_results.per_testing_criteria_results is None:
        msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
            " occur when invalid or conflicting models are selected in the model and grader configs."
            f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
        raise EvaluationException(
            message=msg,
            blame=ErrorBlame.UNKNOWN,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.AOAI_GRADER,
        )
    for criteria_result in run_results.per_testing_criteria_results:
        grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
        passed = criteria_result.passed
        failed = criteria_result.failed
        ratio = passed / (passed + failed)
        formatted_column_name = f"{grader_name}.pass_rate"
        run_metrics[formatted_column_name] = ratio


    # Get full results and convert them into a dataframe.
    # Notes on raw full data output from OAI eval runs:
    # Each row in the full results list in itself a list.
    # Each entry corresponds to one grader's results from the criteria list
    # that was inputted to the eval group.
    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
    # The name is used to figure out which grader the entry refers to, the sample is ignored.
    # The passed and score values are then added to the results dictionary, prepended with the grader's name
    # as entered by the user in the inputted dictionary.
    # Other values, if they exist, are also added to the results dictionary.
    raw_list_results = run_info["client"].evals.runs.output_items.list(
        eval_id=run_info["eval_group_id"],
        run_id=run_info["eval_run_id"]
    )
    listed_results = {"index": []}
    # raw data has no order guarantees, we need to sort them by their
    # datasource_item_id
    for row_result in raw_list_results.data:
        # Add the datasource_item_id for later sorting
        listed_results["index"].append(row_result.datasource_item_id)
        for single_grader_row_result in row_result.results:
            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
            for name, value in single_grader_row_result.items():
                if name in ["name"]: # Todo decide if we also want to exclude "sample"
                    continue
                if name.lower() == "passed":
                    # create a `_result` column for each grader
                    result_column_name = f"outputs.{grader_name}.{grader_name}_result"
                    if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
                        if (result_column_name not in listed_results):
                            listed_results[result_column_name] = []
                        listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])

                formatted_column_name = f"outputs.{grader_name}.{name}"
                if (formatted_column_name not in listed_results):
                    listed_results[formatted_column_name] = []
                listed_results[formatted_column_name].append(value)
    output_df = pd.DataFrame(listed_results)
    # sort by index
    output_df = output_df.sort_values('index', ascending=[True])
    # remove index column
    output_df.drop(columns=["index"], inplace=True)
    return output_df, run_metrics
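Aside: a small self-contained sketch (all values invented) of how the per-criteria counts and per-row fields above become metrics and dataframe columns. The pass/fail string mapping below is an assumption standing in for EVALUATION_PASS_FAIL_MAPPING.

# Invented numbers; mirrors the metric and column naming in _get_single_run_results.
passed, failed = 8, 2
grader_name = "my_grader"  # user-supplied alias for the grader
run_metrics = {f"{grader_name}.pass_rate": passed / (passed + failed)}  # {"my_grader.pass_rate": 0.8}

# One grader's per-row result, shaped as described in the comments above:
row_result = {"name": "criterion_1", "passed": True, "score": 1.0}
pass_fail = {True: "pass", False: "fail"}  # stand-in for EVALUATION_PASS_FAIL_MAPPING (assumed)
columns = {
    f"outputs.{grader_name}.{grader_name}_result": pass_fail[row_result["passed"]],
    f"outputs.{grader_name}.passed": row_result["passed"],
    f"outputs.{grader_name}.score": row_result["score"],
}
print(run_metrics, columns)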
def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str, Any]) -> AzureOpenAIGrader:
    """
    Helper function for the remote evaluation service.
    Given a model ID that refers to a specific AOAI grader wrapper class, return an instance of that class
    using the provided initialization parameters.

    :param grader_id: The model ID that refers to a specific AOAI grader wrapper class.
    :type grader_id: str
    :param init_params: The initialization parameters to be used for the AOAI grader wrapper class.
        Requires that it contain a model_config and grader_config as top-level keys.
    :type init_params: Dict[str, Any]
    """

    model_config = init_params.get("model_config", None)
    if model_config is None:
        raise EvaluationException(
            message="Grader converter needs a valid 'model_config' key in init_params.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.AOAI_GRADER,
        )

    grader_class = _get_grader_class(grader_id)
    return grader_class(**init_params)

def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
    """
    Given a model ID, return the class of the corresponding grader wrapper.
    """

    from azure.ai.evaluation import (
        AzureOpenAIGrader,
        AzureOpenAILabelGrader,
        AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader,
    )
    id_map = {
        AzureOpenAIGrader.id: AzureOpenAIGrader,
        AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
        AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
    }

    for key in id_map.keys():
        if model_id == key:
            return id_map[key]
    raise EvaluationException(
        message=f"Model ID {model_id} not recognized as an AOAI grader ID",
        blame=ErrorBlame.USER_ERROR,
        category=ErrorCategory.INVALID_VALUE,
        target=ErrorTarget.AOAI_GRADER,
    )


def _get_graders_and_column_mappings(
    graders: Dict[str, AzureOpenAIGrader],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
    """
    Given a dictionary of column mappings and a dictionary of AOAI graders,
    Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
    that must be performed to evaluate the entire dataset.

    Currently this function is fairly naive; it always splits the data if there are multiple
    graders present and any of them have a unique column mapping.

    This odd separate of data is necessary because our system allows for different evaluators
    to have different dataset columns mapped to the same input name for each evaluator, while
    the OAI API can't. So, if if there's a possibility that such a conflict might arise,
    we need to split the incoming data up.

    Currently splits each grader into its own eval group/run to ensure they each use
    their own credentials later on. Planned fast follow is to group things by
    matching credentials later.

    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
    :type graders: Dict[str, AoaiGrader]
    :param column_mappings: The column mappings to use for the evaluation.
    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
    :return: A list of tuples, each containing dictionary of AOAI graders,
        and the column mapping they should use.
    :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
    """

    default_mapping = column_mappings.get("default", None)
    return [({name : grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
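Aside: a toy illustration of the pairing rule above, using plain strings in place of grader objects. Each grader gets its own run, paired with its own column mapping when one exists, otherwise with the "default" mapping.

# Strings stand in for AzureOpenAIGrader instances; mapping values follow the
# "${data.<column>}" / "${run.outputs.<column>}" convention handled later in this file.
graders = {"labels": "grader_a", "similarity": "grader_b"}
column_mappings = {
    "default": {"query": "${data.query}", "response": "${run.outputs.response}"},
    "labels": {"ground_truth": "${data.label}", "response": "${run.outputs.response}"},
}
default_mapping = column_mappings.get("default", None)
runs = [({name: grader}, column_mappings.get(name, default_mapping))
        for name, grader in graders.items()]
# "labels" is paired with its own mapping; "similarity" falls back to the default.
print(runs)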
def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
    """Produce a data source config that maps all columns from the supplied data source into
    the OAI API. The mapping is naive unless a column mapping is provided, in which case
    the column mapping's values overrule the relevant naive mappings

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
    :type column_mapping: Optional[Dict[str, str]]
    :return: A dictionary that can act as data source config for OAI evaluation group creation.
    :rtype: Dict[str, Any]
    """

    data_source_config = {
        "type": "custom",
        "item_schema": {
            "type": "object",
            "properties": {},
            "required": [],
        }
    }
    properties = data_source_config["item_schema"]["properties"]
    required = data_source_config["item_schema"]["required"]
    for key in column_mapping.keys():
        properties[key] = {
            "type": "string",
        }
        required.append(key)
    return data_source_config

def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
    """Produce a data source config that naively maps all columns from the supplied data source into
    the OAI API.

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :return: A dictionary that can act as data source config for OAI evaluation group creation.
    :rtype: Dict[str, Any]
    """

    properties = {}
    required = []

    for column in input_data_df.columns:
        properties[column] = {
            "type": "string",
        }
        required.append(column)
    data_source_config = {
        "type": "custom",
        "item_schema": {
            "type": "object",
            "properties": properties,
            "required": required,
        }
    }
    return data_source_config
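Aside: to make the schema shape concrete, this is roughly what the generated config looks like for an invented two-column dataset, mirroring _generate_default_data_source_config.

import pandas as pd

# Invented two-column dataset; every column becomes a required string property.
input_data_df = pd.DataFrame({"query": ["What is 2 + 2?"], "response": ["4"]})
data_source_config = {
    "type": "custom",
    "item_schema": {
        "type": "object",
        "properties": {column: {"type": "string"} for column in input_data_df.columns},
        "required": list(input_data_df.columns),
    },
}
# -> properties: {"query": {"type": "string"}, "response": {"type": "string"}},
#    required: ["query", "response"]
print(data_source_config)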
def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
    """
    Given a dataframe of data to be evaluated, and an optional column mapping,
    produce a dictionary can be used as the data source input for an OAI evaluation run.

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :param column_mapping: The column mapping to use for the evaluation. If None, a naive 1:1 mapping is used.
    :type column_mapping: Optional[Dict[str, str]]
    :return: A dictionary that can be used as the data source input for an OAI evaluation run.
    :rtype: Dict[str, Any]
    """
    content = []
    column_to_source_map = {}
    # Convert from column mapping's format to figure out actual column names in
    # input dataframe, and map those to the appropriate OAI input names.
    for name, formatted_entry in column_mapping.items():
        # From "${" from start and "}" from end before splitting.
        entry_pieces = formatted_entry[2:-1].split(".")
        if len(entry_pieces) == 2 and entry_pieces[0] == "data":
            column_to_source_map[name] = entry_pieces[1]
        elif len(entry_pieces) == 3 and entry_pieces[0] == "run" and entry_pieces[1] == "outputs":
            column_to_source_map[name] = f"__outputs.{entry_pieces[2]}"

    # Using the above mapping, transform the input dataframe into a content
    # dictionary that'll work in an OAI data source.
    for row in input_data_df.iterrows():
        row_dict = {}
        for oai_key,dataframe_key in column_to_source_map.items():
            row_dict[oai_key] = str(row[1][dataframe_key])
        content.append({"item": row_dict})

    return {
        "type": "jsonl",
        "source": {
            "type": "file_content",
            "content": content,
        }
    }
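Aside: a matching sketch (invented column names and data) of the "${...}" parsing and row packing performed by _get_data_source.

import pandas as pd

# Invented dataframe and mapping; "__outputs.response" is the preprocessed column
# name that target outputs are assumed to use.
input_data_df = pd.DataFrame({"query": ["What is 2 + 2?"], "__outputs.response": ["4"]})
column_mapping = {"query": "${data.query}", "response": "${run.outputs.response}"}

column_to_source_map = {}
for name, formatted_entry in column_mapping.items():
    pieces = formatted_entry[2:-1].split(".")  # strip "${" and "}" before splitting
    if len(pieces) == 2 and pieces[0] == "data":
        column_to_source_map[name] = pieces[1]                 # "${data.query}" -> "query"
    elif len(pieces) == 3 and pieces[0] == "run" and pieces[1] == "outputs":
        column_to_source_map[name] = f"__outputs.{pieces[2]}"  # -> "__outputs.response"

content = [
    {"item": {oai_key: str(row[df_key]) for oai_key, df_key in column_to_source_map.items()}}
    for _, row in input_data_df.iterrows()
]
print({"type": "jsonl", "source": {"type": "file_content", "content": content}})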
def _begin_eval_run(
    client: Union[OpenAI, AzureOpenAI],
    eval_group_id: str,
    run_name: str,
    input_data_df: pd.DataFrame,
    column_mapping: Dict[str, str]
) -> str:
    """
    Given an eval group id and a dataset file path, use the AOAI API to
    start an evaluation run with the given name and description.
    Returns a poller that can be used to monitor the run.

    :param client: The AOAI client to use for the evaluation.
    :type client: Union[OpenAI, AzureOpenAI]
    :param eval_group_id: The ID of the evaluation group to use for the evaluation run.
    :type eval_group_id: str
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :return: The ID of the evaluation run.
    :rtype: str
    """

    data_source = _get_data_source(input_data_df, column_mapping)
    eval_run = client.evals.runs.create(
        eval_id=eval_group_id,
        data_source=data_source,
        name=run_name,
        metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
        # TODO decide if we want to add our own timeout value?
    )
    return eval_run.id

# Post built TODO: replace with _red_team.py's retry logic?
def _wait_for_run_conclusion(
    client: Union[OpenAI, AzureOpenAI],
    eval_group_id: str,
    eval_run_id: str,
    max_wait_seconds = 21600
) -> Any:
    """
    Perform exponential backoff polling to get the results of an AOAI evaluation run.
    Raises an EvaluationException if max attempts are reached without receiving a concluding status.

    :param client: The AOAI client to use for the evaluation.
    :type client: Union[OpenAI, AzureOpenAI]
    :param eval_group_id: The ID of the evaluation group that contains the evaluation run of interest.
    :type eval_group_id: str
    :param eval_run_id: The evaluation run ID to get the results of.
    :type eval_run_id: str
    :param max_wait_seconds: The maximum amount of time to wait for the evaluation run to complete.
    :type max_wait_seconds: int
    :return: The results of the evaluation run.
    :rtype: Any
    """

    LOGGER.info(f"AOAI: Getting OAI eval run results from group/run {eval_group_id}/{eval_run_id}...")
    total_wait = 0
    iters = 0
    # start with ~51 minutes of exponential backoff
    # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
    wait_interval = 3 # Seconds.
    while(True):
        wait_interval *= 1.5
        total_wait += wait_interval
        # Reduce last wait interval if total wait time exceeds max wait time
        if total_wait > max_wait_seconds:
            wait_interval -= total_wait - max_wait_seconds
        sleep(wait_interval)
        response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
        if response.status not in ["queued", "in_progress"]:
            return response
        if total_wait > max_wait_seconds:
            raise EvaluationException(
                message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
                + f" rounds of polling. Final status was {response.status}",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.FAILED_EXECUTION,
                target=ErrorTarget.AOAI_GRADER,
            )
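Aside: a quick, network-free reproduction of the polling schedule implied by _wait_for_run_conclusion: a 3-second base interval grown 1.5x each round, with the final sleep clipped so the total never exceeds max_wait_seconds (21600 seconds, i.e. 6 hours, by default).

# Reproduces only the timing arithmetic of _wait_for_run_conclusion; no API calls are made.
max_wait_seconds = 21600  # default cap: 6 hours
total_wait = 0.0
wait_interval = 3.0       # base interval in seconds, grown 1.5x per polling round
sleeps = []
while total_wait <= max_wait_seconds:
    wait_interval *= 1.5
    total_wait += wait_interval
    if total_wait > max_wait_seconds:
        wait_interval -= total_wait - max_wait_seconds  # clip the final sleep
    sleeps.append(wait_interval)

# First sleeps are 4.5s, 6.75s, 10.125s, ...; the clipped series sums to
# max_wait_seconds, which works out to about 20 polling rounds over 6 hours.
print(len(sleeps), sum(sleeps))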