azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1286 -739
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +2 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from time import sleep
 
 from ._batch_run import CodeClient, ProxyClient
 
-#import aoai_mapping
+# import aoai_mapping
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

@@ -30,17 +30,18 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
 eval_run_id: str
 grader_name_map: Dict[str, str]
 
+
 def _split_evaluators_and_grader_configs(
-
-
+evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
+) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
 """
 Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
 dictionaries that each contain one subset, the first containing the evaluators and the second containing
 the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
-including child class instances.
+including child class instances.
 
 :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-and value as the evaluator function or AOAI grader.
+and value as the evaluator function or AOAI grader.
 :type evaluators: Dict[str, Union[Callable, ]]
 :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
 :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
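The split described in the docstring above boils down to an isinstance check against the grader base class, so subclasses such as the label or string-check graders land on the grader side. A minimal sketch under that assumption (the helper name is illustrative; `AzureOpenAIGrader` is part of the package's public exports):

    from typing import Callable, Dict, Tuple, Union

    from azure.ai.evaluation import AzureOpenAIGrader

    def split_by_type(
        evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
    ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
        """Separate plain evaluator callables from AOAI grader instances."""
        callables: Dict[str, Callable] = {}
        graders: Dict[str, AzureOpenAIGrader] = {}
        for name, value in evaluators.items():
            # isinstance also matches child classes of AzureOpenAIGrader.
            if isinstance(value, AzureOpenAIGrader):
                graders[name] = value
            else:
                callables[name] = value
        return callables, graders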
@@ -54,13 +55,14 @@ def _split_evaluators_and_grader_configs(
 true_evaluators[key] = value
 return true_evaluators, aoai_graders
 
+
 @experimental
 def _begin_aoai_evaluation(
-
-
-
-
-
+graders: Dict[str, AzureOpenAIGrader],
+column_mappings: Optional[Dict[str, Dict[str, str]]],
+data: pd.DataFrame,
+run_name: str,
+) -> List[OAIEvalRunCreationInfo]:
 """
 Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
 AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the

@@ -84,26 +86,20 @@ def _begin_aoai_evaluation
 :rtype: List[OAIEvalRunCreationInfo]
 """
 
-
 LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
 all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
 for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
-all_eval_run_info.append(
-selected_graders,
-
-selected_column_mapping,
-run_name
-))
+all_eval_run_info.append(
+_begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
+)
 
 return all_eval_run_info
 
+
 def _begin_single_aoai_evaluation(
-
-
-column_mapping: Dict[str, str],
-run_name: str
-) -> OAIEvalRunCreationInfo:
+graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+) -> OAIEvalRunCreationInfo:
 """
 Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
 AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task

@@ -121,7 +117,7 @@ def _begin_single_aoai_evaluation
 """
 
 # Format data for eval group creation
-grader_name_list
+grader_name_list = []
 grader_list = []
 # It's expected that all graders supplied for a single eval run use the same credentials
 # so grab a client from the first grader.

@@ -135,19 +131,17 @@ def _begin_single_aoai_evaluation
 # Create eval group
 # import pdb; pdb.set_trace()
 eval_group_info = client.evals.create(
-data_source_config=data_source_config,
-testing_criteria=grader_list,
-metadata={"is_foundry_eval": "true"}
+data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
 )
-
+
 LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
 # Use eval group info to map grader IDs back to user-assigned names.
 grader_name_map = {}
 num_criteria = len(eval_group_info.testing_criteria)
 if num_criteria != len(grader_name_list):
 raise EvaluationException(
-message=f"Number of testing criteria ({num_criteria})"
-
+message=f"Number of testing criteria ({num_criteria})"
++ f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
 blame=ErrorBlame.USER_ERROR,
 category=ErrorCategory.INVALID_VALUE,
 target=ErrorTarget.AOAI_GRADER,

@@ -155,21 +149,24 @@ def _begin_single_aoai_evaluation
 for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
 grader_name_map[criteria.id] = name
 
-# Create eval run
+# Create eval run
 eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
-LOGGER.info(
-
+LOGGER.info(
+f"AOAI: Eval run created with id {eval_run_id}."
++ " Results will be retrieved after normal evaluation is complete..."
+)
+
+return OAIEvalRunCreationInfo(
+client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+)
 
-return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
 
-def _get_evaluation_run_results(
-all_run_info: List[OAIEvalRunCreationInfo]
-) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
 """
 Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
 pipeline to consume. This method accepts a list of eval run information, and will combine the
 results into a single dataframe and metrics dictionary.
-
+
 :param all_run_info: A list of evaluation run information that contains the needed values
 to retrieve the results of the evaluation run.
 :type all_run_info: List[OAIEvalRunCreationInfo]
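For orientation, the create-group-then-create-run flow that the hunks above reformat reduces to two calls on the OpenAI client, using the same keyword arguments that appear in the diff. A sketch, with the grader list, data source config, data source, and run name left as caller-supplied placeholders:

    from typing import Any, Dict, List, Tuple, Union

    from openai import AzureOpenAI, OpenAI

    def start_aoai_eval(
        client: Union[OpenAI, AzureOpenAI],
        data_source_config: Dict[str, Any],
        grader_list: List[Any],
        data_source: Dict[str, Any],
        run_name: str,
    ) -> Tuple[str, str]:
        """Create an eval group, start one run against it, and return both ids."""
        eval_group = client.evals.create(
            data_source_config=data_source_config,
            testing_criteria=grader_list,
            metadata={"is_foundry_eval": "true"},
        )
        eval_run = client.evals.runs.create(
            eval_id=eval_group.id,
            data_source=data_source,
            name=run_name,
            metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
        )
        return eval_group.id, eval_run.id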
@@ -188,13 +185,14 @@ def _get_evaluation_run_results(
 
 return output_df, run_metrics
 
+
 def _get_single_run_results(
-
-
+run_info: OAIEvalRunCreationInfo,
+) -> Tuple[pd.DataFrame, Dict[str, Any]]:
 """
 Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
 pipeline to consume.
-
+
 :param run_info: The evaluation run information that contains the needed values
 to retrieve the results of the evaluation run.
 :type run_info: OAIEvalRunCreationInfo

@@ -205,28 +203,30 @@ def _get_single_run_results(
 """
 # Wait for evaluation run to complete
 run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
 if run_results.status != "completed":
 raise EvaluationException(
 message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-
++ f" failed with status {run_results.status}.",
 blame=ErrorBlame.UNKNOWN,
 category=ErrorCategory.FAILED_EXECUTION,
 target=ErrorTarget.AOAI_GRADER,
 )
-
-+ " completed successfully. Gathering results...")
+
 # Convert run results into a dictionary of metrics
 run_metrics = {}
 if run_results.per_testing_criteria_results is None:
-msg = (
-
-
+msg = (
+"AOAI evaluation run returned no results, despite 'completed' status. This might"
++ " occur when invalid or conflicting models are selected in the model and grader configs."
+f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
+)
 raise EvaluationException(
 message=msg,
 blame=ErrorBlame.UNKNOWN,
 category=ErrorCategory.FAILED_EXECUTION,
 target=ErrorTarget.AOAI_GRADER,
-)
+)
 for criteria_result in run_results.per_testing_criteria_results:
 grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
 passed = criteria_result.passed

@@ -235,7 +235,6 @@ def _get_single_run_results(
 formatted_column_name = f"{grader_name}.pass_rate"
 run_metrics[formatted_column_name] = ratio
 
-
 # Get full results and convert them into a dataframe.
 # Notes on raw full data output from OAI eval runs:
 # Each row in the full results list in itself a list.

@@ -246,36 +245,72 @@ def _get_single_run_results(
 # The passed and score values are then added to the results dictionary, prepended with the grader's name
 # as entered by the user in the inputted dictionary.
 # Other values, if they exist, are also added to the results dictionary.
-
-
-
-
+
+# Collect all results with pagination
+all_results = []
+next_cursor = None
+limit = 100 # Max allowed by API
+
+while True:
+# Build kwargs for the API call
+list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+if next_cursor is not None:
+list_kwargs["after"] = next_cursor
+
+raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+# Add current page results
+all_results.extend(raw_list_results.data)
+
+# Check for more pages
+if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+# Get the last item's ID for cursor-based pagination
+next_cursor = raw_list_results.data[-1].id
+else:
+break
+else:
+break
+
 listed_results = {"index": []}
 # raw data has no order guarantees, we need to sort them by their
 # datasource_item_id
-for row_result in
+for row_result in all_results:
 # Add the datasource_item_id for later sorting
 listed_results["index"].append(row_result.datasource_item_id)
 for single_grader_row_result in row_result.results:
 grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
 for name, value in single_grader_row_result.items():
-if name in ["name"]:
+if name in ["name"]: # Todo decide if we also want to exclude "sample"
 continue
 if name.lower() == "passed":
 # create a `_result` column for each grader
 result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-if len(result_column_name) < 50:
-if
+if len(result_column_name) < 50: # TODO: is this the limit? Should we keep "passed"?
+if result_column_name not in listed_results:
 listed_results[result_column_name] = []
 listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
 
 formatted_column_name = f"outputs.{grader_name}.{name}"
-if
+if formatted_column_name not in listed_results:
 listed_results[formatted_column_name] = []
 listed_results[formatted_column_name].append(value)
+
+# Ensure all columns have the same length as the index
+num_rows = len(listed_results["index"])
+for col_name in list(listed_results.keys()):
+if col_name != "index":
+col_length = len(listed_results[col_name])
+if col_length < num_rows:
+# Pad with None values
+listed_results[col_name].extend([None] * (num_rows - col_length))
+elif col_length > num_rows:
+# This shouldn't happen, but truncate if it does
+listed_results[col_name] = listed_results[col_name][:num_rows]
+
 output_df = pd.DataFrame(listed_results)
 # sort by index
-output_df = output_df.sort_values(
+output_df = output_df.sort_values("index", ascending=[True])
 # remove index column
 output_df.drop(columns=["index"], inplace=True)
 return output_df, run_metrics
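The new pagination block above is self-contained enough to read as a loop of its own. Here is a sketch of the same cursor handling, assuming an OpenAI client plus the eval group and run ids from the run info:

    def fetch_all_output_items(client, eval_group_id: str, eval_run_id: str, page_size: int = 100) -> list:
        """Page through evals.runs.output_items.list, using the last item id as the `after` cursor."""
        items = []
        cursor = None
        while True:
            kwargs = {"eval_id": eval_group_id, "run_id": eval_run_id, "limit": page_size}
            if cursor is not None:
                kwargs["after"] = cursor
            page = client.evals.runs.output_items.list(**kwargs)
            items.extend(page.data)
            # Stop once the service reports no further pages, or an empty page comes back.
            if getattr(page, "has_more", False) and page.data:
                cursor = page.data[-1].id
            else:
                break
        return items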
@@ -303,9 +338,10 @@ def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str,
 target=ErrorTarget.AOAI_GRADER,
 )
 
-grader_class =
+grader_class = _get_grader_class(grader_id)
 return grader_class(**init_params)
 
+
 def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 """
 Given a model ID, return the class of the corresponding grader wrapper.

@@ -316,12 +352,15 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 AzureOpenAILabelGrader,
 AzureOpenAIStringCheckGrader,
 AzureOpenAITextSimilarityGrader,
+AzureOpenAIScoreModelGrader,
 )
+
 id_map = {
 AzureOpenAIGrader.id: AzureOpenAIGrader,
 AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
 AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
 AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
 }
 
 for key in id_map.keys():
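The lookup above is a plain id-to-class registry that 1.9.0 extends with the new score-model grader. A compact sketch of the same idea, assuming the grader classes are importable from the package's public namespace and each exposes the `id` attribute used in the diff:

    from azure.ai.evaluation import (
        AzureOpenAIGrader,
        AzureOpenAILabelGrader,
        AzureOpenAIScoreModelGrader,
        AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader,
    )

    # Build the registry from each wrapper's registered id.
    ID_MAP = {
        cls.id: cls
        for cls in (
            AzureOpenAIGrader,
            AzureOpenAILabelGrader,
            AzureOpenAIStringCheckGrader,
            AzureOpenAITextSimilarityGrader,
            AzureOpenAIScoreModelGrader,
        )
    }

    def resolve_grader_class(model_id: str) -> type:
        # The SDK raises an EvaluationException for unknown ids; a KeyError stands in here.
        return ID_MAP[model_id]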
@@ -336,9 +375,9 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 
 
 def _get_graders_and_column_mappings(
-
-
-
+graders: Dict[str, AzureOpenAIGrader],
+column_mappings: Optional[Dict[str, Dict[str, str]]],
+) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
 """
 Given a dictionary of column mappings and a dictionary of AOAI graders,
 Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run

@@ -366,20 +405,21 @@ def _get_graders_and_column_mappings(
 """
 
 default_mapping = column_mappings.get("default", None)
-return [({name
+return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
 """Produce a data source config that maps all columns from the supplied data source into
 the OAI API. The mapping is naive unless a column mapping is provided, in which case
 the column mapping's values overrule the relevant naive mappings
-
+
 :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
 helper function.
 :type input_data_df: pd.DataFrame
 :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
 :type column_mapping: Optional[Dict[str, str]]
 :return: A dictionary that can act as data source config for OAI evaluation group creation.
-:rtype: Dict[str, Any]
+:rtype: Dict[str, Any]
 """
 
 data_source_config = {
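The one-line return above pairs each grader with its own column mapping, falling back to the "default" entry when a grader has none. A small worked example with stand-in grader objects and illustrative mapping values:

    grader_a, grader_b = object(), object()  # stand-ins for AzureOpenAIGrader instances
    graders = {"relevance_check": grader_a, "format_check": grader_b}
    column_mappings = {
        "default": {"query": "${data.query}"},
        "format_check": {"response": "${data.response}"},
    }

    default_mapping = column_mappings.get("default", None)
    runs = [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
    # runs[0] == ({"relevance_check": grader_a}, {"query": "${data.query}"})      <- falls back to "default"
    # runs[1] == ({"format_check": grader_b}, {"response": "${data.response}"})   <- uses its own mapping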
@@ -388,7 +428,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
 "type": "object",
 "properties": {},
 "required": [],
-}
+},
 }
 properties = data_source_config["item_schema"]["properties"]
 required = data_source_config["item_schema"]["required"]

@@ -399,10 +439,11 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
 required.append(key)
 return data_source_config
 
+
 def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
 """Produce a data source config that naively maps all columns from the supplied data source into
 the OAI API.
-
+
 :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
 helper function.
 :type input_data_df: pd.DataFrame
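Reading the two schema hunks together, the generated config carries an `item_schema` with one property per mapped column, and every column is marked required. A sketch of how that schema gets filled in; the string-typed properties and the column names are assumptions, since only the empty `properties`/`required` skeleton and the `required.append(key)` call are visible here:

    item_schema = {
        "type": "object",
        "properties": {},
        "required": [],
    }

    # One entry per column the graders will reference; string typing is an assumption.
    for key in ("query", "response"):
        item_schema["properties"][key] = {"type": "string"}
        item_schema["required"].append(key)

    # item_schema now looks like:
    # {"type": "object",
    #  "properties": {"query": {"type": "string"}, "response": {"type": "string"}},
    #  "required": ["query", "response"]}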
@@ -424,10 +465,11 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
 "type": "object",
 "properties": properties,
 "required": required,
-}
+},
 }
 return data_source_config
 
+
 def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
 """
 Given a dataframe of data to be evaluated, and an optional column mapping,

@@ -457,7 +499,7 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
 # dictionary that'll work in an OAI data source.
 for row in input_data_df.iterrows():
 row_dict = {}
-for oai_key,dataframe_key in column_to_source_map.items():
+for oai_key, dataframe_key in column_to_source_map.items():
 row_dict[oai_key] = str(row[1][dataframe_key])
 content.append({"item": row_dict})
 

@@ -466,20 +508,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
 "source": {
 "type": "file_content",
 "content": content,
-}
+},
 }
 
+
 def _begin_eval_run(
-
-
-
-
-
-
+client: Union[OpenAI, AzureOpenAI],
+eval_group_id: str,
+run_name: str,
+input_data_df: pd.DataFrame,
+column_mapping: Dict[str, str],
+) -> str:
 """
-Given an eval group id and a dataset file path, use the AOAI API to
+Given an eval group id and a dataset file path, use the AOAI API to
 start an evaluation run with the given name and description.
-Returns a poller that can be used to monitor the run.
+Returns a poller that can be used to monitor the run.
 
 :param client: The AOAI client to use for the evaluation.
 :type client: Union[OpenAI, AzureOpenAI]
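The `file_content` payload assembled by `_get_data_source` is just a list of `{"item": {...}}` rows with stringified values. A sketch with a two-row dataframe and an identity column mapping; the column names are illustrative, and the outer `"type"` of the data source is not shown in these hunks, so it is marked as an assumption:

    import pandas as pd

    df = pd.DataFrame({"query": ["What is BLEU?", "What is GLEU?"], "response": ["A metric.", "Another metric."]})
    column_to_source_map = {"query": "query", "response": "response"}

    content = []
    for _, row in df.iterrows():
        # Values are passed through str(), matching the diff above.
        content.append({"item": {oai_key: str(row[df_key]) for oai_key, df_key in column_to_source_map.items()}})

    data_source = {
        "type": "jsonl",  # assumed outer type; only the inner "source" block appears in the hunk
        "source": {"type": "file_content", "content": content},
    }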
@@ -499,18 +542,16 @@ def _begin_eval_run(
 eval_id=eval_group_id,
 data_source=data_source,
 name=run_name,
-metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
+metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
 # TODO decide if we want to add our own timeout value?
 )
 return eval_run.id
 
+
 # Post built TODO: replace with _red_team.py's retry logic?
 def _wait_for_run_conclusion(
-
-
-eval_run_id: str,
-max_wait_seconds = 21600
-) -> Any:
+client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
+) -> Any:
 """
 Perform exponential backoff polling to get the results of an AOAI evaluation run.
 Raises an EvaluationException if max attempts are reached without receiving a concluding status.

@@ -532,8 +573,8 @@ def _wait_for_run_conclusion(
 iters = 0
 # start with ~51 minutes of exponential backoff
 # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
-wait_interval = 3
-while
+wait_interval = 3 # Seconds.
+while True:
 wait_interval *= 1.5
 total_wait += wait_interval
 # Reduce last wait interval if total wait time exceeds max wait time

@@ -541,13 +582,13 @@ def _wait_for_run_conclusion(
 wait_interval -= total_wait - max_wait_seconds
 sleep(wait_interval)
 response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
-if response.status not in
+if response.status not in ["queued", "in_progress"]:
 return response
 if total_wait > max_wait_seconds:
 raise EvaluationException(
 message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
-
++ f" rounds of polling. Final status was {response.status}",
 blame=ErrorBlame.USER_ERROR,
 category=ErrorCategory.FAILED_EXECUTION,
 target=ErrorTarget.AOAI_GRADER,
-)
+)
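The polling helper above sleeps on a growing interval (starting near 4.5 seconds and multiplying by 1.5 each round) until the run leaves the queue or a roughly six-hour budget is spent. A sketch of the same loop; the timeout exception type is a stand-in for the SDK's EvaluationException:

    from time import sleep

    def wait_for_conclusion(client, eval_group_id: str, eval_run_id: str, max_wait_seconds: float = 21600):
        """Poll evals.runs.retrieve with exponential backoff until a concluding status is seen."""
        total_wait = 0.0
        wait_interval = 3.0
        while True:
            wait_interval *= 1.5
            total_wait += wait_interval
            # Trim the final sleep so the overall budget is not exceeded.
            if total_wait > max_wait_seconds:
                wait_interval -= total_wait - max_wait_seconds
            sleep(wait_interval)
            response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
            if response.status not in ["queued", "in_progress"]:
                return response
            if total_wait > max_wait_seconds:
                raise TimeoutError(f"Run still {response.status} after ~{max_wait_seconds}s of polling")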
@@ -17,7 +17,6 @@ from typing_extensions import ParamSpec
 
 from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
-from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)

@@ -13,6 +13,9 @@ import base64
 import math
 
 import pandas as pd
+from tqdm import tqdm
+
+from azure.core.pipeline.policies import UserAgentPolicy
 from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (

@@ -24,6 +27,7 @@ from azure.ai.evaluation._constants import (
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._user_agent import UserAgentSingleton
 from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)

@@ -127,6 +131,7 @@ def process_message_content(content, images_folder_path):
 f.write(image_data_binary)
 return None
 
+
 def _log_metrics_and_instance_results_onedp(
 metrics: Dict[str, Any],
 instance_results: pd.DataFrame,

@@ -146,7 +151,8 @@ def _log_metrics_and_instance_results_onedp(
 )
 client = EvaluationServiceOneDPClient(
 endpoint=project_url,
-credential=credentials
+credential=credentials,
+user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
 )
 
 # Massaging before artifacts are put on disk
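The new `user_agent_policy` argument above threads the SDK's user-agent string into the service client via azure-core's standard policy object. A hedged sketch of the pattern; the base string is illustrative, and `UserAgentSingleton` is an internal helper of this package:

    from azure.core.pipeline.policies import UserAgentPolicy

    # Clients built on azure-core pipelines typically accept a user_agent_policy keyword.
    policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.9.0")
    # client = EvaluationServiceOneDPClient(endpoint=..., credential=..., user_agent_policy=policy)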
@@ -175,13 +181,11 @@ def _log_metrics_and_instance_results_onedp(
 EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
 EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-}
+}
 properties.update(_convert_name_map_into_property_entries(name_map))
 
 create_evaluation_result_response = client.create_evaluation_result(
-name=uuid.uuid4(),
-path=tmpdir,
-metrics=metrics
+name=uuid.uuid4(), path=tmpdir, metrics=metrics
 )
 
 upload_run_response = client.start_evaluation_run(

@@ -196,14 +200,15 @@ def _log_metrics_and_instance_results_onedp(
 display_name=evaluation_name,
 status="Completed",
 outputs={
-
+"evaluationResultId": create_evaluation_result_response.id,
 },
 properties=properties,
-)
+),
 )
 
 return update_run_response.properties.get("AiStudioEvaluationUri")
 
+
 def _log_metrics_and_instance_results(
 metrics: Dict[str, Any],
 instance_results: pd.DataFrame,

@@ -266,11 +271,11 @@ def _log_metrics_and_instance_results(
 # We are doing that only for the pure evaluation runs.
 if run is None:
 properties = {
-
-
-
-
-
+EvaluationRunProperties.RUN_TYPE: "eval_run",
+EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+"_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+}
 properties.update(_convert_name_map_into_property_entries(name_map))
 ev_run.write_properties_to_run_history(properties=properties)
 else:

@@ -321,7 +326,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
 with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
 json.dump(data_dict, f, ensure_ascii=False)
 
-
+# Use tqdm.write to print message without interfering with any current progress bar
+tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
 def _apply_column_mapping(
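The switch to `tqdm.write` above matters when a progress bar is active: a bare `print` breaks the bar's redraw, while `tqdm.write` prints above it. A tiny self-contained illustration:

    from time import sleep

    from tqdm import tqdm

    for i in tqdm(range(3), desc="evaluating"):
        sleep(0.1)
        # Prints above the live bar instead of corrupting it, unlike a bare print().
        tqdm.write(f"finished batch {i}")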
@@ -407,9 +413,11 @@ def set_event_loop_policy() -> None:
 # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
 asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+
 # textwrap.wrap tries to do fancy nonsense that we don't want
 def _wrap(s, w):
-return [s[i:i + w] for i in range(0, len(s), w)]
+return [s[i : i + w] for i in range(0, len(s), w)]
+
 
 def _convert_name_map_into_property_entries(
 name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
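The `_wrap` helper above is deliberate fixed-width slicing: unlike `textwrap.wrap`, it never collapses whitespace or rebalances words, so the chunks re-join into the original string. A quick illustration of the same expression:

    def wrap(s: str, w: int) -> list:
        # Plain fixed-width slicing; the last chunk may be shorter than w.
        return [s[i : i + w] for i in range(0, len(s), w)]

    assert wrap("abcdefgh", 3) == ["abc", "def", "gh"]
    assert "".join(wrap("abcdefgh", 3)) == "abcdefgh"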
@@ -433,7 +441,7 @@ def _convert_name_map_into_property_entries(
 num_segments = math.ceil(len(name_map_string) / segment_length)
 # Property map is somehow still too long to encode within the space
 # we allow, so give up, but make sure the service knows we gave up
-if
+if num_segments > max_segments:
 return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
 
 result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}

@@ -443,6 +451,7 @@ def _convert_name_map_into_property_entries(
 result[segment_key] = segments_list[i]
 return result
 
+
 class JSONLDataFileLoader:
 def __init__(self, filename: Union[os.PathLike, str]):
 self.filename = filename
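Taken together, the two hunks above describe how the evaluator name map is serialized, cut into at most ten 950-character segments, and stored as run properties, with a sentinel of -1 when it still will not fit. A sketch under stated assumptions: the serialization via `json.dumps` and the property key names are illustrative, since neither is visible in these hunks:

    import json
    import math

    def to_property_entries(name_map: dict, segment_length: int = 950, max_segments: int = 10) -> dict:
        """Split a serialized name map into fixed-size property segments."""
        name_map_string = json.dumps(name_map)  # serialization method assumed
        num_segments = math.ceil(len(name_map_string) / segment_length)
        if num_segments > max_segments:
            # Still too long to encode, so record that the map was dropped.
            return {"name_map_length": -1}
        result = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = name_map_string[i * segment_length : (i + 1) * segment_length]
        return result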
@@ -34,15 +34,15 @@ class BleuScoreEvaluator(EvaluatorBase):
 :language: python
 :dedent: 8
 :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
-
+
 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START bleu_score_evaluator]
 :end-before: [END bleu_score_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+:caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
 .. admonition:: Example with Threshold:
|