azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from time import sleep

  from ._batch_run import CodeClient, ProxyClient

- #import aoai_mapping
+ # import aoai_mapping
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -30,17 +30,18 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
  eval_run_id: str
  grader_name_map: Dict[str, str]

+
  def _split_evaluators_and_grader_configs(
-
-
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
+ ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
  """
  Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
  dictionaries that each contain one subset, the first containing the evaluators and the second containing
  the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
- including child class instances.
+ including child class instances.

  :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
- and value as the evaluator function or AOAI grader.
+ and value as the evaluator function or AOAI grader.
  :type evaluators: Dict[str, Union[Callable, ]]
  :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
  :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
@@ -54,13 +55,14 @@ def _split_evaluators_and_grader_configs(
  true_evaluators[key] = value
  return true_evaluators, aoai_graders

+
  @experimental
  def _begin_aoai_evaluation(
-
-
-
-
-
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ data: pd.DataFrame,
+ run_name: str,
+ ) -> List[OAIEvalRunCreationInfo]:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
@@ -84,26 +86,20 @@ def _begin_aoai_evaluation(
  :rtype: List[OAIEvalRunCreationInfo]
  """

-
  LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
  all_eval_run_info: List[OAIEvalRunCreationInfo] = []

  for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
- all_eval_run_info.append(
- selected_graders,
-
- selected_column_mapping,
- run_name
- ))
+ all_eval_run_info.append(
+ _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
+ )

  return all_eval_run_info

+
  def _begin_single_aoai_evaluation(
-
-
- column_mapping: Dict[str, str],
- run_name: str
- ) -> OAIEvalRunCreationInfo:
+ graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+ ) -> OAIEvalRunCreationInfo:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
@@ -121,7 +117,7 @@ def _begin_single_aoai_evaluation(
  """

  # Format data for eval group creation
- grader_name_list
+ grader_name_list = []
  grader_list = []
  # It's expected that all graders supplied for a single eval run use the same credentials
  # so grab a client from the first grader.
@@ -135,19 +131,17 @@ def _begin_single_aoai_evaluation(
  # Create eval group
  # import pdb; pdb.set_trace()
  eval_group_info = client.evals.create(
- data_source_config=data_source_config,
- testing_criteria=grader_list,
- metadata={"is_foundry_eval": "true"}
+ data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
  )
-
+
  LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
  # Use eval group info to map grader IDs back to user-assigned names.
  grader_name_map = {}
  num_criteria = len(eval_group_info.testing_criteria)
  if num_criteria != len(grader_name_list):
  raise EvaluationException(
- message=f"Number of testing criteria ({num_criteria})"
-
+ message=f"Number of testing criteria ({num_criteria})"
+ + f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.AOAI_GRADER,
@@ -155,21 +149,24 @@ def _begin_single_aoai_evaluation(
  for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
  grader_name_map[criteria.id] = name

- # Create eval run
+ # Create eval run
  eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
- LOGGER.info(
-
+ LOGGER.info(
+ f"AOAI: Eval run created with id {eval_run_id}."
+ + " Results will be retrieved after normal evaluation is complete..."
+ )
+
+ return OAIEvalRunCreationInfo(
+ client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+ )

- return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)

- def _get_evaluation_run_results(
- all_run_info: List[OAIEvalRunCreationInfo]
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume. This method accepts a list of eval run information, and will combine the
  results into a single dataframe and metrics dictionary.
-
+
  :param all_run_info: A list of evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,13 +185,14 @@ def _get_evaluation_run_results(

  return output_df, run_metrics

+
  def _get_single_run_results(
-
-
+ run_info: OAIEvalRunCreationInfo,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume.
-
+
  :param run_info: The evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type run_info: OAIEvalRunCreationInfo
@@ -205,28 +203,30 @@ def _get_single_run_results(
  """
  # Wait for evaluation run to complete
  run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
  if run_results.status != "completed":
  raise EvaluationException(
  message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-
+ + f" failed with status {run_results.status}.",
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
  )
-
- + " completed successfully. Gathering results...")
+
  # Convert run results into a dictionary of metrics
  run_metrics = {}
  if run_results.per_testing_criteria_results is None:
- msg = (
-
-
+ msg = (
+ "AOAI evaluation run returned no results, despite 'completed' status. This might"
+ + " occur when invalid or conflicting models are selected in the model and grader configs."
+ f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
+ )
  raise EvaluationException(
  message=msg,
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
  for criteria_result in run_results.per_testing_criteria_results:
  grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
  passed = criteria_result.passed
@@ -235,7 +235,6 @@ def _get_single_run_results(
  formatted_column_name = f"{grader_name}.pass_rate"
  run_metrics[formatted_column_name] = ratio

-
  # Get full results and convert them into a dataframe.
  # Notes on raw full data output from OAI eval runs:
  # Each row in the full results list in itself a list.
@@ -246,36 +245,72 @@ def _get_single_run_results(
  # The passed and score values are then added to the results dictionary, prepended with the grader's name
  # as entered by the user in the inputted dictionary.
  # Other values, if they exist, are also added to the results dictionary.
-
-
-
-
+
+ # Collect all results with pagination
+ all_results = []
+ next_cursor = None
+ limit = 100 # Max allowed by API
+
+ while True:
+ # Build kwargs for the API call
+ list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+ if next_cursor is not None:
+ list_kwargs["after"] = next_cursor
+
+ raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+ # Add current page results
+ all_results.extend(raw_list_results.data)
+
+ # Check for more pages
+ if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+ if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+ # Get the last item's ID for cursor-based pagination
+ next_cursor = raw_list_results.data[-1].id
+ else:
+ break
+ else:
+ break
+
  listed_results = {"index": []}
  # raw data has no order guarantees, we need to sort them by their
  # datasource_item_id
- for row_result in
+ for row_result in all_results:
  # Add the datasource_item_id for later sorting
  listed_results["index"].append(row_result.datasource_item_id)
  for single_grader_row_result in row_result.results:
  grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
  for name, value in single_grader_row_result.items():
- if name in ["name"]:
+ if name in ["name"]: # Todo decide if we also want to exclude "sample"
  continue
  if name.lower() == "passed":
  # create a `_result` column for each grader
  result_column_name = f"outputs.{grader_name}.{grader_name}_result"
- if len(result_column_name) < 50:
- if
+ if len(result_column_name) < 50: # TODO: is this the limit? Should we keep "passed"?
+ if result_column_name not in listed_results:
  listed_results[result_column_name] = []
  listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])

  formatted_column_name = f"outputs.{grader_name}.{name}"
- if
+ if formatted_column_name not in listed_results:
  listed_results[formatted_column_name] = []
  listed_results[formatted_column_name].append(value)
+
+ # Ensure all columns have the same length as the index
+ num_rows = len(listed_results["index"])
+ for col_name in list(listed_results.keys()):
+ if col_name != "index":
+ col_length = len(listed_results[col_name])
+ if col_length < num_rows:
+ # Pad with None values
+ listed_results[col_name].extend([None] * (num_rows - col_length))
+ elif col_length > num_rows:
+ # This shouldn't happen, but truncate if it does
+ listed_results[col_name] = listed_results[col_name][:num_rows]
+
  output_df = pd.DataFrame(listed_results)
  # sort by index
- output_df = output_df.sort_values(
+ output_df = output_df.sort_values("index", ascending=[True])
  # remove index column
  output_df.drop(columns=["index"], inplace=True)
  return output_df, run_metrics
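The result-collection loop added in the hunk above pages through `client.evals.runs.output_items.list` with a cursor until `has_more` is false. Below is a minimal standalone sketch of that cursor-pagination pattern; `fetch_page` and its `data`/`has_more` attributes are stand-ins for the OpenAI client call used in the diff, not part of this SDK.

```python
from typing import Any, Callable, Dict, List, Optional

def collect_all_pages(fetch_page: Callable[..., Any], limit: int = 100) -> List[Any]:
    """Drain a cursor-paginated listing API, mirroring the loop in the diff above.

    `fetch_page` stands in for `client.evals.runs.output_items.list`; each page is
    assumed to expose `.data` (a list of items, each with an `.id`) and `.has_more`.
    """
    items: List[Any] = []
    cursor: Optional[str] = None
    while True:
        kwargs: Dict[str, Any] = {"limit": limit}
        if cursor is not None:
            kwargs["after"] = cursor  # resume after the last item already seen
        page = fetch_page(**kwargs)
        items.extend(page.data)
        if getattr(page, "has_more", False) and page.data:
            cursor = page.data[-1].id  # cursor-based pagination: last item's id
        else:
            return items
```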
@@ -303,9 +338,10 @@ def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str,
  target=ErrorTarget.AOAI_GRADER,
  )

- grader_class =
+ grader_class = _get_grader_class(grader_id)
  return grader_class(**init_params)

+
  def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  """
  Given a model ID, return the class of the corresponding grader wrapper.
@@ -316,12 +352,17 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader,
  )
+
  id_map = {
  AzureOpenAIGrader.id: AzureOpenAIGrader,
  AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
  }

  for key in id_map.keys():
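The hunk above registers the two new grader wrappers, `AzureOpenAIScoreModelGrader` and `AzureOpenAIPythonGrader`, in the id-to-class map that `_get_grader_class` consults. The sketch below illustrates that dispatch idea; the class names and id strings in it are placeholders, not the SDK's real identifiers.

```python
from typing import Any, Dict, Type

class GraderBase:
    """Placeholder for AzureOpenAIGrader: each wrapper exposes a class-level `id`."""
    id = "example://graders/base"

    def __init__(self, **init_params: Any) -> None:
        self.init_params = init_params

class ScoreModelGrader(GraderBase):
    id = "example://graders/score_model"

# Registry keyed by each wrapper's `id`, mirroring the id_map in _get_grader_class.
REGISTRY: Dict[str, Type[GraderBase]] = {cls.id: cls for cls in (GraderBase, ScoreModelGrader)}

def resolve_grader(grader_id: str, init_params: Dict[str, Any]) -> GraderBase:
    """Look up the wrapper class for an id and instantiate it with its init params."""
    try:
        grader_cls = REGISTRY[grader_id]
    except KeyError:
        raise ValueError(f"Unknown grader id: {grader_id}")
    return grader_cls(**init_params)
```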
@@ -336,9 +377,9 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:


  def _get_graders_and_column_mappings(
-
-
-
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
  """
  Given a dictionary of column mappings and a dictionary of AOAI graders,
  Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
@@ -366,20 +407,21 @@ def _get_graders_and_column_mappings(
  """

  default_mapping = column_mappings.get("default", None)
- return [({name
+ return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+

  def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """Produce a data source config that maps all columns from the supplied data source into
  the OAI API. The mapping is naive unless a column mapping is provided, in which case
  the column mapping's values overrule the relevant naive mappings
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
  :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
  :type column_mapping: Optional[Dict[str, str]]
  :return: A dictionary that can act as data source config for OAI evaluation group creation.
- :rtype: Dict[str, Any]
+ :rtype: Dict[str, Any]
  """

  data_source_config = {
@@ -388,7 +430,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  "type": "object",
  "properties": {},
  "required": [],
- }
+ },
  }
  properties = data_source_config["item_schema"]["properties"]
  required = data_source_config["item_schema"]["required"]
@@ -399,10 +441,11 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  required.append(key)
  return data_source_config

+
  def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
  """Produce a data source config that naively maps all columns from the supplied data source into
  the OAI API.
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
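For reference, the helpers above assemble a JSON-schema-style `item_schema`, and the trailing-comma fixes simply nest it inside the outer config dict. Below is a hedged sketch of the shape such a config might take for a dataset with `query` and `response` columns; the outer `"type"` key and the per-column value schemas are assumptions for illustration, since only the `item_schema` skeleton is visible in these hunks.

```python
# Hypothetical result of _generate_data_source_config for two columns.
data_source_config = {
    "type": "custom",  # assumption: the outer wrapper is not shown in these hunks
    "item_schema": {
        "type": "object",
        "properties": {
            "query": {"type": "string"},     # assumed per-column schema
            "response": {"type": "string"},  # assumed per-column schema
        },
        "required": ["query", "response"],
    },
}
```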
@@ -424,10 +467,11 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
  "type": "object",
  "properties": properties,
  "required": required,
- }
+ },
  }
  return data_source_config

+
  def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """
  Given a dataframe of data to be evaluated, and an optional column mapping,
@@ -457,7 +501,7 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  # dictionary that'll work in an OAI data source.
  for row in input_data_df.iterrows():
  row_dict = {}
- for oai_key,dataframe_key in column_to_source_map.items():
+ for oai_key, dataframe_key in column_to_source_map.items():
  row_dict[oai_key] = str(row[1][dataframe_key])
  content.append({"item": row_dict})

@@ -466,20 +510,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  "source": {
  "type": "file_content",
  "content": content,
- }
+ },
  }

+
  def _begin_eval_run(
-
-
-
-
-
-
+ client: Union[OpenAI, AzureOpenAI],
+ eval_group_id: str,
+ run_name: str,
+ input_data_df: pd.DataFrame,
+ column_mapping: Dict[str, str],
+ ) -> str:
  """
- Given an eval group id and a dataset file path, use the AOAI API to
+ Given an eval group id and a dataset file path, use the AOAI API to
  start an evaluation run with the given name and description.
- Returns a poller that can be used to monitor the run.
+ Returns a poller that can be used to monitor the run.

  :param client: The AOAI client to use for the evaluation.
  :type client: Union[OpenAI, AzureOpenAI]
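`_get_data_source`, shown above, serializes each dataframe row into an `{"item": {...}}` entry under a `file_content` source, with every value stringified. A hedged sketch of the resulting payload for two rows follows; the outer `"type"` key and the column names are illustrative assumptions.

```python
# Hypothetical payload produced by _get_data_source for a two-row dataframe.
data_source = {
    "type": "jsonl",  # assumption: not visible in this hunk
    "source": {
        "type": "file_content",
        "content": [
            {"item": {"query": "What is the capital of France?", "response": "Paris."}},
            {"item": {"query": "What is 2 + 2?", "response": "4"}},
        ],
    },
}
```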
@@ -499,18 +544,16 @@ def _begin_eval_run(
  eval_id=eval_group_id,
  data_source=data_source,
  name=run_name,
- metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
+ metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
  # TODO decide if we want to add our own timeout value?
  )
  return eval_run.id

+
  # Post built TODO: replace with _red_team.py's retry logic?
  def _wait_for_run_conclusion(
-
-
- eval_run_id: str,
- max_wait_seconds = 21600
- ) -> Any:
+ client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
+ ) -> Any:
  """
  Perform exponential backoff polling to get the results of an AOAI evaluation run.
  Raises an EvaluationException if max attempts are reached without receiving a concluding status.
@@ -532,8 +575,8 @@ def _wait_for_run_conclusion(
  iters = 0
  # start with ~51 minutes of exponential backoff
  # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
- wait_interval = 3
- while
+ wait_interval = 3 # Seconds.
+ while True:
  wait_interval *= 1.5
  total_wait += wait_interval
  # Reduce last wait interval if total wait time exceeds max wait time
@@ -541,13 +584,13 @@ def _wait_for_run_conclusion(
  wait_interval -= total_wait - max_wait_seconds
  sleep(wait_interval)
  response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
- if response.status not in
+ if response.status not in ["queued", "in_progress"]:
  return response
  if total_wait > max_wait_seconds:
  raise EvaluationException(
  message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
-
+ + f" rounds of polling. Final status was {response.status}",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
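`_wait_for_run_conclusion`, as reconstructed above, sleeps with an interval that grows by 1.5x per round and is trimmed so the total never exceeds `max_wait_seconds`. Here is a minimal standalone sketch of that backoff loop, with a generic `get_status` callable standing in for `client.evals.runs.retrieve`:

```python
import time
from typing import Callable

def poll_until_concluded(get_status: Callable[[], str], max_wait_seconds: float = 21600.0) -> str:
    """Exponential-backoff polling, mirroring _wait_for_run_conclusion above."""
    total_wait = 0.0
    wait_interval = 3.0  # seconds; grows by 1.5x each round
    rounds = 0
    while True:
        rounds += 1
        wait_interval *= 1.5
        total_wait += wait_interval
        # Trim the final sleep so the overall budget is not exceeded.
        if total_wait > max_wait_seconds:
            wait_interval -= total_wait - max_wait_seconds
        time.sleep(wait_interval)
        status = get_status()
        if status not in ("queued", "in_progress"):
            return status
        if total_wait > max_wait_seconds:
            raise TimeoutError(f"Timed out after {rounds} rounds; final status was {status}")
```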
@@ -17,7 +17,6 @@ from typing_extensions import ParamSpec

  from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

- from ..._user_agent import USER_AGENT
  from .._utils import _trace_destination_from_project_scope

  LOGGER = logging.getLogger(__name__)
@@ -13,6 +13,9 @@ import base64
  import math

  import pandas as pd
+ from tqdm import tqdm
+
+ from azure.core.pipeline.policies import UserAgentPolicy
  from azure.ai.evaluation._legacy._adapters.entities import Run

  from azure.ai.evaluation._constants import (
@@ -24,6 +27,7 @@ from azure.ai.evaluation._constants import (
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
  from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._azure._clients import LiteMLClient

  LOGGER = logging.getLogger(__name__)
@@ -127,6 +131,7 @@ def process_message_content(content, images_folder_path):
  f.write(image_data_binary)
  return None

+
  def _log_metrics_and_instance_results_onedp(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -146,7 +151,8 @@ def _log_metrics_and_instance_results_onedp(
  )
  client = EvaluationServiceOneDPClient(
  endpoint=project_url,
- credential=credentials
+ credential=credentials,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
  )

  # Massaging before artifacts are put on disk
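The client construction above now routes the SDK's composed user-agent string through azure-core's `UserAgentPolicy`. A small sketch of the same pattern; the literal string stands in for `UserAgentSingleton().value`:

```python
from azure.core.pipeline.policies import UserAgentPolicy

# Build the policy the way the diff does, with a literal stand-in for the
# SDK's composed user-agent value.
policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.10.0")
print(policy.user_agent)  # the User-Agent value the pipeline will send

# The policy is then handed to the service client, e.g.
# EvaluationServiceOneDPClient(..., user_agent_policy=policy) as in the hunk above.
```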
@@ -172,21 +178,19 @@

  properties = {
  EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
  EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))

  create_evaluation_result_response = client.create_evaluation_result(
- name=uuid.uuid4(),
- path=tmpdir,
- metrics=metrics
+ name=uuid.uuid4(), path=tmpdir, metrics=metrics
  )

  upload_run_response = client.start_evaluation_run(
  evaluation=EvaluationUpload(
  display_name=evaluation_name,
+ properties=properties,
  )
  )

@@ -196,14 +200,14 @@
  display_name=evaluation_name,
  status="Completed",
  outputs={
-
+ "evaluationResultId": create_evaluation_result_response.id,
  },
-
- )
+ ),
  )

  return update_run_response.properties.get("AiStudioEvaluationUri")

+
  def _log_metrics_and_instance_results(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -266,11 +270,11 @@
  # We are doing that only for the pure evaluation runs.
  if run is None:
  properties = {
-
-
-
-
-
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+ "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
  ev_run.write_properties_to_run_history(properties=properties)
  else:
@@ -321,7 +325,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
  json.dump(data_dict, f, ensure_ascii=False)

-
+ # Use tqdm.write to print message without interfering with any current progress bar
+ tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')


  def _apply_column_mapping(
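`_write_output` now reports the saved path with `tqdm.write`, which prints above any active progress bar instead of breaking it. A small illustration of the difference:

```python
from time import sleep
from tqdm import tqdm

for i in tqdm(range(3), desc="evaluating"):
    sleep(0.1)
    # tqdm.write prints without corrupting the progress bar; a bare print() would.
    tqdm.write(f"finished row {i}")
```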
@@ -407,9 +412,11 @@ def set_event_loop_policy() -> None:
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]

+
  # textwrap.wrap tries to do fancy nonsense that we don't want
  def _wrap(s, w):
- return [s[i:i + w] for i in range(0, len(s), w)]
+ return [s[i : i + w] for i in range(0, len(s), w)]
+

  def _convert_name_map_into_property_entries(
  name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
@@ -433,7 +440,7 @@ def _convert_name_map_into_property_entries(
  num_segments = math.ceil(len(name_map_string) / segment_length)
  # Property map is somehow still too long to encode within the space
  # we allow, so give up, but make sure the service knows we gave up
- if
+ if num_segments > max_segments:
  return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}

  result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
@@ -443,6 +450,7 @@
  result[segment_key] = segments_list[i]
  return result

+
  class JSONLDataFileLoader:
  def __init__(self, filename: Union[os.PathLike, str]):
  self.filename = filename
@@ -34,15 +34,15 @@ class BleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START bleu_score_evaluator]
  :end-before: [END bleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example with Threshold:
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call an BleuScoreEvaluator.
  """

- id = "
+ id = "azureai://built-in/evaluators/bleu_score"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  def __init__(self, *, threshold=0.5):