azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -11,7 +11,7 @@ from time import sleep
 
  from ._batch_run import CodeClient, ProxyClient
 
- #import aoai_mapping
+ # import aoai_mapping
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -30,17 +30,18 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
  eval_run_id: str
  grader_name_map: Dict[str, str]
 
+
  def _split_evaluators_and_grader_configs(
- evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
- ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
+ ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
  """
  Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
  dictionaries that each contain one subset, the first containing the evaluators and the second containing
  the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
- including child class instances.
+ including child class instances.
 
  :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
- and value as the evaluator function or AOAI grader.
+ and value as the evaluator function or AOAI grader.
  :type evaluators: Dict[str, Union[Callable, ]]
  :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
  :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
@@ -54,13 +55,14 @@ def _split_evaluators_and_grader_configs(
  true_evaluators[key] = value
  return true_evaluators, aoai_graders
 
+
  @experimental
  def _begin_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- data: pd.DataFrame,
- run_name: str
- ) -> List[OAIEvalRunCreationInfo]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ data: pd.DataFrame,
+ run_name: str,
+ ) -> List[OAIEvalRunCreationInfo]:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
@@ -84,26 +86,20 @@ def _begin_aoai_evaluation(
  :rtype: List[OAIEvalRunCreationInfo]
  """
 
-
  LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
  all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
  for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
- all_eval_run_info.append(_begin_single_aoai_evaluation(
- selected_graders,
- data,
- selected_column_mapping,
- run_name
- ))
+ all_eval_run_info.append(
+ _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
+ )
 
  return all_eval_run_info
 
+
  def _begin_single_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- data: pd.DataFrame,
- column_mapping: Dict[str, str],
- run_name: str
- ) -> OAIEvalRunCreationInfo:
+ graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+ ) -> OAIEvalRunCreationInfo:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
@@ -121,7 +117,7 @@ def _begin_single_aoai_evaluation(
  """
 
  # Format data for eval group creation
- grader_name_list = []
+ grader_name_list = []
  grader_list = []
  # It's expected that all graders supplied for a single eval run use the same credentials
  # so grab a client from the first grader.
@@ -135,19 +131,17 @@ def _begin_single_aoai_evaluation(
  # Create eval group
  # import pdb; pdb.set_trace()
  eval_group_info = client.evals.create(
- data_source_config=data_source_config,
- testing_criteria=grader_list,
- metadata={"is_foundry_eval": "true"}
+ data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
  )
-
+
  LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
  # Use eval group info to map grader IDs back to user-assigned names.
  grader_name_map = {}
  num_criteria = len(eval_group_info.testing_criteria)
  if num_criteria != len(grader_name_list):
  raise EvaluationException(
- message=f"Number of testing criteria ({num_criteria})" +
- f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
+ message=f"Number of testing criteria ({num_criteria})"
+ + f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.AOAI_GRADER,
@@ -155,21 +149,24 @@ def _begin_single_aoai_evaluation(
  for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
  grader_name_map[criteria.id] = name
 
- # Create eval run
+ # Create eval run
  eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
- LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
- " Results will be retrieved after normal evaluation is complete...")
+ LOGGER.info(
+ f"AOAI: Eval run created with id {eval_run_id}."
+ + " Results will be retrieved after normal evaluation is complete..."
+ )
+
+ return OAIEvalRunCreationInfo(
+ client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+ )
 
- return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
 
- def _get_evaluation_run_results(
- all_run_info: List[OAIEvalRunCreationInfo]
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume. This method accepts a list of eval run information, and will combine the
  results into a single dataframe and metrics dictionary.
-
+
  :param all_run_info: A list of evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,13 +185,14 @@ def _get_evaluation_run_results(
 
  return output_df, run_metrics
 
+
  def _get_single_run_results(
- run_info: OAIEvalRunCreationInfo,
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ run_info: OAIEvalRunCreationInfo,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume.
-
+
  :param run_info: The evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type run_info: OAIEvalRunCreationInfo
@@ -205,28 +203,30 @@ def _get_single_run_results(
  """
  # Wait for evaluation run to complete
  run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
  if run_results.status != "completed":
  raise EvaluationException(
  message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + f" failed with status {run_results.status}.",
+ + f" failed with status {run_results.status}.",
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
  )
- LOGGER.info(f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + " completed successfully. Gathering results...")
+
  # Convert run results into a dictionary of metrics
  run_metrics = {}
  if run_results.per_testing_criteria_results is None:
- msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
- " occur when invalid or conflicting models are selected in the model and grader configs."
- f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+ msg = (
+ "AOAI evaluation run returned no results, despite 'completed' status. This might"
+ + " occur when invalid or conflicting models are selected in the model and grader configs."
+ f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
+ )
  raise EvaluationException(
  message=msg,
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
  for criteria_result in run_results.per_testing_criteria_results:
  grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
  passed = criteria_result.passed
@@ -235,7 +235,6 @@ def _get_single_run_results(
  formatted_column_name = f"{grader_name}.pass_rate"
  run_metrics[formatted_column_name] = ratio
 
-
  # Get full results and convert them into a dataframe.
  # Notes on raw full data output from OAI eval runs:
  # Each row in the full results list in itself a list.
@@ -246,36 +245,72 @@ def _get_single_run_results(
  # The passed and score values are then added to the results dictionary, prepended with the grader's name
  # as entered by the user in the inputted dictionary.
  # Other values, if they exist, are also added to the results dictionary.
- raw_list_results = run_info["client"].evals.runs.output_items.list(
- eval_id=run_info["eval_group_id"],
- run_id=run_info["eval_run_id"]
- )
+
+ # Collect all results with pagination
+ all_results = []
+ next_cursor = None
+ limit = 100 # Max allowed by API
+
+ while True:
+ # Build kwargs for the API call
+ list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+ if next_cursor is not None:
+ list_kwargs["after"] = next_cursor
+
+ raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+ # Add current page results
+ all_results.extend(raw_list_results.data)
+
+ # Check for more pages
+ if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+ if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+ # Get the last item's ID for cursor-based pagination
+ next_cursor = raw_list_results.data[-1].id
+ else:
+ break
+ else:
+ break
+
  listed_results = {"index": []}
  # raw data has no order guarantees, we need to sort them by their
  # datasource_item_id
- for row_result in raw_list_results.data:
+ for row_result in all_results:
  # Add the datasource_item_id for later sorting
  listed_results["index"].append(row_result.datasource_item_id)
  for single_grader_row_result in row_result.results:
  grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
  for name, value in single_grader_row_result.items():
- if name in ["name"]: # Todo decide if we also want to exclude "sample"
+ if name in ["name"]: # Todo decide if we also want to exclude "sample"
  continue
  if name.lower() == "passed":
  # create a `_result` column for each grader
  result_column_name = f"outputs.{grader_name}.{grader_name}_result"
- if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
- if (result_column_name not in listed_results):
+ if len(result_column_name) < 50: # TODO: is this the limit? Should we keep "passed"?
+ if result_column_name not in listed_results:
  listed_results[result_column_name] = []
  listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
 
  formatted_column_name = f"outputs.{grader_name}.{name}"
- if (formatted_column_name not in listed_results):
+ if formatted_column_name not in listed_results:
  listed_results[formatted_column_name] = []
  listed_results[formatted_column_name].append(value)
+
+ # Ensure all columns have the same length as the index
+ num_rows = len(listed_results["index"])
+ for col_name in list(listed_results.keys()):
+ if col_name != "index":
+ col_length = len(listed_results[col_name])
+ if col_length < num_rows:
+ # Pad with None values
+ listed_results[col_name].extend([None] * (num_rows - col_length))
+ elif col_length > num_rows:
+ # This shouldn't happen, but truncate if it does
+ listed_results[col_name] = listed_results[col_name][:num_rows]
+
  output_df = pd.DataFrame(listed_results)
  # sort by index
- output_df = output_df.sort_values('index', ascending=[True])
+ output_df = output_df.sort_values("index", ascending=[True])
  # remove index column
  output_df.drop(columns=["index"], inplace=True)
  return output_df, run_metrics
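The hunk above replaces the single `output_items.list` call with cursor-based pagination so that eval runs with more than one page of output rows are retrieved in full. Below is a minimal standalone sketch of that pattern, assuming an OpenAI/AzureOpenAI client exposing the same `evals.runs.output_items.list` call used in the diff; the helper name and `Any` typing are illustrative only.

```python
from typing import Any, List


def collect_all_output_items(client: Any, eval_group_id: str, eval_run_id: str) -> List[Any]:
    """Sketch of the cursor-based pagination added above: request pages of output
    items, passing the last item's id as the `after` cursor, until `has_more` is False."""
    all_results: List[Any] = []
    next_cursor = None
    limit = 100  # page size used in the diff

    while True:
        kwargs = {"eval_id": eval_group_id, "run_id": eval_run_id, "limit": limit}
        if next_cursor is not None:
            kwargs["after"] = next_cursor

        page = client.evals.runs.output_items.list(**kwargs)
        all_results.extend(page.data)

        if getattr(page, "has_more", False) and page.data:
            # Advance the cursor to the id of the last item on this page.
            next_cursor = page.data[-1].id
        else:
            return all_results
```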
@@ -303,9 +338,10 @@ def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str,
  target=ErrorTarget.AOAI_GRADER,
  )
 
- grader_class = _get_grader_class(grader_id)
+ grader_class = _get_grader_class(grader_id)
  return grader_class(**init_params)
 
+
  def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  """
  Given a model ID, return the class of the corresponding grader wrapper.
@@ -316,12 +352,15 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader,
  )
+
  id_map = {
  AzureOpenAIGrader.id: AzureOpenAIGrader,
  AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
  }
 
  for key in id_map.keys():
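For readers following the new `AzureOpenAIScoreModelGrader` (added in `azure/ai/evaluation/_aoai/score_model_grader.py`, file 5 in the list above), the hunk above simply registers it in the id-to-class lookup that `_convert_remote_eval_params_to_grader` relies on. A minimal sketch of that dispatch pattern, with hypothetical grader classes and id values standing in for the real wrappers, might look like this:

```python
from typing import Any, Dict, Type


class BaseGrader:
    id = "example/base-grader"  # hypothetical id; the real wrappers define their own

    def __init__(self, **init_params: Any) -> None:
        self.init_params = init_params


class ScoreModelGrader(BaseGrader):
    id = "example/score-model-grader"  # hypothetical id


# Registry keyed by each wrapper's class-level id, mirroring id_map in the diff.
ID_MAP: Dict[str, Type[BaseGrader]] = {cls.id: cls for cls in (BaseGrader, ScoreModelGrader)}


def build_grader(grader_id: str, init_params: Dict[str, Any]) -> BaseGrader:
    """Look up the wrapper class for a grader id and instantiate it with the
    supplied init params, as _convert_remote_eval_params_to_grader does."""
    try:
        grader_class = ID_MAP[grader_id]
    except KeyError as exc:
        raise ValueError(f"Unknown grader id: {grader_id}") from exc
    return grader_class(**init_params)
```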
@@ -336,9 +375,9 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 
 
  def _get_graders_and_column_mappings(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
  """
  Given a dictionary of column mappings and a dictionary of AOAI graders,
  Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
@@ -366,20 +405,21 @@ def _get_graders_and_column_mappings(
  """
 
  default_mapping = column_mappings.get("default", None)
- return [({name : grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+ return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+
 
  def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """Produce a data source config that maps all columns from the supplied data source into
  the OAI API. The mapping is naive unless a column mapping is provided, in which case
  the column mapping's values overrule the relevant naive mappings
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
  :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
  :type column_mapping: Optional[Dict[str, str]]
  :return: A dictionary that can act as data source config for OAI evaluation group creation.
- :rtype: Dict[str, Any]
+ :rtype: Dict[str, Any]
  """
 
  data_source_config = {
@@ -388,7 +428,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  "type": "object",
  "properties": {},
  "required": [],
- }
+ },
  }
  properties = data_source_config["item_schema"]["properties"]
  required = data_source_config["item_schema"]["required"]
@@ -399,10 +439,11 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  required.append(key)
  return data_source_config
 
+
  def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
  """Produce a data source config that naively maps all columns from the supplied data source into
  the OAI API.
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
@@ -424,10 +465,11 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
  "type": "object",
  "properties": properties,
  "required": required,
- }
+ },
  }
  return data_source_config
 
+
  def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """
  Given a dataframe of data to be evaluated, and an optional column mapping,
@@ -457,7 +499,7 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  # dictionary that'll work in an OAI data source.
  for row in input_data_df.iterrows():
  row_dict = {}
- for oai_key,dataframe_key in column_to_source_map.items():
+ for oai_key, dataframe_key in column_to_source_map.items():
  row_dict[oai_key] = str(row[1][dataframe_key])
  content.append({"item": row_dict})
 
@@ -466,20 +508,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  "source": {
  "type": "file_content",
  "content": content,
- }
+ },
  }
 
+
  def _begin_eval_run(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- run_name: str,
- input_data_df: pd.DataFrame,
- column_mapping: Dict[str, str]
- ) -> str:
+ client: Union[OpenAI, AzureOpenAI],
+ eval_group_id: str,
+ run_name: str,
+ input_data_df: pd.DataFrame,
+ column_mapping: Dict[str, str],
+ ) -> str:
  """
- Given an eval group id and a dataset file path, use the AOAI API to
+ Given an eval group id and a dataset file path, use the AOAI API to
  start an evaluation run with the given name and description.
- Returns a poller that can be used to monitor the run.
+ Returns a poller that can be used to monitor the run.
 
  :param client: The AOAI client to use for the evaluation.
  :type client: Union[OpenAI, AzureOpenAI]
@@ -499,18 +542,16 @@ def _begin_eval_run(
  eval_id=eval_group_id,
  data_source=data_source,
  name=run_name,
- metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
+ metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
  # TODO decide if we want to add our own timeout value?
  )
  return eval_run.id
 
+
  # Post built TODO: replace with _red_team.py's retry logic?
  def _wait_for_run_conclusion(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- eval_run_id: str,
- max_wait_seconds = 21600
- ) -> Any:
+ client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
+ ) -> Any:
  """
  Perform exponential backoff polling to get the results of an AOAI evaluation run.
  Raises an EvaluationException if max attempts are reached without receiving a concluding status.
@@ -532,8 +573,8 @@ def _wait_for_run_conclusion(
  iters = 0
  # start with ~51 minutes of exponential backoff
  # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
- wait_interval = 3 # Seconds.
- while(True):
+ wait_interval = 3 # Seconds.
+ while True:
  wait_interval *= 1.5
  total_wait += wait_interval
  # Reduce last wait interval if total wait time exceeds max wait time
@@ -541,13 +582,13 @@ def _wait_for_run_conclusion(
  wait_interval -= total_wait - max_wait_seconds
  sleep(wait_interval)
  response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
- if response.status not in ["queued", "in_progress"]:
+ if response.status not in ["queued", "in_progress"]:
  return response
  if total_wait > max_wait_seconds:
  raise EvaluationException(
  message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
- + f" rounds of polling. Final status was {response.status}",
+ + f" rounds of polling. Final status was {response.status}",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
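`_wait_for_run_conclusion`, reformatted above, polls with a geometric backoff: the interval starts at 3 seconds, is multiplied by 1.5 before each sleep, and the final sleep is trimmed so the total wait never exceeds `max_wait_seconds`. Here is a standalone sketch of that schedule, with a generic `get_status` callable standing in for `client.evals.runs.retrieve` and a plain `TimeoutError` standing in for the SDK's `EvaluationException`:

```python
import time
from typing import Any, Callable


def poll_until_concluded(get_status: Callable[[], Any], max_wait_seconds: float = 21600) -> Any:
    """Sketch of the backoff loop above: sleep 4.5s, 6.75s, ... between polls,
    returning as soon as the run leaves the queued/in_progress states."""
    total_wait = 0.0
    wait_interval = 3.0  # seconds; scaled by 1.5 before each sleep
    while True:
        wait_interval *= 1.5
        total_wait += wait_interval
        if total_wait > max_wait_seconds:
            # Trim the last sleep so the total wait stays within the budget.
            wait_interval -= total_wait - max_wait_seconds
        time.sleep(wait_interval)
        response = get_status()
        if response.status not in ["queued", "in_progress"]:
            return response
        if total_wait > max_wait_seconds:
            raise TimeoutError(f"Run did not conclude; final status was {response.status}")
```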
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -17,7 +17,6 @@ from typing_extensions import ParamSpec
 
  from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
- from ..._user_agent import USER_AGENT
  from .._utils import _trace_destination_from_project_scope
 
  LOGGER = logging.getLogger(__name__)
azure/ai/evaluation/_evaluate/_utils.py
@@ -13,6 +13,9 @@ import base64
  import math
 
  import pandas as pd
+ from tqdm import tqdm
+
+ from azure.core.pipeline.policies import UserAgentPolicy
  from azure.ai.evaluation._legacy._adapters.entities import Run
 
  from azure.ai.evaluation._constants import (
@@ -24,6 +27,7 @@ from azure.ai.evaluation._constants import (
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
  from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._azure._clients import LiteMLClient
 
  LOGGER = logging.getLogger(__name__)
@@ -127,6 +131,7 @@ def process_message_content(content, images_folder_path):
  f.write(image_data_binary)
  return None
 
+
  def _log_metrics_and_instance_results_onedp(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -146,7 +151,8 @@ def _log_metrics_and_instance_results_onedp(
  )
  client = EvaluationServiceOneDPClient(
  endpoint=project_url,
- credential=credentials
+ credential=credentials,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
  )
 
  # Massaging before artifacts are put on disk
@@ -175,13 +181,11 @@ def _log_metrics_and_instance_results_onedp(
  EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
  EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
 
  create_evaluation_result_response = client.create_evaluation_result(
- name=uuid.uuid4(),
- path=tmpdir,
- metrics=metrics
+ name=uuid.uuid4(), path=tmpdir, metrics=metrics
  )
 
  upload_run_response = client.start_evaluation_run(
@@ -196,14 +200,15 @@ def _log_metrics_and_instance_results_onedp(
  display_name=evaluation_name,
  status="Completed",
  outputs={
- 'evaluationResultId': create_evaluation_result_response.id,
+ "evaluationResultId": create_evaluation_result_response.id,
  },
  properties=properties,
- )
+ ),
  )
 
  return update_run_response.properties.get("AiStudioEvaluationUri")
 
+
  def _log_metrics_and_instance_results(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -266,11 +271,11 @@ def _log_metrics_and_instance_results(
  # We are doing that only for the pure evaluation runs.
  if run is None:
  properties = {
- EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
- EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
- "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+ "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
  ev_run.write_properties_to_run_history(properties=properties)
  else:
@@ -321,7 +326,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
  json.dump(data_dict, f, ensure_ascii=False)
 
- print(f'Evaluation results saved to "{p.resolve()}".\n')
+ # Use tqdm.write to print message without interfering with any current progress bar
+ tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
  def _apply_column_mapping(
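The `_write_output` change above swaps `print` for `tqdm.write` so the save message does not corrupt an active progress bar. A small self-contained illustration (the loop and message are for demonstration only):

```python
from time import sleep

from tqdm import tqdm

for i in tqdm(range(5), desc="evaluating"):
    sleep(0.1)
    if i == 2:
        # tqdm.write prints the message above the bar and redraws it,
        # whereas a bare print() would break the progress line.
        tqdm.write("Evaluation results saved to ./results.json")
```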
@@ -407,9 +413,11 @@ def set_event_loop_policy() -> None:
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+
  # textwrap.wrap tries to do fancy nonsense that we don't want
  def _wrap(s, w):
- return [s[i:i + w] for i in range(0, len(s), w)]
+ return [s[i : i + w] for i in range(0, len(s), w)]
+
 
  def _convert_name_map_into_property_entries(
  name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
@@ -433,7 +441,7 @@ def _convert_name_map_into_property_entries(
  num_segments = math.ceil(len(name_map_string) / segment_length)
  # Property map is somehow still too long to encode within the space
  # we allow, so give up, but make sure the service knows we gave up
- if (num_segments > max_segments):
+ if num_segments > max_segments:
  return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
 
  result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
@@ -443,6 +451,7 @@ def _convert_name_map_into_property_entries(
  result[segment_key] = segments_list[i]
  return result
 
+
  class JSONLDataFileLoader:
  def __init__(self, filename: Union[os.PathLike, str]):
  self.filename = filename
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -34,15 +34,15 @@ class BleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START bleu_score_evaluator]
  :end-before: [END bleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
  .. admonition:: Example with Threshold: