azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from time import sleep
 
  from ._batch_run import CodeClient, ProxyClient
 
- #import aoai_mapping
+ # import aoai_mapping
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -30,17 +30,18 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
  eval_run_id: str
  grader_name_map: Dict[str, str]
 
+
  def _split_evaluators_and_grader_configs(
- evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
- ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
+ ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
  """
  Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
  dictionaries that each contain one subset, the first containing the evaluators and the second containing
  the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
- including child class instances.
+ including child class instances.
 
  :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
- and value as the evaluator function or AOAI grader.
+ and value as the evaluator function or AOAI grader.
  :type evaluators: Dict[str, Union[Callable, ]]
  :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
  :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
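
Note: the split described in this docstring boils down to an isinstance() check against the grader base class. A minimal, self-contained sketch of the pattern, using a stand-in Grader base class rather than the SDK's AzureOpenAIGrader:

    from typing import Callable, Dict, Tuple, Union

    class Grader:  # stand-in for the SDK's AzureOpenAIGrader base class
        pass

    def split_evaluators_and_graders(
        evaluators: Dict[str, Union[Callable, Grader]],
    ) -> Tuple[Dict[str, Callable], Dict[str, Grader]]:
        """Return (plain evaluators, graders); anything subclassing Grader counts as a grader."""
        callables: Dict[str, Callable] = {}
        graders: Dict[str, Grader] = {}
        for alias, value in evaluators.items():
            if isinstance(value, Grader):  # child-class instances land here too
                graders[alias] = value
            else:
                callables[alias] = value
        return callables, graders
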
@@ -54,13 +55,14 @@ def _split_evaluators_and_grader_configs(
  true_evaluators[key] = value
  return true_evaluators, aoai_graders
 
+
  @experimental
  def _begin_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- data: pd.DataFrame,
- run_name: str
- ) -> List[OAIEvalRunCreationInfo]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ data: pd.DataFrame,
+ run_name: str,
+ ) -> List[OAIEvalRunCreationInfo]:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
@@ -84,26 +86,20 @@ def _begin_aoai_evaluation(
  :rtype: List[OAIEvalRunCreationInfo]
  """
 
-
  LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
  all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
  for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
- all_eval_run_info.append(_begin_single_aoai_evaluation(
- selected_graders,
- data,
- selected_column_mapping,
- run_name
- ))
+ all_eval_run_info.append(
+ _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
+ )
 
  return all_eval_run_info
 
+
  def _begin_single_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- data: pd.DataFrame,
- column_mapping: Dict[str, str],
- run_name: str
- ) -> OAIEvalRunCreationInfo:
+ graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+ ) -> OAIEvalRunCreationInfo:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
@@ -121,7 +117,7 @@ def _begin_single_aoai_evaluation(
  """
 
  # Format data for eval group creation
- grader_name_list = []
+ grader_name_list = []
  grader_list = []
  # It's expected that all graders supplied for a single eval run use the same credentials
  # so grab a client from the first grader.
@@ -135,19 +131,17 @@ def _begin_single_aoai_evaluation(
  # Create eval group
  # import pdb; pdb.set_trace()
  eval_group_info = client.evals.create(
- data_source_config=data_source_config,
- testing_criteria=grader_list,
- metadata={"is_foundry_eval": "true"}
+ data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
  )
-
+
  LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
  # Use eval group info to map grader IDs back to user-assigned names.
  grader_name_map = {}
  num_criteria = len(eval_group_info.testing_criteria)
  if num_criteria != len(grader_name_list):
  raise EvaluationException(
- message=f"Number of testing criteria ({num_criteria})" +
- f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
+ message=f"Number of testing criteria ({num_criteria})"
+ + f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.AOAI_GRADER,
@@ -155,21 +149,24 @@ def _begin_single_aoai_evaluation(
  for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
  grader_name_map[criteria.id] = name
 
- # Create eval run
+ # Create eval run
  eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
- LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
- " Results will be retrieved after normal evaluation is complete...")
+ LOGGER.info(
+ f"AOAI: Eval run created with id {eval_run_id}."
+ + " Results will be retrieved after normal evaluation is complete..."
+ )
+
+ return OAIEvalRunCreationInfo(
+ client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+ )
 
- return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
 
- def _get_evaluation_run_results(
- all_run_info: List[OAIEvalRunCreationInfo]
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume. This method accepts a list of eval run information, and will combine the
  results into a single dataframe and metrics dictionary.
-
+
  :param all_run_info: A list of evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,13 +185,14 @@ def _get_evaluation_run_results(
 
  return output_df, run_metrics
 
+
  def _get_single_run_results(
- run_info: OAIEvalRunCreationInfo,
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ run_info: OAIEvalRunCreationInfo,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume.
-
+
  :param run_info: The evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type run_info: OAIEvalRunCreationInfo
@@ -205,28 +203,30 @@ def _get_single_run_results(
  """
  # Wait for evaluation run to complete
  run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
  if run_results.status != "completed":
  raise EvaluationException(
  message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + f" failed with status {run_results.status}.",
+ + f" failed with status {run_results.status}.",
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
  )
- LOGGER.info(f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + " completed successfully. Gathering results...")
+
  # Convert run results into a dictionary of metrics
  run_metrics = {}
  if run_results.per_testing_criteria_results is None:
- msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
- " occur when invalid or conflicting models are selected in the model and grader configs."
- f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+ msg = (
+ "AOAI evaluation run returned no results, despite 'completed' status. This might"
+ + " occur when invalid or conflicting models are selected in the model and grader configs."
+ f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
+ )
  raise EvaluationException(
  message=msg,
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
  for criteria_result in run_results.per_testing_criteria_results:
  grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
  passed = criteria_result.passed
@@ -235,7 +235,6 @@ def _get_single_run_results(
  formatted_column_name = f"{grader_name}.pass_rate"
  run_metrics[formatted_column_name] = ratio
 
-
  # Get full results and convert them into a dataframe.
  # Notes on raw full data output from OAI eval runs:
  # Each row in the full results list in itself a list.
@@ -246,36 +245,72 @@ def _get_single_run_results(
  # The passed and score values are then added to the results dictionary, prepended with the grader's name
  # as entered by the user in the inputted dictionary.
  # Other values, if they exist, are also added to the results dictionary.
- raw_list_results = run_info["client"].evals.runs.output_items.list(
- eval_id=run_info["eval_group_id"],
- run_id=run_info["eval_run_id"]
- )
+
+ # Collect all results with pagination
+ all_results = []
+ next_cursor = None
+ limit = 100 # Max allowed by API
+
+ while True:
+ # Build kwargs for the API call
+ list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+ if next_cursor is not None:
+ list_kwargs["after"] = next_cursor
+
+ raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+ # Add current page results
+ all_results.extend(raw_list_results.data)
+
+ # Check for more pages
+ if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+ if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+ # Get the last item's ID for cursor-based pagination
+ next_cursor = raw_list_results.data[-1].id
+ else:
+ break
+ else:
+ break
+
  listed_results = {"index": []}
  # raw data has no order guarantees, we need to sort them by their
  # datasource_item_id
- for row_result in raw_list_results.data:
+ for row_result in all_results:
  # Add the datasource_item_id for later sorting
  listed_results["index"].append(row_result.datasource_item_id)
  for single_grader_row_result in row_result.results:
  grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
  for name, value in single_grader_row_result.items():
- if name in ["name"]: # Todo decide if we also want to exclude "sample"
+ if name in ["name"]: # Todo decide if we also want to exclude "sample"
  continue
  if name.lower() == "passed":
  # create a `_result` column for each grader
  result_column_name = f"outputs.{grader_name}.{grader_name}_result"
- if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
- if (result_column_name not in listed_results):
+ if len(result_column_name) < 50: # TODO: is this the limit? Should we keep "passed"?
+ if result_column_name not in listed_results:
  listed_results[result_column_name] = []
  listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
 
  formatted_column_name = f"outputs.{grader_name}.{name}"
- if (formatted_column_name not in listed_results):
+ if formatted_column_name not in listed_results:
  listed_results[formatted_column_name] = []
  listed_results[formatted_column_name].append(value)
+
+ # Ensure all columns have the same length as the index
+ num_rows = len(listed_results["index"])
+ for col_name in list(listed_results.keys()):
+ if col_name != "index":
+ col_length = len(listed_results[col_name])
+ if col_length < num_rows:
+ # Pad with None values
+ listed_results[col_name].extend([None] * (num_rows - col_length))
+ elif col_length > num_rows:
+ # This shouldn't happen, but truncate if it does
+ listed_results[col_name] = listed_results[col_name][:num_rows]
+
  output_df = pd.DataFrame(listed_results)
  # sort by index
- output_df = output_df.sort_values('index', ascending=[True])
+ output_df = output_df.sort_values("index", ascending=[True])
  # remove index column
  output_df.drop(columns=["index"], inplace=True)
  return output_df, run_metrics
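
Note: the pagination added above distills to the following standalone sketch. It assumes the OpenAI Python client's cursor-style list interface (a page object with `data` and `has_more`, plus an `after` cursor parameter); `client`, `eval_group_id`, and `run_id` are placeholders.

    def list_all_output_items(client, eval_group_id: str, run_id: str, page_size: int = 100):
        """Collect every output item across pages using cursor-based pagination."""
        all_items = []
        cursor = None
        while True:
            kwargs = {"eval_id": eval_group_id, "run_id": run_id, "limit": page_size}
            if cursor is not None:
                kwargs["after"] = cursor  # resume after the last item already fetched
            page = client.evals.runs.output_items.list(**kwargs)
            all_items.extend(page.data)
            if getattr(page, "has_more", False) and page.data:
                cursor = page.data[-1].id  # the last item's ID becomes the next cursor
            else:
                return all_items
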
@@ -303,9 +338,10 @@ def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str,
  target=ErrorTarget.AOAI_GRADER,
  )
 
- grader_class = _get_grader_class(grader_id)
+ grader_class = _get_grader_class(grader_id)
  return grader_class(**init_params)
 
+
  def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  """
  Given a model ID, return the class of the corresponding grader wrapper.
@@ -316,12 +352,17 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader,
  )
+
  id_map = {
  AzureOpenAIGrader.id: AzureOpenAIGrader,
  AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
  }
 
  for key in id_map.keys():
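
Note: grader deserialization is a registry lookup keyed on each wrapper class's `id` attribute; 1.10.0 extends the map with AzureOpenAIScoreModelGrader and AzureOpenAIPythonGrader. A hedged sketch of the dispatch pattern with stand-in classes and made-up identifier strings:

    from typing import Any, Dict, Type

    class BaseGrader:
        id = "example://graders/base"  # illustrative id, not the SDK's real value

        def __init__(self, **kwargs: Any) -> None:
            self.init_params = kwargs

    class LabelGrader(BaseGrader):
        id = "example://graders/label"

    REGISTRY: Dict[str, Type[BaseGrader]] = {cls.id: cls for cls in (BaseGrader, LabelGrader)}

    def build_grader(grader_id: str, init_params: Dict[str, Any]) -> BaseGrader:
        """Map a serialized grader id back to its wrapper class and instantiate it."""
        try:
            grader_cls = REGISTRY[grader_id]
        except KeyError as exc:
            raise ValueError(f"Unknown grader id: {grader_id}") from exc
        return grader_cls(**init_params)
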
@@ -336,9 +377,9 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 
 
  def _get_graders_and_column_mappings(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
  """
  Given a dictionary of column mappings and a dictionary of AOAI graders,
  Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
@@ -366,20 +407,21 @@ def _get_graders_and_column_mappings(
  """
 
  default_mapping = column_mappings.get("default", None)
- return [({name : grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+ return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+
 
  def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """Produce a data source config that maps all columns from the supplied data source into
  the OAI API. The mapping is naive unless a column mapping is provided, in which case
  the column mapping's values overrule the relevant naive mappings
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
  :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
  :type column_mapping: Optional[Dict[str, str]]
  :return: A dictionary that can act as data source config for OAI evaluation group creation.
- :rtype: Dict[str, Any]
+ :rtype: Dict[str, Any]
  """
 
  data_source_config = {
@@ -388,7 +430,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  "type": "object",
  "properties": {},
  "required": [],
- }
+ },
  }
  properties = data_source_config["item_schema"]["properties"]
  required = data_source_config["item_schema"]["required"]
@@ -399,10 +441,11 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  required.append(key)
  return data_source_config
 
+
  def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
  """Produce a data source config that naively maps all columns from the supplied data source into
  the OAI API.
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
@@ -424,10 +467,11 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
  "type": "object",
  "properties": properties,
  "required": required,
- }
+ },
  }
  return data_source_config
 
+
  def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """
  Given a dataframe of data to be evaluated, and an optional column mapping,
@@ -457,7 +501,7 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  # dictionary that'll work in an OAI data source.
  for row in input_data_df.iterrows():
  row_dict = {}
- for oai_key,dataframe_key in column_to_source_map.items():
+ for oai_key, dataframe_key in column_to_source_map.items():
  row_dict[oai_key] = str(row[1][dataframe_key])
  content.append({"item": row_dict})
 
@@ -466,20 +510,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  "source": {
  "type": "file_content",
  "content": content,
- }
+ },
  }
 
+
  def _begin_eval_run(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- run_name: str,
- input_data_df: pd.DataFrame,
- column_mapping: Dict[str, str]
- ) -> str:
+ client: Union[OpenAI, AzureOpenAI],
+ eval_group_id: str,
+ run_name: str,
+ input_data_df: pd.DataFrame,
+ column_mapping: Dict[str, str],
+ ) -> str:
  """
- Given an eval group id and a dataset file path, use the AOAI API to
+ Given an eval group id and a dataset file path, use the AOAI API to
  start an evaluation run with the given name and description.
- Returns a poller that can be used to monitor the run.
+ Returns a poller that can be used to monitor the run.
 
  :param client: The AOAI client to use for the evaluation.
  :type client: Union[OpenAI, AzureOpenAI]
@@ -499,18 +544,16 @@ def _begin_eval_run(
  eval_id=eval_group_id,
  data_source=data_source,
  name=run_name,
- metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
+ metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
  # TODO decide if we want to add our own timeout value?
  )
  return eval_run.id
 
+
  # Post built TODO: replace with _red_team.py's retry logic?
  def _wait_for_run_conclusion(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- eval_run_id: str,
- max_wait_seconds = 21600
- ) -> Any:
+ client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
+ ) -> Any:
  """
  Perform exponential backoff polling to get the results of an AOAI evaluation run.
  Raises an EvaluationException if max attempts are reached without receiving a concluding status.
@@ -532,8 +575,8 @@ def _wait_for_run_conclusion(
  iters = 0
  # start with ~51 minutes of exponential backoff
  # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
- wait_interval = 3 # Seconds.
- while(True):
+ wait_interval = 3 # Seconds.
+ while True:
  wait_interval *= 1.5
  total_wait += wait_interval
  # Reduce last wait interval if total wait time exceeds max wait time
@@ -541,13 +584,13 @@ def _wait_for_run_conclusion(
  wait_interval -= total_wait - max_wait_seconds
  sleep(wait_interval)
  response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
- if response.status not in ["queued", "in_progress"]:
+ if response.status not in ["queued", "in_progress"]:
  return response
  if total_wait > max_wait_seconds:
  raise EvaluationException(
  message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
- + f" rounds of polling. Final status was {response.status}",
+ + f" rounds of polling. Final status was {response.status}",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
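
Note: the polling loop above is a capped exponential backoff: start at 3 seconds, grow by 1.5x each round, and never exceed max_wait_seconds in total. A standalone sketch of the same schedule, with a placeholder `get_status` callable standing in for `client.evals.runs.retrieve`:

    import time
    from typing import Callable

    def wait_for_conclusion(get_status: Callable[[], str], max_wait_seconds: float = 21600) -> str:
        """Poll with exponential backoff until the status leaves 'queued'/'in_progress'."""
        total_wait = 0.0
        wait_interval = 3.0  # seconds; multiplied by 1.5 each round
        while True:
            wait_interval *= 1.5
            total_wait += wait_interval
            if total_wait > max_wait_seconds:
                # Shrink the final sleep so the overall budget is not exceeded.
                wait_interval -= total_wait - max_wait_seconds
            time.sleep(wait_interval)
            status = get_status()
            if status not in ("queued", "in_progress"):
                return status
            if total_wait > max_wait_seconds:
                raise TimeoutError(f"Timed out waiting for the run; last status was {status}")
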
@@ -17,7 +17,6 @@ from typing_extensions import ParamSpec
 
  from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
- from ..._user_agent import USER_AGENT
  from .._utils import _trace_destination_from_project_scope
 
  LOGGER = logging.getLogger(__name__)
@@ -13,6 +13,9 @@ import base64
  import math
 
  import pandas as pd
+ from tqdm import tqdm
+
+ from azure.core.pipeline.policies import UserAgentPolicy
  from azure.ai.evaluation._legacy._adapters.entities import Run
 
  from azure.ai.evaluation._constants import (
@@ -24,6 +27,7 @@ from azure.ai.evaluation._constants import (
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
  from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._azure._clients import LiteMLClient
 
  LOGGER = logging.getLogger(__name__)
@@ -127,6 +131,7 @@ def process_message_content(content, images_folder_path):
  f.write(image_data_binary)
  return None
 
+
  def _log_metrics_and_instance_results_onedp(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -146,7 +151,8 @@ def _log_metrics_and_instance_results_onedp(
  )
  client = EvaluationServiceOneDPClient(
  endpoint=project_url,
- credential=credentials
+ credential=credentials,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
  )
 
  # Massaging before artifacts are put on disk
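
Note: the client construction above threads the SDK's user-agent string through azure-core. UserAgentPolicy and its base_user_agent argument are standard azure-core APIs; the version string below is only an example.

    from azure.core.pipeline.policies import UserAgentPolicy

    # azure-core appends its own SDK/platform details to whatever base string is supplied.
    policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.10.0")
    print(policy.user_agent)  # e.g. "azure-ai-evaluation/1.10.0 azsdk-python-core/... Python/..."
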
@@ -172,21 +178,19 @@ def _log_metrics_and_instance_results_onedp(
 
  properties = {
  EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
  EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
 
  create_evaluation_result_response = client.create_evaluation_result(
- name=uuid.uuid4(),
- path=tmpdir,
- metrics=metrics
+ name=uuid.uuid4(), path=tmpdir, metrics=metrics
  )
 
  upload_run_response = client.start_evaluation_run(
  evaluation=EvaluationUpload(
  display_name=evaluation_name,
+ properties=properties,
  )
  )
 
@@ -196,14 +200,14 @@ def _log_metrics_and_instance_results_onedp(
  display_name=evaluation_name,
  status="Completed",
  outputs={
- 'evaluationResultId': create_evaluation_result_response.id,
+ "evaluationResultId": create_evaluation_result_response.id,
  },
- properties=properties,
- )
+ ),
  )
 
  return update_run_response.properties.get("AiStudioEvaluationUri")
 
+
  def _log_metrics_and_instance_results(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -266,11 +270,11 @@ def _log_metrics_and_instance_results(
  # We are doing that only for the pure evaluation runs.
  if run is None:
  properties = {
- EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
- EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
- "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+ "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
  ev_run.write_properties_to_run_history(properties=properties)
  else:
@@ -321,7 +325,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
  json.dump(data_dict, f, ensure_ascii=False)
 
- print(f'Evaluation results saved to "{p.resolve()}".\n')
+ # Use tqdm.write to print message without interfering with any current progress bar
+ tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
  def _apply_column_mapping(
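
Note: tqdm.write is used above so the "results saved" message does not get spliced into an active progress bar the way a bare print would. A small illustration:

    from time import sleep
    from tqdm import tqdm

    for i in tqdm(range(3), desc="evaluating"):
        sleep(0.1)
        # tqdm.write clears the bar, prints the line, then redraws the bar,
        # so messages land on their own lines instead of breaking the bar.
        tqdm.write(f"finished row {i}")
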
@@ -407,9 +412,11 @@ def set_event_loop_policy() -> None:
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+
  # textwrap.wrap tries to do fancy nonsense that we don't want
  def _wrap(s, w):
- return [s[i:i + w] for i in range(0, len(s), w)]
+ return [s[i : i + w] for i in range(0, len(s), w)]
+
 
  def _convert_name_map_into_property_entries(
  name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
@@ -433,7 +440,7 @@ def _convert_name_map_into_property_entries(
  num_segments = math.ceil(len(name_map_string) / segment_length)
  # Property map is somehow still too long to encode within the space
  # we allow, so give up, but make sure the service knows we gave up
- if (num_segments > max_segments):
+ if num_segments > max_segments:
  return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
 
  result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
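
Note: the name map is JSON-encoded and chopped into fixed-width segments (950 characters by default, at most 10 of them) because run-history property values have a length cap; -1 signals that the map was too long to store at all. A compact sketch of that segmentation, using hypothetical property key names rather than the SDK's internal constants:

    import json
    import math
    from typing import Any, Dict

    def to_property_segments(
        name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
    ) -> Dict[str, Any]:
        """Split the JSON-encoded map into numbered segments small enough to store."""
        encoded = json.dumps(name_map)
        num_segments = math.ceil(len(encoded) / segment_length)
        if num_segments > max_segments:
            return {"name_map_length": -1}  # give up, but record that we gave up
        result: Dict[str, Any] = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = encoded[i * segment_length : (i + 1) * segment_length]
        return result
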
@@ -443,6 +450,7 @@ def _convert_name_map_into_property_entries(
  result[segment_key] = segments_list[i]
  return result
 
+
  class JSONLDataFileLoader:
  def __init__(self, filename: Union[os.PathLike, str]):
  self.filename = filename
@@ -34,15 +34,15 @@ class BleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START bleu_score_evaluator]
  :end-before: [END bleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
  .. admonition:: Example with Threshold:
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call an BleuScoreEvaluator.
  """
 
- id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/bleu_score"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  def __init__(self, *, threshold=0.5):