azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (134)
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -0,0 +1,741 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from enum import Enum
+ import os
+ import inspect
+ import logging
+ from datetime import datetime
+ from azure.ai.evaluation._common._experimental import experimental
+ from typing import Any, Callable, Dict, List, Optional, Union, cast
+ from azure.ai.evaluation._common.math import list_mean_nan_safe
+ from azure.ai.evaluation._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+ from azure.ai.evaluation._evaluators import (
+     _content_safety,
+     _protected_material,
+     _groundedness,
+     _relevance,
+     _similarity,
+     _fluency,
+     _xpia,
+     _coherence,
+ )
+ from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+ from azure.ai.evaluation._evaluate import _evaluate
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+ from azure.ai.evaluation.simulator import (
+     Simulator,
+     AdversarialSimulator,
+     AdversarialScenario,
+     AdversarialScenarioJailbreak,
+     IndirectAttackSimulator,
+     DirectAttackSimulator,
+ )
+ from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
+ from azure.ai.evaluation.simulator._utils import JsonLineList
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential
+ import json
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+ JAILBREAK_EXT = "_Jailbreak"
+ DATA_EXT = "_Data.jsonl"
+ RESULTS_EXT = "_Results.jsonl"
+
+ def _setup_logger():
+     """Configure and return a logger instance for the CustomAdversarialSimulator.
+
+     :return: The logger instance.
+     :rtype: logging.Logger
+     """
+     log_filename = datetime.now().strftime("%Y_%m_%d__%H_%M.log")
+     logger = logging.getLogger("CustomAdversarialSimulatorLogger")
+     logger.setLevel(logging.DEBUG)
+     file_handler = logging.FileHandler(log_filename)
+     file_handler.setLevel(logging.DEBUG)
+     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+     file_handler.setFormatter(formatter)
+     logger.addHandler(file_handler)
+
+     return logger
+
+
+ @experimental
+ class _SafetyEvaluator(Enum):
+     """
+     Evaluator types for Safety evaluation.
+     """
+
+     CONTENT_SAFETY = "content_safety"
+     GROUNDEDNESS = "groundedness"
+     PROTECTED_MATERIAL = "protected_material"
+     RELEVANCE = "relevance"
+     SIMILARITY = "similarity"
+     FLUENCY = "fluency"
+     COHERENCE = "coherence"
+     INDIRECT_ATTACK = "indirect_attack"
+     DIRECT_ATTACK = "direct_attack"
+     ECI = "eci"
+
+
+ @experimental
+ class _SafetyEvaluation:
+     def __init__(
+         self,
+         azure_ai_project: dict,
+         credential: TokenCredential,
+         model_config: Optional[Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]] = None,
+     ):
+         """
+         Initializes a SafetyEvaluation object.
+
+         :param azure_ai_project: A dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
+         :type azure_ai_project: Dict[str, str]
+         :param credential: The credential for connecting to Azure AI project.
+         :type credential: ~azure.core.credentials.TokenCredential
+         :param model_config: A dictionary defining the configuration for the model. Acceptable types are AzureOpenAIModelConfiguration and OpenAIModelConfiguration.
+         :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration]
+         :raises ValueError: If the model_config does not contain the required keys or any value is None.
+         """
+         if model_config:
+             self._validate_model_config(model_config)
+             self.model_config = model_config
+         else:
+             self.model_config = None
+         validate_azure_ai_project(azure_ai_project)
+         self.azure_ai_project = AzureAIProject(**azure_ai_project)
+         self.credential = credential
+         self.logger = _setup_logger()
+
+
+     @staticmethod
+     def _validate_model_config(model_config: Any):
+         """
+         Validates the model_config to ensure all required keys are present and have non-None values.
+         If 'type' is not specified, it will attempt to infer the type based on the keys present.
+
+         :param model_config: The model configuration dictionary.
+         :type model_config: Dict[str, Any]
+         :raises ValueError: If required keys are missing or any of the values are None.
+         """
+         # Attempt to infer 'type' if not provided
+         if "type" not in model_config:
+             if "azure_deployment" in model_config and "azure_endpoint" in model_config:
+                 model_config["type"] = "azure_openai"
+             elif "model" in model_config:
+                 model_config["type"] = "openai"
+             else:
+                 raise ValueError(
+                     "Unable to infer 'type' from model_config. Please specify 'type' as 'azure_openai' or 'openai'."
+                 )
+
+         if model_config["type"] == "azure_openai":
+             required_keys = ["azure_deployment", "azure_endpoint"]
+         elif model_config["type"] == "openai":
+             required_keys = ["api_key", "model"]
+         else:
+             raise ValueError("model_config 'type' must be 'azure_openai' or 'openai'.")
+
+         missing_keys = [key for key in required_keys if key not in model_config]
+         if missing_keys:
+             raise ValueError(f"model_config is missing required keys: {', '.join(missing_keys)}")
+         none_keys = [key for key in required_keys if model_config.get(key) is None]
+         if none_keys:
+             raise ValueError(f"The following keys in model_config must not be None: {', '.join(none_keys)}")
+
+     async def _simulate(
+         self,
+         target: Callable,
+         max_conversation_turns: int = 1,
+         max_simulation_results: int = 3,
+         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+         tasks: List[str] = [],
+         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
+         source_text: Optional[str] = None,
+         direct_attack: bool = False,
+     ) -> Dict[str, str]:
+         """
+         Generates synthetic conversations based on provided parameters.
+
+         :param target: The target function to call during the simulation.
+         :type target: Callable
+         :param max_conversation_turns: The maximum number of turns in a conversation.
+         :type max_conversation_turns: int
+         :param max_simulation_results: The maximum number of simulation results to generate.
+         :type max_simulation_results: int
+         :param conversation_turns: Predefined conversation turns to simulate.
+         :type conversation_turns: List[List[Union[str, Dict[str, Any]]]]
+         :param tasks: A list of user tasks, each represented as a list of strings. Text should be relevant for the tasks and facilitate the simulation. One example is to use text to provide context for the tasks.
+         :type tasks: List[str]
+         :param adversarial_scenario: The adversarial scenario to simulate. If None, the non-adversarial Simulator is used.
+         :type adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param source_text: The source text to use as grounding document in the simulation.
+         :type source_text: Optional[str]
+         :param direct_attack: If True, the DirectAttackSimulator will be run.
+         :type direct_attack: bool
+         """
+
+         ## Define callback
+         async def callback(
+             messages: List[Dict],
+             stream: bool = False,
+             session_state: Optional[str] = None,
+             context: Optional[Dict] = None,
+         ) -> dict:
+             messages_list = messages["messages"]  # type: ignore
+             latest_message = messages_list[-1]
+             application_input = latest_message["content"]
+             context = latest_message.get("context", None)
+             latest_context = None
+             try:
+                 if self._check_target_returns_context(target):
+                     response, latest_context = target(query=application_input)
+                 else:
+                     response = target(query=application_input)
+             except Exception as e:
+                 response = f"Something went wrong {e!s}"
+
+             ## We format the response to follow the openAI chat protocol format
+             formatted_response = {
+                 "content": response,
+                 "role": "assistant",
+                 "context": latest_context if latest_context else context,
+             }
+             ## NOTE: In the future, instead of appending to messages we should just return `formatted_response`
+             messages["messages"].append(formatted_response)  # type: ignore
+             return {
+                 "messages": messages_list,
+                 "stream": stream,
+                 "session_state": session_state,
+                 "context": latest_context if latest_context else context,
+             }
+
+         ## Run simulator
+         simulator = None
+         simulator_outputs = None
+         jailbreak_outputs = None
+         simulator_data_paths = {}
+
+         # if IndirectAttack, run IndirectAttackSimulator
+         if adversarial_scenario == AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK:
+             self.logger.info(
+                 f"Running IndirectAttackSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, text={source_text}"
+             )
+             simulator = IndirectAttackSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario,
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 tasks=tasks,
+                 conversation_turns=conversation_turns,
+                 text=source_text,
+                 target=callback,
+             )
+
+         # if DirectAttack, run DirectAttackSimulator
+         elif direct_attack and isinstance(adversarial_scenario, AdversarialScenario):
+             self.logger.info(
+                 f"Running DirectAttackSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}"
+             )
+             simulator = DirectAttackSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario if adversarial_scenario else AdversarialScenario.ADVERSARIAL_REWRITE,
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 target=callback,
+             )
+             jailbreak_outputs = simulator_outputs["jailbreak"]
+             simulator_outputs = simulator_outputs["regular"]
+
+         ## If adversarial_scenario is not provided, run Simulator
+         elif adversarial_scenario is None and self.model_config:
+             self.logger.info(
+                 f"Running Simulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, source_text={source_text}"
+             )
+             simulator = Simulator(self.model_config)
+             simulator_outputs = await simulator(
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 conversation_turns=conversation_turns,
+                 num_queries=max_simulation_results,
+                 target=callback,
+                 text=source_text if source_text else "",
+             )
+
+         ## Run AdversarialSimulator
+         elif adversarial_scenario:
+             self.logger.info(
+                 f"Running AdversarialSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, source_text={source_text}"
+             )
+             simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario,  # type: ignore
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 conversation_turns=conversation_turns,
+                 target=callback,
+                 text=source_text,
+             )
+
+         ## If no outputs are generated, raise an exception
+         if not simulator_outputs:
+             self.logger.error("No outputs generated by the simulator")
+             msg = "No outputs generated by the simulator"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.ADVERSARIAL_SIMULATOR,
+                 category=ErrorCategory.UNKNOWN,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         data_path_base = simulator.__class__.__name__
+
+         ## Write outputs to file according to scenario
+         if direct_attack and jailbreak_outputs:
+             jailbreak_data_path = data_path_base + JAILBREAK_EXT
+             with Path(jailbreak_data_path + DATA_EXT).open("w") as f:
+                 f.writelines(jailbreak_outputs.to_eval_qr_json_lines())
+             simulator_data_paths[jailbreak_data_path] = jailbreak_data_path + DATA_EXT
+         with Path(data_path_base + DATA_EXT).open("w") as f:
+             if not adversarial_scenario or adversarial_scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
+                 if source_text or self._check_target_returns_context(target):
+                     eval_input_data_json_lines = ""
+                     for output in simulator_outputs:
+                         query = None
+                         response = None
+                         context = source_text
+                         ground_truth = source_text
+                         for message in output["messages"]:
+                             if message["role"] == "user":
+                                 query = message["content"]
+                             if message["role"] == "assistant":
+                                 response = message["content"]
+                         if query and response:
+                             eval_input_data_json_lines += (
+                                 json.dumps(
+                                     {
+                                         "query": query,
+                                         "response": response,
+                                         "context": context,
+                                         "ground_truth": ground_truth,
+                                     }
+                                 )
+                                 + "\n"
+                             )
+                     f.write(eval_input_data_json_lines)
+                 elif isinstance(simulator_outputs, JsonLineList):
+                     f.writelines(simulator_outputs.to_eval_qr_json_lines())
+                 else:
+                     f.writelines(output.to_eval_qr_json_lines() for output in simulator_outputs)
+             else:
+                 f.writelines(
+                     [
+                         json.dumps({"conversation": {"messages": conversation["messages"]}}) + "\n"
+                         for conversation in simulator_outputs
+                     ]
+                 )
+         simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
+
+         return simulator_data_paths
+
+     def _get_scenario(
+         self,
+         evaluators: List[_SafetyEvaluator],
+         num_turns: int = 3,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+     ) -> Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]]:
+         """
+         Returns the Simulation scenario based on the provided list of SafetyEvaluator.
+
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[SafetyEvaluator]
+         :param num_turns: The number of turns in a conversation.
+         :type num_turns: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         """
+         if len(evaluators) == 0:
+             return AdversarialScenario.ADVERSARIAL_QA
+         for evaluator in evaluators:
+             if evaluator in [_SafetyEvaluator.CONTENT_SAFETY, _SafetyEvaluator.DIRECT_ATTACK]:
+                 if num_turns == 1 and scenario:
+                     return scenario
+                 return (
+                     AdversarialScenario.ADVERSARIAL_CONVERSATION
+                     if num_turns > 1
+                     else AdversarialScenario.ADVERSARIAL_QA
+                 )
+             if evaluator == _SafetyEvaluator.ECI:
+                 return _UnstableAdversarialScenario.ECI
+             if evaluator in [
+                 _SafetyEvaluator.GROUNDEDNESS,
+                 _SafetyEvaluator.RELEVANCE,
+                 _SafetyEvaluator.SIMILARITY,
+                 _SafetyEvaluator.FLUENCY,
+                 _SafetyEvaluator.COHERENCE,
+             ]:
+                 return None
+             if evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
+                 return AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL
+             if evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
+                 return AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
+
+             msg = f"Invalid evaluator: {evaluator}. Supported evaluators: {_SafetyEvaluator.__members__.values()}"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     def _get_evaluators(
+         self,
+         evaluators: List[_SafetyEvaluator],
+     ) -> Dict[str, Callable]:
+         """
+         Returns a dictionary of evaluators based on the provided list of SafetyEvaluator.
+
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[SafetyEvaluator]
+         """
+         evaluators_dict = {}
+         # Default to content safety when no evaluators are specified
+         if len(evaluators) == 0:
+             evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                 azure_ai_project=self.azure_ai_project, credential=self.credential
+             )
+             return evaluators_dict
+
+         for evaluator in evaluators:
+             if evaluator == _SafetyEvaluator.CONTENT_SAFETY:
+                 evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.GROUNDEDNESS:
+                 evaluators_dict["groundedness"] = _groundedness.GroundednessEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
+                 evaluators_dict["protected_material"] = _protected_material.ProtectedMaterialEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.RELEVANCE:
+                 evaluators_dict["relevance"] = _relevance.RelevanceEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.SIMILARITY:
+                 evaluators_dict["similarity"] = _similarity.SimilarityEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.FLUENCY:
+                 evaluators_dict["fluency"] = _fluency.FluencyEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.COHERENCE:
+                 evaluators_dict["coherence"] = _coherence.CoherenceEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
+                 evaluators_dict["indirect_attack"] = _xpia.IndirectAttackEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.DIRECT_ATTACK:
+                 evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.ECI:
+                 evaluators_dict["eci"] = ECIEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             else:
+                 msg = (
+                     f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"
+                 )
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.UNKNOWN,  ## NOTE: We should add a target for this potentially
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+         return evaluators_dict
+
+     @staticmethod
+     def _check_target_returns_context(target: Callable) -> bool:
+         """
+         Checks if the target function returns a tuple. We assume the second value in the tuple is the "context".
+
+         :param target: The target function to check.
+         :type target: Callable
+         """
+         sig = inspect.signature(target)
+         ret_type = sig.return_annotation
+         if ret_type == inspect.Signature.empty:
+             return False
+         if ret_type is tuple:
+             return True
+         return False
+
+     @staticmethod
+     def _check_target_returns_str(target: Callable) -> bool:
+         '''
+         Checks if the target function returns a string.
+
+         :param target: The target function to check.
+         :type target: Callable
+         '''
+         sig = inspect.signature(target)
+         ret_type = sig.return_annotation
+         if ret_type == inspect.Signature.empty:
+             return False
+         if ret_type is str:
+             return True
+         return False
+
+
+     @staticmethod
+     def _check_target_is_callback(target: Callable) -> bool:
+         sig = inspect.signature(target)
+         param_names = list(sig.parameters.keys())
+         return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
+
+     def _validate_inputs(
+         self,
+         evaluators: List[_SafetyEvaluator],
+         target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+         num_turns: int = 1,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+         source_text: Optional[str] = None,
+     ):
+         """
+         Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[SafetyEvaluator]
+         :param target: The target function to call during the evaluation.
+         :type target: Callable
+         :param num_turns: The number of turns in a conversation between the target application and the caller.
+         :type num_turns: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param source_text: The source text to use as grounding document in the evaluation.
+         :type source_text: Optional[str]
+         """
+         if not callable(target):
+             self._validate_model_config(target)
+         elif not self._check_target_returns_str(target):
+             self.logger.error(f"Target function {target} does not return a string.")
+             msg = f"Target function {target} does not return a string."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if _SafetyEvaluator.GROUNDEDNESS in evaluators and not source_text:
+             self.logger.error(f"GroundednessEvaluator requires source_text. Source text: {source_text}")
+             msg = "GroundednessEvaluator requires source_text"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if scenario and len(evaluators) > 0 and _SafetyEvaluator.CONTENT_SAFETY not in evaluators:
+             self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
+             msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
+             self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
+             msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if (
+             any(
+                 e in evaluators
+                 for e in [
+                     _SafetyEvaluator.GROUNDEDNESS,
+                     _SafetyEvaluator.RELEVANCE,
+                     _SafetyEvaluator.SIMILARITY,
+                     _SafetyEvaluator.FLUENCY,
+                     _SafetyEvaluator.COHERENCE,
+                 ]
+             )
+             and not self.model_config
+         ):
+             self.logger.error(f"Model configuration is required for evaluators: {evaluators}")
+             msg = f"Model configuration is required for evaluators: {evaluators}"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     def _calculate_defect_rate(self, evaluation_result_dict) -> EvaluationResult:
+         jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
+         regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
+         evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
+         if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
+             hate_unfairness_defects = []
+             violence_defects = []
+             sexual_defects = []
+             self_harm_defects = []
+             for i, row in enumerate(zip(evaluation_result_dict[jailbreak_cols[0]]['rows'], evaluation_result_dict[regular_cols[0]]['rows'])):
+                 hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
+                 violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
+                 sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
+                 self_harm_defects.append(1 if (row[0]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
+             hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
+             violence_defect_rate = list_mean_nan_safe(violence_defects)
+             sexual_defect_rate = list_mean_nan_safe(sexual_defects)
+             self_harm_defect_rate = list_mean_nan_safe(self_harm_defects)
+
+             evaluation_result["rows"] = (
+                 evaluation_result_dict[jailbreak_cols[0]]["rows"] + evaluation_result_dict[regular_cols[0]]["rows"]
+             )
+             evaluation_result["metrics"] = {
+                 "content_safety.hate_unfairness_defect_rate": hate_unfairness_defect_rate,
+                 "content_safety.violence_defect_rate": violence_defect_rate,
+                 "content_safety.sexual_defect_rate": sexual_defect_rate,
+                 "content_safety.self_harm_defect_rate": self_harm_defect_rate,
+             }
+             evaluation_result["studio_url"] = (
+                 evaluation_result_dict[jailbreak_cols[0]]["studio_url"] + "\t" + evaluation_result_dict[regular_cols[0]]["studio_url"]
+             )
+         return evaluation_result
+
+     async def __call__(
+         self,
+         target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+         evaluators: List[_SafetyEvaluator] = [],
+         evaluation_name: Optional[str] = None,
+         num_turns: int = 1,
+         num_rows: int = 5,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+         tasks: List[str] = [],
+         data_only: bool = False,
+         source_text: Optional[str] = None,
+         data_path: Optional[Union[str, os.PathLike]] = None,
+         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
+         output_path: Optional[Union[str, os.PathLike]] = None,
+         data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None
+     ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
+         '''
+         Evaluates the target function based on the provided parameters.
+
+         :param target: The target function to call during the evaluation.
+         :type target: Callable
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[_SafetyEvaluator]
+         :param evaluation_name: The display name of the evaluation.
+         :type evaluation_name: Optional[str]
+         :param num_turns: The number of turns in a conversation between the target application and the caller.
+         :type num_turns: int
+         :param num_rows: The (maximum) number of rows to generate for evaluation.
+         :type num_rows: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param conversation_turns: Predefined conversation turns to simulate.
+         :type conversation_turns: List[List[Union[str, Dict[str, Any]]]]
+         :param tasks: A list of user tasks, each represented as a list of strings. Text should be relevant for the tasks and facilitate the simulation. One example is to use text to provide context for the tasks.
+         :type tasks: List[str]
+         :param data_only: If True, the filepath to which simulation results are written will be returned.
+         :type data_only: bool
+         :param source_text: The source text to use as grounding document in the evaluation.
+         :type source_text: Optional[str]
+         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
+         :type data_path: Optional[Union[str, os.PathLike]]
+         :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
+         :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
+         :param output_path: The path to write the evaluation results to if set.
+         :type output_path: Optional[Union[str, os.PathLike]]
+         '''
+         ## Log inputs
+         self.logger.info(f"User inputs: evaluators={evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+
+         ## Validate arguments
+         self._validate_inputs(
+             evaluators=evaluators,
+             target=target,
+             num_turns=num_turns,
+             scenario=scenario,
+             source_text=source_text,
+         )
+
+         # Get scenario
+         adversarial_scenario = self._get_scenario(evaluators, num_turns=num_turns, scenario=scenario)
+         self.logger.info(f"Using scenario: {adversarial_scenario}")
+
+         ## Get evaluators
+         evaluators_dict = self._get_evaluators(evaluators)
+
+         ## If `data_path` is not provided, run simulator
+         if not data_paths and data_path is None and jailbreak_data_path is None and isinstance(target, Callable):
+             self.logger.info("No data_path provided. Running simulator.")
+             data_paths = await self._simulate(
+                 target=target,
+                 adversarial_scenario=adversarial_scenario,
+                 max_conversation_turns=num_turns,
+                 max_simulation_results=num_rows,
+                 conversation_turns=conversation_turns,
+                 tasks=tasks,
+                 source_text=source_text,
+                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+             )
+         elif data_path:
+             data_paths = {Path(data_path).stem: data_path}
+             if jailbreak_data_path:
+                 data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
+
+         if data_only and data_paths: return data_paths
+
+         ## Run evaluation
+         evaluation_results = {}
+         if data_paths:
+             for strategy, data_path in data_paths.items():
+                 self.logger.info(f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}")
+                 if evaluation_name: output_prefix = evaluation_name + "_"
+                 else: output_prefix = ""
+                 evaluate_outputs = _evaluate.evaluate(
+                     data=data_path,
+                     evaluators=evaluators_dict,
+                     azure_ai_project=self.azure_ai_project,
+                     evaluation_name=evaluation_name,
+                     output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
+                     _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                 )
+                 evaluation_results[strategy] = evaluate_outputs
+             return evaluation_results
+         else:
+             raise EvaluationException(
+                 message="No data found after simulation",
+                 internal_message="No data found after simulation",
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
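
Usage sketch (not part of the diff): the new _SafetyEvaluation helper shown above simulates adversarial conversations against a target callable and then runs the selected safety evaluators over the generated data. The following minimal example is based only on the signatures visible in this diff; the project values and my_target are placeholders, azure-identity is assumed to be available for the credential, and because the class is experimental and private (underscore-prefixed) its import path and behavior may change between releases.

import asyncio

from azure.identity import DefaultAzureCredential  # assumed dependency for the credential
from azure.ai.evaluation._safety_evaluation._safety_evaluation import (
    _SafetyEvaluation,
    _SafetyEvaluator,
)


def my_target(query: str) -> str:
    # Placeholder application under test. The `-> str` annotation matters:
    # _validate_inputs rejects callables without a str return annotation.
    return "example response"


azure_ai_project = {
    "subscription_id": "<subscription-id>",        # placeholder values
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

safety_eval = _SafetyEvaluation(
    azure_ai_project=azure_ai_project,
    credential=DefaultAzureCredential(),
)

# __call__ is async: it simulates adversarial conversations against the target,
# writes them to *_Data.jsonl files, and then evaluates them.
results = asyncio.run(
    safety_eval(
        target=my_target,
        evaluators=[_SafetyEvaluator.CONTENT_SAFETY],
        num_turns=1,
        num_rows=3,
    )
)
print(results)

Passing data_only=True instead returns the simulated *_Data.jsonl paths rather than evaluation results, matching the data_only branch in __call__ above.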