azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between the publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (64)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/rai_service.py +3 -3
  5. azure/ai/evaluation/_common/utils.py +74 -17
  6. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  7. azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
  8. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
  9. azure/ai/evaluation/_evaluate/_utils.py +1 -2
  10. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  11. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  12. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
  13. azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
  14. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
  15. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  16. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  17. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  18. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  19. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  20. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  21. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  22. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  23. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  24. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  25. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
  26. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
  27. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
  31. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  32. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
  33. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
  34. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  35. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  36. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  37. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
  38. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  39. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
  40. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
  41. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  42. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  43. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  44. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  45. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  46. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  47. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  48. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  49. azure/ai/evaluation/_version.py +1 -1
  50. azure/ai/evaluation/red_team/__init__.py +2 -2
  51. azure/ai/evaluation/red_team/_red_team.py +838 -478
  52. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  53. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
  54. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  55. azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
  56. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  57. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
  58. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
  59. azure/ai/evaluation/simulator/_simulator.py +12 -0
  60. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
  61. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
  62. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  63. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_xpia/xpia.py

@@ -67,19 +67,22 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     """
 
-    id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/indirect_attack"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.XPIA,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
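The change above swaps the evaluator id from the `azureml://registries/...` model-registry form to the new `azureai://built-in/evaluators/...` scheme and threads `**kwargs` through to the base class. A minimal usage sketch, assuming the public constructor shape shown in this hunk; the credential and project values are placeholders, not taken from this diff:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator

evaluator = IndirectAttackEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="<your-azure-ai-project>",  # placeholder value
)
# query appears in _OPTIONAL_PARAMS above, so it may be optional in cloud evaluation
result = evaluator(
    query="Summarize the attached document.",
    response="The document says ...",
)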
azure/ai/evaluation/_legacy/_batch_engine/_config.py

@@ -19,7 +19,7 @@ class BatchEngineConfig:
     batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
     """The maximum amount of time to wait for all evaluations in the batch to complete."""
 
-    run_timeout_seconds: int = 600
+    line_timeout_seconds: int = 600
     """The maximum amount of time to wait for an evaluation to run against a single entry
     in the data input to complete."""
 

@@ -32,13 +32,16 @@ class BatchEngineConfig:
     default_num_results: int = 100
     """The default number of results to return if you don't ask for all results."""
 
+    raise_on_error: bool = True
+    """Whether to raise an error if an evaluation fails."""
+
     def __post_init__(self):
         if self.logger is None:
             raise ValueError("logger cannot be None")
         if self.batch_timeout_seconds <= 0:
             raise ValueError("batch_timeout_seconds must be greater than 0")
-        if self.run_timeout_seconds <= 0:
-            raise ValueError("run_timeout_seconds must be greater than 0")
+        if self.line_timeout_seconds <= 0:
+            raise ValueError("line_timeout_seconds must be greater than 0")
         if self.max_concurrency <= 0:
             raise ValueError("max_concurrency must be greater than 0")
         if self.default_num_results <= 0:
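The rename from `run_timeout_seconds` to `line_timeout_seconds` makes the per-line scope of the timeout explicit, and `raise_on_error` is the flag `RunSubmitter.stream_run` consumes further down. A minimal sketch of the same dataclass-validation pattern, using a stand-in class since `BatchEngineConfig` lives in a private module and its remaining fields are not shown here:

import logging
from dataclasses import dataclass

@dataclass
class ConfigSketch:
    logger: logging.Logger
    batch_timeout_seconds: int = 3600  # stand-in default
    line_timeout_seconds: int = 600
    max_concurrency: int = 10          # stand-in default
    raise_on_error: bool = True

    def __post_init__(self):
        # mirrors the __post_init__ checks in the hunk above
        if self.logger is None:
            raise ValueError("logger cannot be None")
        for name in ("batch_timeout_seconds", "line_timeout_seconds", "max_concurrency"):
            if getattr(self, name) <= 0:
                raise ValueError(f"{name} must be greater than 0")

config = ConfigSketch(logger=logging.getLogger("batch"), raise_on_error=False)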
azure/ai/evaluation/_legacy/_batch_engine/_engine.py

@@ -20,15 +20,31 @@ from concurrent.futures import Executor
 from functools import partial
 from contextlib import contextmanager
 from datetime import datetime, timezone
-from typing import Any, Callable, Dict, Final, Generator, Mapping, MutableMapping, Optional, Sequence, Set, Tuple, cast
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Final,
+    Generator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    cast,
+    Literal,
+)
 from uuid import uuid4
 
+from ._config import BatchEngineConfig
 from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
 from ._status import BatchStatus
 from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
 from ._run_storage import AbstractRunStorage, NoOpRunStorage
-from .._common._logging import log_progress, NodeLogManager
-from ..._exceptions import ErrorBlame
+from .._common._logging import log_progress, logger, NodeLogManager
+from ..._exceptions import ErrorBlame, EvaluationException
 from ._exceptions import (
     BatchEngineCanceledError,
     BatchEngineError,
@@ -54,30 +70,25 @@ class BatchEngine:
         self,
         func: Callable,
         *,
+        config: BatchEngineConfig,
         storage: Optional[AbstractRunStorage] = None,
-        batch_timeout_sec: Optional[int] = None,
-        line_timeout_sec: Optional[int] = None,
-        max_worker_count: Optional[int] = None,
         executor: Optional[Executor] = None,
     ):
         """Create a new batch engine instance
 
         :param Callable func: The function to run the flow
+        :param BatchEngineConfig config: The configuration for the batch engine
         :param Optional[AbstractRunStorage] storage: The storage to store execution results
-        :param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
-        :param Optional[int] line_timeout_sec: The timeout of each line in seconds
-        :param Optional[int] max_worker_count: The concurrency limit of batch run
        :param Optional[Executor] executor: The executor to run the flow (if needed)
        """

        self._func: Callable = func
+        self._config: BatchEngineConfig = config
        self._storage: AbstractRunStorage = storage or NoOpRunStorage()

-        # TODO ralphe: Consume these from the batch context/config instead of from
-        # kwargs or (even worse) environment variables
-        self._batch_timeout_sec = batch_timeout_sec or get_int_env_var("PF_BATCH_TIMEOUT_SEC")
-        self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
-        self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
+        self._batch_timeout_sec = self._config.batch_timeout_seconds
+        self._line_timeout_sec = self._config.line_timeout_seconds
+        self._max_worker_count = self._config.max_concurrency

        self._executor: Optional[Executor] = executor
        self._is_canceled: bool = False
@@ -85,15 +96,13 @@ class BatchEngine:
     async def run(
         self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         *,
         id: Optional[str] = None,
         max_lines: Optional[int] = None,
     ) -> BatchResult:
         if not data:
             raise BatchEngineValidationError("Please provide a non-empty data mapping.")
-        if not column_mapping:
-            raise BatchEngineValidationError("The column mapping is required.")
 
         start_time = datetime.now(timezone.utc)
 
@@ -105,6 +114,8 @@ class BatchEngine:
             id = id or str(uuid4())
             result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
             return result
+        except EvaluationException:
+            raise
         except Exception as ex:
             raise BatchEngineError(
                 "Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR
@@ -114,20 +125,58 @@ class BatchEngine:
         # TODO ralphe: Make sure this works
         self._is_canceled = True
 
-    @staticmethod
     def _apply_column_mapping(
+        self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         max_lines: Optional[int],
     ) -> Sequence[Mapping[str, str]]:
+
+        resolved_column_mapping: Mapping[str, str] = self._resolve_column_mapping(column_mapping)
+        resolved_column_mapping.update(self._generate_defaults_for_column_mapping())
+        return self._apply_column_mapping_to_lines(data, resolved_column_mapping, max_lines)
+
+    def _resolve_column_mapping(
+        self,
+        column_mapping: Optional[Mapping[str, str]],
+    ) -> Mapping[str, str]:
+        parameters = inspect.signature(self._func).parameters
+        default_column_mapping: Dict[str, str] = {
+            name: f"${{data.{name}}}"
+            for name, value in parameters.items()
+            if name not in ["self", "cls", "args", "kwargs"]
+        }
+        resolved_mapping: Dict[str, str] = default_column_mapping.copy()
+
+        for name, value in parameters.items():
+            if value and value.default is not inspect.Parameter.empty:
+                resolved_mapping.pop(name)
+
+        resolved_mapping.update(column_mapping or {})
+        return resolved_mapping
+
+    def _generate_defaults_for_column_mapping(self) -> Mapping[Literal["$defaults$"], Any]:
+
+        return {
+            DEFAULTS_KEY: {
+                name: value.default
+                for name, value in inspect.signature(self._func).parameters.items()
+                if value.default is not inspect.Parameter.empty
+            }
+        }
+
+    @staticmethod
+    def _apply_column_mapping_to_lines(
+        data: Sequence[Mapping[str, Any]],
+        column_mapping: Mapping[str, str],
+        max_lines: Optional[int],
+    ) -> Sequence[Mapping[str, Any]]:
         data = data[:max_lines] if max_lines else data
 
         inputs: Sequence[Mapping[str, Any]] = []
-        line: int = 0
         defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))
 
-        for input in data:
-            line += 1
+        for line_number, input in enumerate(data, start=1):
             mapped: Dict[str, Any] = {}
             missing_inputs: Set[str] = set()
 
@@ -148,18 +197,18 @@ class BatchEngine:
                     continue
 
                 dict_path = match.group(1)
-                found, value = get_value_from_path(dict_path, input)
+                found, mapped_value = get_value_from_path(dict_path, input)
                 if not found:  # try default value
-                    found, value = get_value_from_path(dict_path, defaults)
+                    found, mapped_value = get_value_from_path(dict_path, defaults)
 
                 if found:
-                    mapped[key] = value
+                    mapped[key] = mapped_value
                 else:
                     missing_inputs.add(dict_path)
 
             if missing_inputs:
                 missing = ", ".join(missing_inputs)
-                raise BatchEngineValidationError(f"Missing inputs for line {line}: '{missing}'")
+                raise BatchEngineValidationError(f"Missing inputs for line {line_number}: '{missing}'")
 
             inputs.append(mapped)
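Net effect of the two hunks above: column mapping is now resolved in three steps, defaulting each required function parameter to a `${data.<name>}` placeholder, carrying parameter defaults under `DEFAULTS_KEY`, and resolving each placeholder per line with those defaults as a fallback. A self-contained sketch of that flow, simplified to flat field names (the real `get_value_from_path` also walks nested paths); all helper names here are illustrative, not the package's private API:

import inspect
import re
from typing import Any, Dict, Mapping, Optional

PLACEHOLDER = re.compile(r"^\$\{(.+)\}$")
DEFAULTS_KEY = "$defaults$"

def resolve_mapping(func, user_mapping: Optional[Mapping[str, str]] = None) -> Dict[str, Any]:
    params = inspect.signature(func).parameters
    # every required parameter defaults to a ${data.<name>} placeholder
    mapping: Dict[str, Any] = {
        name: f"${{data.{name}}}"
        for name, p in params.items()
        if name not in ("self", "cls", "args", "kwargs") and p.default is inspect.Parameter.empty
    }
    mapping.update(user_mapping or {})
    # parameter defaults ride along so a line missing a column can still resolve
    mapping[DEFAULTS_KEY] = {
        name: p.default for name, p in params.items() if p.default is not inspect.Parameter.empty
    }
    return mapping

def apply_to_line(mapping: Mapping[str, Any], row: Mapping[str, Any]) -> Dict[str, Any]:
    defaults: Mapping[str, Any] = mapping.get(DEFAULTS_KEY, {})
    mapped: Dict[str, Any] = {}
    for key, value in mapping.items():
        if key == DEFAULTS_KEY:
            continue
        match = PLACEHOLDER.match(str(value))
        if not match:
            mapped[key] = value  # literal values pass through unchanged
            continue
        field = match.group(1).partition(".")[2]  # "data.query" -> "query"
        if field in row:
            mapped[key] = row[field]
        elif field in defaults:
            mapped[key] = defaults[field]
        else:
            raise ValueError(f"Missing inputs for line: '{match.group(1)}'")
    return mapped

def evaluate(query, response="n/a"): ...

mapping = resolve_mapping(evaluate)
print(apply_to_line(mapping, {"query": "hi"}))  # -> {'query': 'hi'}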
@@ -212,10 +261,12 @@ class BatchEngine:
                     end_time=None,
                     tokens=TokenMetrics(0, 0, 0),
                     error=BatchRunError("The line run is not completed.", None),
+                    index=i,
                 )
             )
             for i in range(len(batch_inputs))
         ]
+        self.handle_line_failures(result_details)
 
         for line_result in result_details:
             # Indicate the worst status of the batch run. This works because
@@ -229,9 +280,15 @@ class BatchEngine:
             metrics.total_tokens += line_result.tokens.total_tokens
 
         if failed_lines and not error:
-            error = BatchEngineRunFailedError(
-                str(floor(failed_lines / len(batch_inputs) * 100)) + f"% of the batch run failed."
+            error_message = f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
+            first_exception: Optional[Exception] = next(
+                (result.error.exception for result in result_details if result.error and result.error.exception),
+                None,
             )
+            if first_exception is not None:
+                error_message += f" {first_exception}"
+
+            error = BatchEngineRunFailedError(error_message)
 
         return BatchResult(
             status=status,
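The failure summary now appends the first concrete exception to the percentage figure. The same message construction, restated standalone:

from math import floor
from typing import Optional

def failure_message(failed_lines: int, total_lines: int,
                    first_exception: Optional[Exception]) -> str:
    message = f"{floor(failed_lines / total_lines * 100)}% of the batch run failed."
    if first_exception is not None:
        message += f" {first_exception}"
    return message

print(failure_message(1, 3, ValueError("missing column 'query'")))
# -> 33% of the batch run failed. missing column 'query'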
@@ -283,6 +340,13 @@ class BatchEngine:
             # TODO ralphe: set logger to use here
         )
 
+    def __preprocess_inputs(self, inputs: Mapping[str, Any]) -> Mapping[str, Any]:
+
+        func_params = inspect.signature(self._func).parameters
+
+        filtered_params = {key: value for key, value in inputs.items() if key in func_params}
+        return filtered_params
+
     async def _exec_line_async(
         self,
         run_id: str,
@@ -298,6 +362,7 @@ class BatchEngine:
             end_time=None,
             tokens=TokenMetrics(0, 0, 0),
             error=None,
+            index=index,
         )
 
         try:
@@ -313,13 +378,15 @@ class BatchEngine:
             # For now we will just run the function in the current process, but in the future we may
             # want to consider running the function in a separate process for isolation reasons.
             output: Any
+
+            processed_inputs = self.__preprocess_inputs(inputs)
             if is_async_callable(self._func):
-                output = await self._func(**inputs)
+                output = await self._func(**processed_inputs)
             else:
                 # to maximize the parallelism, we run the synchronous function in a separate thread
                 # and await its result
                 output = await asyncio.get_event_loop().run_in_executor(
-                    self._executor, partial(self._func, **inputs)
+                    self._executor, partial(self._func, **processed_inputs)
                 )
 
             # This should in theory never happen but as an extra precaution, let's check if the output
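`__preprocess_inputs` filters the mapped row down to the parameters the target function actually declares before calling it, so surplus columns no longer trigger `TypeError: ... got an unexpected keyword argument`. The core idea in isolation:

import inspect

def filter_inputs(func, inputs: dict) -> dict:
    # keep only keys that name a declared parameter of func
    params = inspect.signature(func).parameters
    return {key: value for key, value in inputs.items() if key in params}

def evaluator(query, response): ...

row = {"query": "q", "response": "r", "line_number": 7}
print(filter_inputs(evaluator, row))  # -> {'query': 'q', 'response': 'r'}

One consequence worth noting: a target that declares `**kwargs` would still have its extra columns dropped by this filter, since `inspect.signature` reports the literal parameter name `kwargs` rather than accepting arbitrary keys.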
@@ -340,6 +407,24 @@ class BatchEngine:
 
         return index, details
 
+    @staticmethod
+    def handle_line_failures(run_infos: List[BatchRunDetails], raise_on_line_failure: bool = False):
+        """Handle line failures in batch run"""
+        failed_run_infos: List[BatchRunDetails] = [r for r in run_infos if r.status == BatchStatus.Failed]
+        failed_msg: Optional[str] = None
+        if len(failed_run_infos) > 0:
+            failed_indexes = ",".join([str(r.index) for r in failed_run_infos])
+            first_fail_exception: str = failed_run_infos[0].error.details
+            if raise_on_line_failure:
+                failed_msg = "Flow run failed due to the error: " + first_fail_exception
+                raise Exception(failed_msg)
+
+            failed_msg = (
+                f"{len(failed_run_infos)}/{len(run_infos)} flow run failed, indexes: [{failed_indexes}],"
+                f" exception of index {failed_run_infos[0].index}: {first_fail_exception}"
+            )
+            logger.error(failed_msg)
+
     def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
         # TODO ralphe: implement?
         pass
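`handle_line_failures` either raises on the first failed line (when `raise_on_line_failure` is set) or logs an aggregate summary naming the failing indexes; the new `index` field on `BatchRunDetails` in the next hunk exists to support it. An illustrative stand-in with plain tuples in place of `BatchRunDetails`:

import logging
from typing import List, NamedTuple, Optional

log = logging.getLogger("batch")

class LineRun(NamedTuple):
    index: int
    failed: bool
    error: Optional[str]

def handle_line_failures(runs: List[LineRun], raise_on_line_failure: bool = False) -> None:
    failed = [r for r in runs if r.failed]
    if not failed:
        return
    first_error = failed[0].error or "unknown error"
    if raise_on_line_failure:
        raise Exception("Flow run failed due to the error: " + first_error)
    indexes = ",".join(str(r.index) for r in failed)
    log.error(
        f"{len(failed)}/{len(runs)} flow run failed, indexes: [{indexes}],"
        f" exception of index {failed[0].index}: {first_error}"
    )

handle_line_failures([LineRun(0, False, None), LineRun(1, True, "timeout")])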
azure/ai/evaluation/_legacy/_batch_engine/_result.py

@@ -55,6 +55,8 @@ class BatchRunDetails:
     """The token metrics of the line run."""
     error: Optional[BatchRunError]
     """The error of the line run. This will only be set if the status is Failed."""
+    index: int
+    """The line run index."""
 
     @property
     def duration(self) -> timedelta:
azure/ai/evaluation/_legacy/_batch_engine/_run.py

@@ -58,7 +58,7 @@ class Run:
         dynamic_callable: Callable,
         name_prefix: Optional[str],
         inputs: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]] = None,
         created_on: Optional[datetime] = None,
         run: Optional["Run"] = None,
     ):

@@ -70,7 +70,7 @@ class Run:
         self.dynamic_callable = dynamic_callable
         self.name = self._generate_run_name(name_prefix, self._created_on)
         self.inputs = inputs
-        self.column_mapping = column_mapping
+        self.column_mapping: Optional[Mapping[str, str]] = column_mapping
         self.result: Optional[BatchResult] = None
         self.metrics: Mapping[str, Any] = {}
         self._run = run
azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py

@@ -5,6 +5,7 @@
 import dataclasses
 import inspect
 import sys
+import traceback
 
 from concurrent.futures import Executor
 from datetime import datetime, timezone
@@ -46,11 +47,6 @@ class RunSubmitter:
         **kwargs,
     ) -> Run:
 
-        # if the column mappings are not provided, generate them based on the arguments to the
-        # flow function.
-        if column_mapping is None:
-            column_mapping = self._generate_column_mapping(dynamic_callable)
-
         # The old code always spun up two threads here using a ThreadPoolExecutor:
         # 1. One thread essentially did nothing of value (since tracing was disabled, and we
         #    don't care about checking for the latest PromptFlow version number now)
@@ -84,7 +80,7 @@ class RunSubmitter:
         # unnecessary Flow loading code was removed here. Instead do direct calls to _submit_bulk_run
         await self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
 
-        self.stream_run(run=run, storage=local_storage, raise_on_error=True)
+        self.stream_run(run=run, storage=local_storage, raise_on_error=self._config.raise_on_error)
         return run
 
     async def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
@@ -125,10 +121,8 @@ class RunSubmitter:
         try:
             batch_engine = BatchEngine(
                 run.dynamic_callable,
+                config=self._config,
                 storage=local_storage,
-                batch_timeout_sec=self._config.batch_timeout_seconds,
-                line_timeout_sec=self._config.run_timeout_seconds,
-                max_worker_count=self._config.max_concurrency,
                 executor=self._executor,
             )
 
@@ -160,10 +154,10 @@ class RunSubmitter:
         # system metrics
         system_metrics = {}
         if batch_result:
-            system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
+            # system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
             system_metrics.update(
                 {
-                    "duration": batch_result.duration.total_seconds(),
+                    # "duration": batch_result.duration.total_seconds(),
                     # "__pf__.lines.completed": batch_result.total_lines - batch_result.failed_lines,
                     # "__pf__.lines.failed": batch_result.failed_lines,
                 }
@@ -173,31 +167,16 @@ class RunSubmitter:
         run.metrics = system_metrics
         run.result = batch_result
 
-    @staticmethod
-    def _generate_column_mapping(function: Callable) -> Mapping[str, Any]:
-        args = inspect.signature(function).parameters
-        default_values: Dict[str, Any] = {}
-        mapping: Dict[str, Any] = {}
-        for key, value in args.items():
-            if key in ["self", "cls"] or value.kind in [value.VAR_POSITIONAL, value.VAR_KEYWORD]:
-                continue
-
-            mapping[key] = f"${{data.{key}}}"
-            if value.default != inspect.Parameter.empty:
-                default_values[key] = value.default
-
-        return {
-            **mapping,
-            DEFAULTS_KEY: default_values,
-        }
-
     @staticmethod
     def _validate_inputs(run: Run):
         if not run.inputs and not run.previous_run:
             raise BatchEngineValidationError("Either data, or a previous run must be specified for the evaluation run.")
 
     @staticmethod
-    def _validate_column_mapping(column_mapping: Mapping[str, str]):
+    def _validate_column_mapping(column_mapping: Optional[Mapping[str, str]]):
+        if not column_mapping:
+            return
+
         if not isinstance(column_mapping, Mapping):
             raise BatchEngineValidationError(f"Column mapping must be a dict, got {type(column_mapping)}.")
 
@@ -221,6 +200,7 @@ class RunSubmitter:
             return
 
         file_handler = sys.stdout
+        error_message: Optional[str] = None
         try:
             printed = 0
             available_logs = storage.logger.get_logs()
@@ -232,7 +212,24 @@ class RunSubmitter:
 
         if run.status == RunStatus.FAILED or run.status == RunStatus.CANCELED:
             if run.status == RunStatus.FAILED:
-                error_message = storage.load_exception().get("message", "Run fails with unknown error.")
+                # Get the first error message from the results, or use a default one
+                if run.result and run.result.error:
+                    error_message = "".join(
+                        traceback.format_exception(
+                            type(run.result.error), run.result.error, run.result.error.__traceback__
+                        )
+                    )
+                elif run.result and run.result.details:
+                    err = next((r.error for r in run.result.details if r.error), None)
+                    if err and err.exception:
+                        error_message = "".join(
+                            traceback.format_exception(type(err.exception), err.exception, err.exception.__traceback__)
+                        )
+                    elif err and err.details:
+                        error_message = err.details
+
+                if not error_message:
+                    error_message = "Run fails with unknown error."
             else:
                 error_message = "Run is canceled."
             if raise_on_error:
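The new error path above rebuilds the full traceback text from the stored exception object rather than reading a pre-serialized message. It is plain `traceback.format_exception` usage; on Python 3.10+ the single-argument form `traceback.format_exception(exc)` is equivalent, while the three-argument form shown in the diff also works on older versions:

import traceback

def boom():
    raise ValueError("bad input on line 3")

try:
    boom()
except ValueError as exc:
    # same (type, value, traceback) call shape as in the hunk above
    error_message = "".join(
        traceback.format_exception(type(exc), exc, exc.__traceback__)
    )
    print(error_message)  # full "Traceback (most recent call last): ..." text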
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py

@@ -290,6 +290,7 @@ class _SafetyEvaluation:
                 target=callback,
                 text=source_text if source_text else "",
                 concurrent_async_tasks=concurrent_async_tasks,
+                randomization_seed=randomization_seed,
             )
 
         ## Run AdversarialSimulator

@@ -902,6 +903,7 @@ class _SafetyEvaluation:
             evaluation_name=evaluation_name,
             output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
             _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+            _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
         )
         evaluation_results[strategy] = evaluate_outputs
     return evaluation_results
azure/ai/evaluation/_version.py

@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.9.0"
+VERSION = "1.10.0"
azure/ai/evaluation/red_team/__init__.py

@@ -8,8 +8,8 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-    print(
-        "[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
+    raise ImportError(
+        "Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
     )
 
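With the `print` replaced by `raise ImportError`, a missing Pyrit installation now fails loudly at import time instead of continuing with undefined names. Callers that treat red teaming as optional can catch it explicitly:

try:
    from azure.ai.evaluation.red_team import RiskCategory  # requires the [redteam] extra
except ImportError as exc:
    # e.g. degrade gracefully, or tell the user to: pip install azure-ai-evaluation[redteam]
    print(f"Red teaming features unavailable: {exc}")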