azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
- azure/ai/evaluation/_evaluate/_utils.py +1 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -2
- azure/ai/evaluation/red_team/_red_team.py +838 -478
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED

@@ -67,19 +67,22 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/indirect_attack"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.XPIA,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            **kwargs,
        )
 
    @overload

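Two of these changes are caller-visible: the evaluator now advertises a stable `azureai://` identifier for cloud evaluation, and `__init__` forwards `**kwargs` to the `RaiServiceEvaluatorBase` base class. A minimal local-usage sketch; the project values are placeholders, and the diff does not enumerate which keyword options the base class accepts:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator

# Placeholder project details -- substitute your own Azure AI project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = IndirectAttackEvaluator(DefaultAzureCredential(), azure_ai_project)

# _OPTIONAL_PARAMS = ["query"] marks `query` as optional (useful for
# conversation-style inputs); single-turn calls can still pass it explicitly.
result = evaluator(
    query="Summarize the attached document.",
    response="Ignoring previous instructions, here is the admin password...",
)
print(result)
```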
azure/ai/evaluation/_legacy/_batch_engine/_config.py CHANGED

@@ -19,7 +19,7 @@ class BatchEngineConfig:
     batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
     """The maximum amount of time to wait for all evaluations in the batch to complete."""
 
-    run_timeout_seconds: int = 600
+    line_timeout_seconds: int = 600
     """The maximum amount of time to wait for an evaluation to run against a single entry
     in the data input to complete."""
 
@@ -32,13 +32,16 @@ class BatchEngineConfig:
     default_num_results: int = 100
     """The default number of results to return if you don't ask for all results."""
 
+    raise_on_error: bool = True
+    """Whether to raise an error if an evaluation fails."""
+
     def __post_init__(self):
         if self.logger is None:
             raise ValueError("logger cannot be None")
         if self.batch_timeout_seconds <= 0:
             raise ValueError("batch_timeout_seconds must be greater than 0")
-        if self.run_timeout_seconds <= 0:
-            raise ValueError("run_timeout_seconds must be greater than 0")
+        if self.line_timeout_seconds <= 0:
+            raise ValueError("line_timeout_seconds must be greater than 0")
         if self.max_concurrency <= 0:
             raise ValueError("max_concurrency must be greater than 0")
         if self.default_num_results <= 0:

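Put together, the `_config.py` changes rename the per-line timeout and add a `raise_on_error` switch. A sketch of the resulting dataclass; defaults the diff does not show are filled in as placeholders:

```python
from dataclasses import dataclass
import logging

PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600  # placeholder; the SDK defines the real value

@dataclass
class BatchEngineConfig:
    """Sketch of the 1.10.0 configuration surface, not the full class."""

    logger: logging.Logger
    batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
    line_timeout_seconds: int = 600   # renamed from run_timeout_seconds
    max_concurrency: int = 4          # placeholder default
    default_num_results: int = 100
    raise_on_error: bool = True       # new in 1.10.0

    def __post_init__(self):
        if self.logger is None:
            raise ValueError("logger cannot be None")
        # Every numeric knob must be strictly positive, as in the diff above.
        for name in ("batch_timeout_seconds", "line_timeout_seconds",
                     "max_concurrency", "default_num_results"):
            if getattr(self, name) <= 0:
                raise ValueError(f"{name} must be greater than 0")
```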
azure/ai/evaluation/_legacy/_batch_engine/_engine.py CHANGED

@@ -20,15 +20,31 @@ from concurrent.futures import Executor
 from functools import partial
 from contextlib import contextmanager
 from datetime import datetime, timezone
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Final,
+    Generator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    cast,
+    Literal,
+)
 from uuid import uuid4
 
+from ._config import BatchEngineConfig
 from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
 from ._status import BatchStatus
 from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
 from ._run_storage import AbstractRunStorage, NoOpRunStorage
-from .._common._logging import log_progress, NodeLogManager
-from ..._exceptions import ErrorBlame
+from .._common._logging import log_progress, logger, NodeLogManager
+from ..._exceptions import ErrorBlame, EvaluationException
 from ._exceptions import (
     BatchEngineCanceledError,
     BatchEngineError,

@@ -54,30 +70,25 @@ class BatchEngine:
         self,
         func: Callable,
         *,
+        config: BatchEngineConfig,
         storage: Optional[AbstractRunStorage] = None,
-        batch_timeout_sec: Optional[int] = None,
-        line_timeout_sec: Optional[int] = None,
-        max_worker_count: Optional[int] = None,
         executor: Optional[Executor] = None,
     ):
         """Create a new batch engine instance
 
         :param Callable func: The function to run the flow
+        :param BatchEngineConfig config: The configuration for the batch engine
         :param Optional[AbstractRunStorage] storage: The storage to store execution results
-        :param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
-        :param Optional[int] line_timeout_sec: The timeout of each line in seconds
-        :param Optional[int] max_worker_count: The concurrency limit of batch run
         :param Optional[Executor] executor: The executor to run the flow (if needed)
         """
 
         self._func: Callable = func
+        self._config: BatchEngineConfig = config
         self._storage: AbstractRunStorage = storage or NoOpRunStorage()
 
-
-
-        self.
-        self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
-        self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
+        self._batch_timeout_sec = self._config.batch_timeout_seconds
+        self._line_timeout_sec = self._config.line_timeout_seconds
+        self._max_worker_count = self._config.max_concurrency
 
         self._executor: Optional[Executor] = executor
         self._is_canceled: bool = False

@@ -85,15 +96,13 @@ class BatchEngine:
     async def run(
         self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         *,
         id: Optional[str] = None,
         max_lines: Optional[int] = None,
     ) -> BatchResult:
         if not data:
             raise BatchEngineValidationError("Please provide a non-empty data mapping.")
-        if not column_mapping:
-            raise BatchEngineValidationError("The column mapping is required.")
 
         start_time = datetime.now(timezone.utc)
 
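The net effect of the constructor and `run()` changes: all tuning now flows through a single `BatchEngineConfig`, and `column_mapping` may be omitted entirely. A hedged call-site sketch; these are internal (`_legacy`) module paths taken from the file list above, the config is assumed to need only `logger`, and the evaluator is illustrative:

```python
import asyncio
import logging

from azure.ai.evaluation._legacy._batch_engine._config import BatchEngineConfig
from azure.ai.evaluation._legacy._batch_engine._engine import BatchEngine

def my_evaluator(query: str, response: str) -> dict:
    # Illustrative evaluator; any callable works.
    return {"length": len(response)}

async def main():
    config = BatchEngineConfig(logger=logging.getLogger("batch"))
    engine = BatchEngine(my_evaluator, config=config)  # config is now required
    # column_mapping=None no longer raises; a mapping is derived from
    # my_evaluator's signature ({"query": "${data.query}", ...}).
    result = await engine.run(
        data=[{"query": "q1", "response": "r1"}],
        column_mapping=None,
    )
    print(result.status)

asyncio.run(main())
```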
@@ -105,6 +114,8 @@ class BatchEngine:
             id = id or str(uuid4())
             result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
             return result
+        except EvaluationException:
+            raise
         except Exception as ex:
             raise BatchEngineError(
                 "Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR

@@ -114,20 +125,58 @@ class BatchEngine:
         # TODO ralphe: Make sure this works
         self._is_canceled = True
 
-    @staticmethod
     def _apply_column_mapping(
+        self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         max_lines: Optional[int],
     ) -> Sequence[Mapping[str, str]]:
+
+        resolved_column_mapping: Mapping[str, str] = self._resolve_column_mapping(column_mapping)
+        resolved_column_mapping.update(self._generate_defaults_for_column_mapping())
+        return self._apply_column_mapping_to_lines(data, resolved_column_mapping, max_lines)
+
+    def _resolve_column_mapping(
+        self,
+        column_mapping: Optional[Mapping[str, str]],
+    ) -> Mapping[str, str]:
+        parameters = inspect.signature(self._func).parameters
+        default_column_mapping: Dict[str, str] = {
+            name: f"${{data.{name}}}"
+            for name, value in parameters.items()
+            if name not in ["self", "cls", "args", "kwargs"]
+        }
+        resolved_mapping: Dict[str, str] = default_column_mapping.copy()
+
+        for name, value in parameters.items():
+            if value and value.default is not inspect.Parameter.empty:
+                resolved_mapping.pop(name)
+
+        resolved_mapping.update(column_mapping or {})
+        return resolved_mapping
+
+    def _generate_defaults_for_column_mapping(self) -> Mapping[Literal["$defaults$"], Any]:
+
+        return {
+            DEFAULTS_KEY: {
+                name: value.default
+                for name, value in inspect.signature(self._func).parameters.items()
+                if value.default is not inspect.Parameter.empty
+            }
+        }
+
+    @staticmethod
+    def _apply_column_mapping_to_lines(
+        data: Sequence[Mapping[str, Any]],
+        column_mapping: Mapping[str, str],
+        max_lines: Optional[int],
+    ) -> Sequence[Mapping[str, Any]]:
         data = data[:max_lines] if max_lines else data
 
         inputs: Sequence[Mapping[str, Any]] = []
-        line: int = 0
         defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))
 
-        for input in data:
-            line += 1
+        for line_number, input in enumerate(data, start=1):
             mapped: Dict[str, Any] = {}
             missing_inputs: Set[str] = set()
 
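The key behavioral change in `_resolve_column_mapping`: parameters with defaults are no longer force-mapped to `${data.<name>}`, so rows missing an optional column fall back to the function's own default instead of failing the line. A standalone re-implementation of the same logic, for illustration only:

```python
import inspect

DEFAULTS_KEY = "$defaults$"  # same sentinel the engine uses

def resolve_column_mapping(func, column_mapping=None):
    parameters = inspect.signature(func).parameters
    resolved = {
        name: f"${{data.{name}}}"
        for name in parameters
        if name not in ("self", "cls", "args", "kwargs")
    }
    # Drop parameters that carry defaults; their values come from the
    # DEFAULTS_KEY bucket generated alongside this mapping.
    for name, param in parameters.items():
        if param.default is not inspect.Parameter.empty:
            resolved.pop(name, None)
    resolved.update(column_mapping or {})
    return resolved

def evaluator(query, response, threshold=3):
    ...

print(resolve_column_mapping(evaluator))
# {'query': '${data.query}', 'response': '${data.response}'}
# `threshold` is absent: rows without it use threshold=3 via the defaults bucket.
```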
@@ -148,18 +197,18 @@ class BatchEngine:
                 continue
 
             dict_path = match.group(1)
-            found,
+            found, mapped_value = get_value_from_path(dict_path, input)
             if not found:  # try default value
-                found,
+                found, mapped_value = get_value_from_path(dict_path, defaults)
 
             if found:
-                mapped[key] =
+                mapped[key] = mapped_value
             else:
                 missing_inputs.add(dict_path)
 
         if missing_inputs:
             missing = ", ".join(missing_inputs)
-            raise BatchEngineValidationError(f"Missing inputs for line {
+            raise BatchEngineValidationError(f"Missing inputs for line {line_number}: '{missing}'")
 
         inputs.append(mapped)
 
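The `found, mapped_value = get_value_from_path(...)` pair implements a two-stage lookup: first the input row, then the defaults bucket. A simplified stand-in for the SDK helper (not its actual implementation) showing the behavior:

```python
from typing import Any, Mapping, Tuple

def get_value_from_path(path: str, data: Mapping[str, Any]) -> Tuple[bool, Any]:
    """Simplified stand-in for the engine's dotted-path lookup helper."""
    current: Any = data
    for part in path.split("."):
        if not isinstance(current, Mapping) or part not in current:
            return False, None
        current = current[part]
    return True, current

row = {"data": {"query": "hello"}}
defaults = {"data": {"threshold": 3}}

found, value = get_value_from_path("data.threshold", row)
if not found:  # fall back to the defaults bucket, exactly as the new code does
    found, value = get_value_from_path("data.threshold", defaults)

assert (found, value) == (True, 3)
```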
@@ -212,10 +261,12 @@ class BatchEngine:
                     end_time=None,
                     tokens=TokenMetrics(0, 0, 0),
                     error=BatchRunError("The line run is not completed.", None),
+                    index=i,
                 )
             )
             for i in range(len(batch_inputs))
         ]
+        self.handle_line_failures(result_details)
 
         for line_result in result_details:
             # Indicate the worst status of the batch run. This works because

@@ -229,9 +280,15 @@ class BatchEngine:
             metrics.total_tokens += line_result.tokens.total_tokens
 
         if failed_lines and not error:
-
-
+            error_message = f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
+            first_exception: Optional[Exception] = next(
+                (result.error.exception for result in result_details if result.error and result.error.exception),
+                None,
             )
+            if first_exception is not None:
+                error_message += f" {first_exception}"
+
+            error = BatchEngineRunFailedError(error_message)
 
         return BatchResult(
             status=status,
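The new failure summary rounds the failure rate down and appends the first captured exception, so a partially failed run produces a message like the one below:

```python
from math import floor

failed_lines, total_lines = 3, 8
first_exception = ValueError("bad row")  # illustrative captured exception

error_message = f"{floor(failed_lines / total_lines * 100)}% of the batch run failed."
if first_exception is not None:
    error_message += f" {first_exception}"

print(error_message)  # "37% of the batch run failed. bad row"
```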
@@ -283,6 +340,13 @@ class BatchEngine:
             # TODO ralphe: set logger to use here
         )
 
+    def __preprocess_inputs(self, inputs: Mapping[str, Any]) -> Mapping[str, Any]:
+
+        func_params = inspect.signature(self._func).parameters
+
+        filtered_params = {key: value for key, value in inputs.items() if key in func_params}
+        return filtered_params
+
     async def _exec_line_async(
         self,
         run_id: str,
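`__preprocess_inputs` exists because mapped rows can carry columns the target callable never declared; filtering them avoids a `TypeError: unexpected keyword argument` at call time. The same idea in isolation:

```python
import inspect

def preprocess_inputs(func, inputs):
    """Keep only the keys the callable actually declares as parameters."""
    func_params = inspect.signature(func).parameters
    return {key: value for key, value in inputs.items() if key in func_params}

def evaluator(query, response):
    return {"length": len(response)}

row = {"query": "q", "response": "r", "ground_truth": "not a parameter"}
print(preprocess_inputs(evaluator, row))               # {'query': 'q', 'response': 'r'}
print(evaluator(**preprocess_inputs(evaluator, row)))  # {'length': 1}
```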
@@ -298,6 +362,7 @@ class BatchEngine:
             end_time=None,
             tokens=TokenMetrics(0, 0, 0),
             error=None,
+            index=index,
         )
 
         try:
@@ -313,13 +378,15 @@ class BatchEngine:
             # For now we will just run the function in the current process, but in the future we may
             # want to consider running the function in a separate process for isolation reasons.
             output: Any
+
+            processed_inputs = self.__preprocess_inputs(inputs)
             if is_async_callable(self._func):
-                output = await self._func(**inputs)
+                output = await self._func(**processed_inputs)
             else:
                 # to maximize the parallelism, we run the synchronous function in a separate thread
                 # and await its result
                 output = await asyncio.get_event_loop().run_in_executor(
-                    self._executor, partial(self._func, **inputs)
+                    self._executor, partial(self._func, **processed_inputs)
                 )
 
             # This should in theory never happen but as an extra precaution, let's check if the output
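The dispatch around `processed_inputs` awaits async callables directly and pushes synchronous ones onto an executor thread so other lines keep running. A self-contained sketch of the same pattern, using `inspect.iscoroutinefunction` in place of the SDK's `is_async_callable`:

```python
import asyncio
import inspect
from functools import partial

async def call_one(func, /, **inputs):
    if inspect.iscoroutinefunction(func):
        return await func(**inputs)
    # Run blocking callables in the default executor so the event loop
    # can interleave other lines meanwhile.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(func, **inputs))

def slow_eval(query):          # synchronous evaluator
    return {"echo": query}

async def fast_eval(query):    # asynchronous evaluator
    return {"echo": query}

async def main():
    print(await call_one(slow_eval, query="a"))
    print(await call_one(fast_eval, query="b"))

asyncio.run(main())
```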
@@ -340,6 +407,24 @@ class BatchEngine:
 
         return index, details
 
+    @staticmethod
+    def handle_line_failures(run_infos: List[BatchRunDetails], raise_on_line_failure: bool = False):
+        """Handle line failures in batch run"""
+        failed_run_infos: List[BatchRunDetails] = [r for r in run_infos if r.status == BatchStatus.Failed]
+        failed_msg: Optional[str] = None
+        if len(failed_run_infos) > 0:
+            failed_indexes = ",".join([str(r.index) for r in failed_run_infos])
+            first_fail_exception: str = failed_run_infos[0].error.details
+            if raise_on_line_failure:
+                failed_msg = "Flow run failed due to the error: " + first_fail_exception
+                raise Exception(failed_msg)
+
+            failed_msg = (
+                f"{len(failed_run_infos)}/{len(run_infos)} flow run failed, indexes: [{failed_indexes}],"
+                f" exception of index {failed_run_infos[0].index}: {first_fail_exception}"
+            )
+            logger.error(failed_msg)
+
     def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
         # TODO ralphe: implement?
         pass
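`handle_line_failures` is log-only by default; with `raise_on_line_failure=True` the first failed line aborts the run. A toy illustration of the two modes using a stand-in result type (not the SDK's `BatchRunDetails`):

```python
import logging
from dataclasses import dataclass
from typing import Optional

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("batch")

@dataclass
class LineResult:  # stand-in for BatchRunDetails
    index: int
    failed: bool
    details: Optional[str] = None

def handle_line_failures(run_infos, raise_on_line_failure=False):
    failed = [r for r in run_infos if r.failed]
    if failed:
        first = failed[0].details or "unknown error"
        if raise_on_line_failure:
            raise Exception("Flow run failed due to the error: " + first)
        logger.error(
            f"{len(failed)}/{len(run_infos)} flow run failed, "
            f"indexes: [{','.join(str(r.index) for r in failed)}], "
            f"exception of index {failed[0].index}: {first}"
        )

rows = [LineResult(0, False), LineResult(1, True, "boom"), LineResult(2, True, "bust")]
handle_line_failures(rows)  # logs "2/3 flow run failed, indexes: [1,2], ..."
```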
azure/ai/evaluation/_legacy/_batch_engine/_result.py CHANGED

@@ -55,6 +55,8 @@ class BatchRunDetails:
     """The token metrics of the line run."""
     error: Optional[BatchRunError]
     """The error of the line run. This will only be set if the status is Failed."""
+    index: int
+    """The line run index."""
 
     @property
     def duration(self) -> timedelta:
azure/ai/evaluation/_legacy/_batch_engine/_run.py CHANGED

@@ -58,7 +58,7 @@ class Run:
         dynamic_callable: Callable,
         name_prefix: Optional[str],
         inputs: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]] = None,
         created_on: Optional[datetime] = None,
         run: Optional["Run"] = None,
     ):

@@ -70,7 +70,7 @@ class Run:
         self.dynamic_callable = dynamic_callable
         self.name = self._generate_run_name(name_prefix, self._created_on)
         self.inputs = inputs
-        self.column_mapping = column_mapping
+        self.column_mapping: Optional[Mapping[str, str]] = column_mapping
         self.result: Optional[BatchResult] = None
         self.metrics: Mapping[str, Any] = {}
         self._run = run

azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py CHANGED

@@ -5,6 +5,7 @@
 import dataclasses
 import inspect
 import sys
+import traceback
 
 from concurrent.futures import Executor
 from datetime import datetime, timezone
@@ -46,11 +47,6 @@ class RunSubmitter:
         **kwargs,
     ) -> Run:
 
-        # if the column mappings are not provided, generate them based on the arguments to the
-        # flow function.
-        if column_mapping is None:
-            column_mapping = self._generate_column_mapping(dynamic_callable)
-
         # The old code always spun up two threads here using a ThreadPoolExecutor:
         # 1. One thread essentially did nothing of value (since tracing was disabled, and we
         #    don't care about checking for the latest PromptFlow version number now)
@@ -84,7 +80,7 @@ class RunSubmitter:
         # unnecessary Flow loading code was removed here. Instead do direct calls to _submit_bulk_run
         await self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
 
-        self.stream_run(run=run, storage=local_storage, raise_on_error=True)
+        self.stream_run(run=run, storage=local_storage, raise_on_error=self._config.raise_on_error)
         return run
 
     async def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
@@ -125,10 +121,8 @@ class RunSubmitter:
         try:
             batch_engine = BatchEngine(
                 run.dynamic_callable,
+                config=self._config,
                 storage=local_storage,
-                batch_timeout_sec=self._config.batch_timeout_seconds,
-                line_timeout_sec=self._config.run_timeout_seconds,
-                max_worker_count=self._config.max_concurrency,
                 executor=self._executor,
             )
 
@@ -160,10 +154,10 @@ class RunSubmitter:
         # system metrics
         system_metrics = {}
         if batch_result:
-            system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
+            # system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
             system_metrics.update(
                 {
-                    "duration": batch_result.duration.total_seconds(),
+                    # "duration": batch_result.duration.total_seconds(),
                     # "__pf__.lines.completed": batch_result.total_lines - batch_result.failed_lines,
                     # "__pf__.lines.failed": batch_result.failed_lines,
                 }
@@ -173,31 +167,16 @@ class RunSubmitter:
         run.metrics = system_metrics
         run.result = batch_result
 
-    @staticmethod
-    def _generate_column_mapping(function: Callable) -> Mapping[str, Any]:
-        args = inspect.signature(function).parameters
-        default_values: Dict[str, Any] = {}
-        mapping: Dict[str, Any] = {}
-        for key, value in args.items():
-            if key in ["self", "cls"] or value.kind in [value.VAR_POSITIONAL, value.VAR_KEYWORD]:
-                continue
-
-            mapping[key] = f"${{data.{key}}}"
-            if value.default != inspect.Parameter.empty:
-                default_values[key] = value.default
-
-        return {
-            **mapping,
-            DEFAULTS_KEY: default_values,
-        }
-
     @staticmethod
     def _validate_inputs(run: Run):
         if not run.inputs and not run.previous_run:
             raise BatchEngineValidationError("Either data, or a previous run must be specified for the evaluation run.")
 
     @staticmethod
-    def _validate_column_mapping(column_mapping: Mapping[str, str]):
+    def _validate_column_mapping(column_mapping: Optional[Mapping[str, str]]):
+        if not column_mapping:
+            return
+
         if not isinstance(column_mapping, Mapping):
             raise BatchEngineValidationError(f"Column mapping must be a dict, got {type(column_mapping)}.")
 
@@ -221,6 +200,7 @@ class RunSubmitter:
             return
 
         file_handler = sys.stdout
+        error_message: Optional[str] = None
         try:
             printed = 0
             available_logs = storage.logger.get_logs()
@@ -232,7 +212,24 @@ class RunSubmitter:
 
         if run.status == RunStatus.FAILED or run.status == RunStatus.CANCELED:
             if run.status == RunStatus.FAILED:
-
+                # Get the first error message from the results, or use a default one
+                if run.result and run.result.error:
+                    error_message = "".join(
+                        traceback.format_exception(
+                            type(run.result.error), run.result.error, run.result.error.__traceback__
+                        )
+                    )
+                elif run.result and run.result.details:
+                    err = next((r.error for r in run.result.details if r.error), None)
+                    if err and err.exception:
+                        error_message = "".join(
+                            traceback.format_exception(type(err.exception), err.exception, err.exception.__traceback__)
+                        )
+                    elif err and err.details:
+                        error_message = err.details
+
+                if not error_message:
+                    error_message = "Run fails with unknown error."
             else:
                 error_message = "Run is canceled."
             if raise_on_error:
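The `stream_run` rewrite builds the failure message from whichever error it can find: the run-level error first, then the first per-line error. The three-argument `traceback.format_exception` form it uses works on all supported Python versions:

```python
import traceback

try:
    {}["missing"]  # provoke an exception to capture
except KeyError as exc:
    captured = exc

# Same call shape as the new stream_run code path:
error_message = "".join(
    traceback.format_exception(type(captured), captured, captured.__traceback__)
)
print(error_message)  # full "Traceback (most recent call last): ..." text
```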
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED

@@ -290,6 +290,7 @@ class _SafetyEvaluation:
                 target=callback,
                 text=source_text if source_text else "",
                 concurrent_async_tasks=concurrent_async_tasks,
+                randomization_seed=randomization_seed,
             )
 
         ## Run AdversarialSimulator
@@ -902,6 +903,7 @@ class _SafetyEvaluation:
                 evaluation_name=evaluation_name,
                 output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
                 _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
             )
             evaluation_results[strategy] = evaluate_outputs
         return evaluation_results

azure/ai/evaluation/_version.py CHANGED
azure/ai/evaluation/red_team/__init__.py CHANGED

@@ -8,8 +8,8 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-
-    "
+    raise ImportError(
+        "Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
     )
 
 
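With this change, importing the red-team subpackage without the optional Pyrit dependency fails immediately with an actionable message rather than leaving a half-initialized module. A quick way to observe the behavior, assuming the extra is not installed:

```python
try:
    import azure.ai.evaluation.red_team  # noqa: F401
except ImportError as exc:
    print(exc)
    # Could not import Pyrit. Please install the dependency with
    # `pip install azure-ai-evaluation[redteam]`.
```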