azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
- azure/ai/evaluation/_aoai/label_grader.py +8 -3
- azure/ai/evaluation/_aoai/python_grader.py +8 -3
- azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
- azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
- azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
- azure/ai/evaluation/_evaluate/_utils.py +7 -3
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
- azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
- azure/ai/evaluation/red_team/_red_team.py +9 -0
- azure/ai/evaluation/red_team/_red_team_result.py +230 -1
- azure/ai/evaluation/red_team/_result_processor.py +416 -23
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
- {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
- {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
- {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_result_processor.py:

```diff
@@ -11,6 +11,9 @@ import hashlib
 import json
 import math
 import os
+import uuid
+from collections import defaultdict
+from datetime import datetime
 from typing import Any, Dict, List, Optional, Union, cast
 
 import pandas as pd
@@ -25,7 +28,14 @@ from ._utils.formatting_utils import list_mean_nan_safe, is_none_or_nan, get_att
 class ResultProcessor:
     """Handles processing and formatting of red team evaluation results."""
 
-    def __init__(
+    def __init__(
+        self,
+        logger,
+        attack_success_thresholds,
+        application_scenario,
+        risk_categories,
+        ai_studio_url=None,
+    ):
         """Initialize the result processor.
 
         :param logger: Logger instance for logging
@@ -53,6 +63,7 @@ class ResultProcessor:
         risk_categories = []
         attack_successes = []
         conversations = []
+        output_item_lookup = defaultdict(list)
 
         self.logger.info(f"Building RedTeamResult from red_team_info with {len(red_team_info)} strategies")
 
@@ -155,18 +166,19 @@ class ResultProcessor:
                        conv_data = json.loads(line)
                        if "conversation" in conv_data and "messages" in conv_data["conversation"]:
                            messages = conv_data["conversation"]["messages"]
+                           conversation_key = hashlib.sha256(
+                               json.dumps(messages, sort_keys=True).encode("utf-8")
+                           ).hexdigest()
 
                            # Determine attack success based on evaluation results if available
                            attack_success = None
                            risk_assessment = {}
 
-                           # If we have evaluation results, try to extract attack success and risk assessment
                            eval_row = None
+
+                           # If we have evaluation results, try to extract attack success and risk assessment
                            if eval_result:
-                               key = hashlib.sha256(
-                                   json.dumps(messages, sort_keys=True).encode("utf-8")
-                               ).hexdigest()
-                               eval_row = eval_row_lookup.get(key)
+                               eval_row = eval_row_lookup.get(conversation_key)
                                if eval_row:
                                    if f"outputs.{risk_category}.{risk_category}_result" in eval_row:
                                        attack_success = get_attack_success(
@@ -245,7 +257,18 @@ class ResultProcessor:
                                "risk_assessment": (risk_assessment if risk_assessment else None),
                                "attack_success_threshold": attack_threshold,
                            }
+                           conversation_index = len(conversations)
                            conversations.append(conversation)
+
+                           output_item_lookup[conversation_key].append(
+                               self._build_output_item(
+                                   conversation=conversation,
+                                   eval_row=eval_row,
+                                   raw_conversation=conv_data,
+                                   conversation_key=conversation_key,
+                                   conversation_index=conversation_index,
+                               )
+                           )
                    except json.JSONDecodeError as e:
                        self.logger.error(f"Error parsing JSON in data file {data_file}: {e}")
                    except Exception as e:
@@ -259,6 +282,22 @@ class ResultProcessor:
            conversations.sort(key=lambda x: x["attack_technique"])
            self.logger.info(f"Processed {len(conversations)} conversations from all data files")
 
+        ordered_output_items: List[Dict[str, Any]] = []
+        for conversation in conversations:
+            conv_key = hashlib.sha256(
+                json.dumps(conversation["conversation"], sort_keys=True).encode("utf-8")
+            ).hexdigest()
+            items_for_key = output_item_lookup.get(conv_key, [])
+            if items_for_key:
+                ordered_output_items.append(items_for_key.pop(0))
+
+        # Append any remaining items that were not matched (should be uncommon)
+        for remaining_items in output_item_lookup.values():
+            if remaining_items:
+                ordered_output_items.extend(remaining_items)
+
+        self.logger.info(f"Processed {len(ordered_output_items)} output items from all data files")
+
        # Create a DataFrame for analysis
        results_dict = {
            "converter": converters,
```
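The pairing of conversations with evaluation rows and output items hinges on the `conversation_key` introduced above: a SHA-256 digest of the message list serialized as canonical JSON. A minimal standalone sketch of that derivation (the sample messages are invented for illustration):

```python
import hashlib
import json

# Invented sample conversation; the real messages come from the red team data file.
messages = [
    {"role": "user", "content": "example attack prompt"},
    {"role": "assistant", "content": "example response"},
]

# Same derivation as in the diff: canonical JSON (sorted keys) hashed with SHA-256,
# giving a stable key for eval_row_lookup and output_item_lookup.
conversation_key = hashlib.sha256(
    json.dumps(messages, sort_keys=True).encode("utf-8")
).hexdigest()

print(conversation_key)
```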
azure/ai/evaluation/red_team/_result_processor.py (continued):

```diff
@@ -293,11 +332,363 @@ class ResultProcessor:
             scorecard=cast(RedTeamingScorecard, scorecard),
             parameters=cast(RedTeamingParameters, redteaming_parameters),
             attack_details=conversations,
+            output_items=ordered_output_items,
             studio_url=self.ai_studio_url or None,
         )
 
         return red_team_result
 
+    def _build_output_item(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> Dict[str, Any]:
+        """Construct an output item entry for a single conversation."""
+
+        created_time = self._resolve_created_time(eval_row)
+        datasource_item_id = self._resolve_datasource_item_id(eval_row, raw_conversation, conversation_index)
+        datasource_item = self._build_datasource_item(eval_row, raw_conversation, datasource_item_id)
+        sample_payload = self._build_sample_payload(conversation, raw_conversation)
+        results = self._build_output_result(
+            conversation,
+            eval_row,
+            sample_payload=sample_payload,
+        )
+        output_item_id = self._resolve_output_item_id(
+            eval_row, datasource_item_id, conversation_key, conversation_index
+        )
+
+        status = "unknown"
+        if results:
+            if any(isinstance(result, dict) and result.get("passed") is False for result in results):
+                status = "fail"
+            elif any(isinstance(result, dict) and result.get("passed") is True for result in results):
+                status = "pass"
+
+        output_item: Dict[str, Any] = {
+            "object": "eval.run.output_item",
+            "id": output_item_id,
+            "created_time": created_time,
+            "status": status,
+            "results": results,
+        }
+
+        if datasource_item_id is not None:
+            output_item["datasource_item_id"] = datasource_item_id
+        if datasource_item:
+            output_item["datasource_item"] = datasource_item
+
+        return output_item
+
+    def _build_sample_payload(
+        self,
+        conversation: Dict[str, Any],
+        raw_conversation: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Create the sample payload for an output item."""
+
+        conversation_payload = raw_conversation.get("conversation")
+        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
+            messages = conversation_payload.get("messages", [])
+        else:
+            messages = conversation.get("conversation", [])
+
+        normalized_messages: List[Dict[str, Any]] = []
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            normalized = self._normalize_sample_message(message)
+            if not normalized:
+                continue
+            normalized_messages.append(normalized)
+
+        final_assistant_index: Optional[int] = None
+        for index in range(len(normalized_messages) - 1, -1, -1):
+            if normalized_messages[index].get("role") == "assistant":
+                final_assistant_index = index
+                break
+
+        output_messages: List[Dict[str, Any]] = []
+        input_messages: List[Dict[str, Any]]
+
+        if final_assistant_index is not None:
+            output_messages = [normalized_messages[final_assistant_index]]
+            input_messages = normalized_messages[:final_assistant_index]
+        else:
+            input_messages = normalized_messages
+
+        sample_payload: Dict[str, Any] = {
+            "object": "eval.run.output_item.sample",
+            "input": input_messages,
+            "output": output_messages,
+        }
+
+        metadata = {
+            key: value
+            for key, value in raw_conversation.items()
+            if key not in {"conversation"} and not self._is_missing(value)
+        }
+        if metadata:
+            sample_payload["metadata"] = metadata
+
+        return sample_payload
+
+    @staticmethod
+    def _normalize_sample_message(message: Dict[str, Any]) -> Dict[str, Any]:
+        """Return a shallow copy of a message limited to supported fields."""
+
+        allowed_keys = {"role", "content", "name", "tool_calls"}
+        normalized: Dict[str, Any] = {}
+
+        for key, value in message.items():
+            if key not in allowed_keys or value is None:
+                continue
+            if key == "tool_calls" and isinstance(value, list):
+                normalized["tool_calls"] = [call for call in value if isinstance(call, dict)]
+            else:
+                normalized[key] = value
+
+        return normalized
+
+    def _build_datasource_item(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        datasource_item_id: Optional[str],
+    ) -> Optional[Dict[str, Any]]:
+        """Create datasource item metadata if available."""
+        return None
+
+    def _build_output_result(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        sample_payload: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Assemble result entries for each risk category."""
+
+        results: List[Dict[str, Any]] = []
+        risk_assessment = conversation.get("risk_assessment") or {}
+        attack_threshold = conversation.get("attack_success_threshold")
+
+        for risk in self.risk_categories:
+            risk_value = risk.value
+            passed: Optional[bool] = None
+            score: Optional[Union[int, float]] = None
+            threshold: Optional[Union[int, float]] = None
+            reason: Optional[str] = None
+            severity_label: Optional[str] = None
+
+            if isinstance(eval_row, dict):
+                result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
+                if result_value is not None:
+                    passed = str(result_value).lower() == "pass"
+                score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
+                threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
+                reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")
+
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            if isinstance(assessment, dict):
+                severity_label = assessment.get("severity_label")
+                if not reason:
+                    reason = assessment.get("reason")
+
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+
+            if (
+                passed is None
+                and score is None
+                and threshold is None
+                and not reason
+                and risk_value != conversation.get("risk_category")
+            ):
+                continue
+
+            if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
+                threshold = attack_threshold
+
+            result_entry: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_evaluator" if isinstance(eval_row, dict) else "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": passed,
+                "score": score,
+                "threshold": threshold,
+                "reason": reason,
+            }
+
+            if properties:
+                result_entry["properties"] = properties
+
+            if sample_payload:
+                result_entry["sample"] = sample_payload
+
+            results.append(result_entry)
+
+        if not results:
+            risk_value = conversation.get("risk_category")
+
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            fallback_reason: Optional[str] = None
+
+            if isinstance(assessment, dict):
+                fallback_reason = assessment.get("reason")
+
+            fallback_result: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": None,
+                "score": None,
+                "threshold": attack_threshold,
+                "reason": fallback_reason,
+            }
+
+            if properties:
+                fallback_result["properties"] = properties
+
+            if sample_payload:
+                fallback_result["sample"] = sample_payload
+
+            results.append(fallback_result)
+
+        return results
+
+    def _extract_input_data(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Extract input data from evaluation rows or conversation payload."""
+
+        input_data: Dict[str, Any] = {}
+
+        if isinstance(eval_row, dict):
+            for key, value in eval_row.items():
+                if key.startswith("inputs."):
+                    path = key.split(".")[1:]
+                    self._assign_nested_value(input_data, path, value)
+
+        if not input_data:
+            for key, value in raw_conversation.items():
+                if key == "conversation" or value is None:
+                    continue
+                input_data[key] = value
+
+        return input_data
+
+    @staticmethod
+    def _assign_nested_value(container: Dict[str, Any], path: List[str], value: Any) -> None:
+        current = container
+        for part in path[:-1]:
+            current = current.setdefault(part, {})
+        current[path[-1]] = value
+
+    def _resolve_output_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        datasource_item_id: Optional[str],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> str:
+        if isinstance(eval_row, dict):
+            for candidate_key in ["id", "output_item_id", "datasource_item_id"]:
+                candidate_value = eval_row.get(candidate_key)
+                if candidate_value:
+                    return str(candidate_value)
+
+        if datasource_item_id:
+            return datasource_item_id
+
+        return str(uuid.uuid4())
+
+    def _resolve_datasource_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_index: int,
+    ) -> Optional[str]:
+        return None
+
+    def _resolve_created_time(self, eval_row: Optional[Dict[str, Any]]) -> int:
+        if isinstance(eval_row, dict):
+            for key in ["created_time", "created_at", "timestamp"]:
+                value = eval_row.get(key)
+                if value is None:
+                    continue
+                if isinstance(value, (int, float)):
+                    return int(value)
+                if isinstance(value, str):
+                    try:
+                        return int(datetime.fromisoformat(value).timestamp())
+                    except ValueError:
+                        continue
+
+        return int(datetime.utcnow().timestamp())
+
+    def _normalize_numeric(self, value: Any) -> Optional[Union[int, float]]:
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            if isinstance(value, float) and math.isnan(value):
+                return None
+            return value
+
+        try:
+            if pd.isna(value):
+                return None
+        except Exception:
+            pass
+
+        if isinstance(value, str):
+            stripped = value.strip()
+            if not stripped:
+                return None
+            try:
+                if "." in stripped:
+                    return float(stripped)
+                return int(stripped)
+            except ValueError:
+                return None
+
+        return None
+
+    def _is_missing(self, value: Any) -> bool:
+        if value is None:
+            return True
+        try:
+            return pd.isna(value)
+        except Exception:
+            return False
+
     def _create_default_scorecard(self, conversations: List, complexity_levels: List, converters: List) -> tuple:
         """Create a default scorecard when no evaluation results are available."""
         scorecard = {
```
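For orientation, this is roughly the shape of a single output item produced by the `_build_output_item` and `_build_output_result` helpers added above. The structure follows the diff; every field value below is invented for illustration:

```python
# Illustrative only: approximate shape of one red team output item (values are made up).
example_output_item = {
    "object": "eval.run.output_item",
    "id": "d4f5…",                 # eval row id, datasource item id, or a random UUID
    "created_time": 1759363200,    # resolved from the eval row when possible, else "now"
    "status": "fail",              # "fail" if any result failed, "pass" if any passed, else "unknown"
    "results": [
        {
            "object": "eval.run.output_item.result",
            "type": "azure_ai_evaluator",   # "azure_ai_red_team" when no eval row is available
            "name": "violence",             # risk category value
            "metric": "violence",
            "passed": False,
            "score": 5,
            "threshold": 3,
            "reason": "…",
            "properties": {
                "attack_technique": "base64",
                "attack_complexity": "easy",
                "attack_success": True,
            },
            "sample": {
                "object": "eval.run.output_item.sample",
                "input": [{"role": "user", "content": "…"}],
                "output": [{"role": "assistant", "content": "…"}],
            },
        }
    ],
}
```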
azure/ai/evaluation/red_team/_result_processor.py (continued):

```diff
@@ -305,14 +696,14 @@ class ResultProcessor:
                 {
                     "overall_asr": 0.0,
                     "overall_total": len(conversations),
-                    "
+                    "overall_successful_attacks": 0,
                 }
             ],
             "attack_technique_summary": [
                 {
                     "overall_asr": 0.0,
                     "overall_total": len(conversations),
-                    "
+                    "overall_successful_attacks": 0,
                 }
             ],
             "joint_risk_attack_summary": [],
@@ -320,13 +711,14 @@ class ResultProcessor:
         }
 
         # Create basic parameters
+        attack_objective_generated_from: Dict[str, Any] = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+
         redteaming_parameters = {
-            "attack_objective_generated_from": {
-                "application_scenario": self.application_scenario,
-                "risk_categories": [risk.value for risk in self.risk_categories],
-                "custom_attack_seed_prompts": "",
-                "policy_document": "",
-            },
+            "attack_objective_generated_from": attack_objective_generated_from,
             "attack_complexity": (list(set(complexity_levels)) if complexity_levels else ["baseline", "easy"]),
             "techniques_used": {},
             "attack_success_thresholds": self._format_thresholds_for_output(),
@@ -375,7 +767,7 @@ class ResultProcessor:
                 {
                     "overall_asr": overall_asr,
                     "overall_total": overall_total,
-                    "
+                    "overall_successful_attacks": int(overall_successful_attacks),
                 }
             )
 
@@ -445,7 +837,7 @@ class ResultProcessor:
                 {
                     f"{complexity}_asr": asr,
                     f"{complexity}_total": len(complexity_df),
-                    f"{complexity}
+                    f"{complexity}_successful_attacks": (
                         sum([s for s in complexity_df["attack_success"].tolist() if not is_none_or_nan(s)])
                         if "attack_success" in complexity_df.columns
                         else 0
@@ -458,7 +850,7 @@ class ResultProcessor:
                 {
                     "overall_asr": overall_asr,
                     "overall_total": overall_total,
-                    "
+                    "overall_successful_attacks": int(overall_successful_attacks),
                 }
             )
 
@@ -478,13 +870,14 @@ class ResultProcessor:
         # Create redteaming parameters
         unique_complexities = sorted([c for c in results_df["complexity_level"].unique() if c != "baseline"])
 
+        attack_objective_generated_from = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+
         redteaming_parameters = {
-            "attack_objective_generated_from": {
-                "application_scenario": self.application_scenario,
-                "risk_categories": [risk.value for risk in self.risk_categories],
-                "custom_attack_seed_prompts": "",
-                "policy_document": "",
-            },
+            "attack_objective_generated_from": attack_objective_generated_from,
             "attack_complexity": [c.capitalize() for c in unique_complexities],
             "techniques_used": {},
             "attack_success_thresholds": self._format_thresholds_for_output(),
```
azure/ai/evaluation/red_team/_utils/formatting_utils.py:

```diff
@@ -112,7 +112,7 @@ def format_scorecard(redteam_result: RedTeamResult) -> str:
     overall_asr = risk_summary.get("overall_asr", 0)
 
     output = [f"Overall ASR: {overall_asr}%"]
-    overall_successes = risk_summary.get("
+    overall_successes = risk_summary.get("overall_successful_attacks", 0)
     overall_total = risk_summary.get("overall_total", 0)
     output.append(f"Attack Success: {overall_successes}/{overall_total} attacks were successful")
 
```
azure_ai_evaluation-1.12.0.dist-info/METADATA:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: azure-ai-evaluation
-Version: 1.11.1
+Version: 1.12.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -413,12 +413,22 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
-
+
+## 1.12.0 (2025-10-02)
+
+### Features Added
+- AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
+- Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
+
+### Bugs Fixed
+- Support for multi-level nesting in OpenAI grader (experimental)
+
+## 1.11.1 (2025-09-19)
 
 ### Bugs Fixed
 - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
 
-## 1.11.0 (2025-09-
+## 1.11.0 (2025-09-03)
 
 ### Features Added
 - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
```
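As a quick illustration of the `is_reasoning_model` flag called out in the 1.12.0 notes above, here is a hedged usage sketch. The endpoint, deployment, and key are placeholders, and the exact behavior of the flag should be checked against the 1.12.0 documentation:

```python
from azure.ai.evaluation import CoherenceEvaluator

# Placeholder Azure OpenAI model configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "o3-mini",
    "api_key": "<api-key>",
}

# Per the release notes, is_reasoning_model tells the evaluator that the judge
# model is a reasoning model (o1/o3) so it can adjust its prompt parameters.
coherence = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)

result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)
```

The `credential` parameter for AOAI graders mentioned in the same notes is not shown in this diff; it is expected to take an azure-identity credential such as `DefaultAzureCredential`, but consult the grader constructors in the 1.12.0 package for the exact signature.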
|