azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation might be problematic.

Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/red_team/_result_processor.py
+++ b/azure/ai/evaluation/red_team/_result_processor.py
@@ -11,6 +11,9 @@ import hashlib
 import json
 import math
 import os
+import uuid
+from collections import defaultdict
+from datetime import datetime
 from typing import Any, Dict, List, Optional, Union, cast
 
 import pandas as pd
@@ -25,7 +28,14 @@ from ._utils.formatting_utils import list_mean_nan_safe, is_none_or_nan, get_att
 class ResultProcessor:
     """Handles processing and formatting of red team evaluation results."""
 
-    def __init__(self, logger, attack_success_thresholds, application_scenario, risk_categories, ai_studio_url=None):
+    def __init__(
+        self,
+        logger,
+        attack_success_thresholds,
+        application_scenario,
+        risk_categories,
+        ai_studio_url=None,
+    ):
         """Initialize the result processor.
 
         :param logger: Logger instance for logging
@@ -53,6 +63,7 @@ class ResultProcessor:
         risk_categories = []
         attack_successes = []
         conversations = []
+        output_item_lookup = defaultdict(list)
 
         self.logger.info(f"Building RedTeamResult from red_team_info with {len(red_team_info)} strategies")
 
@@ -155,18 +166,19 @@ class ResultProcessor:
                                     conv_data = json.loads(line)
                                     if "conversation" in conv_data and "messages" in conv_data["conversation"]:
                                         messages = conv_data["conversation"]["messages"]
+                                        conversation_key = hashlib.sha256(
+                                            json.dumps(messages, sort_keys=True).encode("utf-8")
+                                        ).hexdigest()
 
                                         # Determine attack success based on evaluation results if available
                                         attack_success = None
                                         risk_assessment = {}
 
-                                        # If we have evaluation results, try to extract attack success and risk assessment
                                         eval_row = None
+
+                                        # If we have evaluation results, try to extract attack success and risk assessment
                                         if eval_result:
-                                            key = hashlib.sha256(
-                                                json.dumps(messages, sort_keys=True).encode("utf-8")
-                                            ).hexdigest()
-                                            eval_row = eval_row_lookup.get(key)
+                                            eval_row = eval_row_lookup.get(conversation_key)
                                             if eval_row:
                                                 if f"outputs.{risk_category}.{risk_category}_result" in eval_row:
                                                     attack_success = get_attack_success(
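
Note: the new `conversation_key` hoists the SHA-256 hash out of the `if eval_result:` branch so it is computed once per conversation and can be reused later to match output items back to conversations. A standalone sketch of that keying idea (the messages are made up, and `conversation_key` below is a local helper for illustration, not an SDK function):

import hashlib
import json

def conversation_key(messages):
    # Serialize with sort_keys=True so dictionary key order cannot change the hash.
    return hashlib.sha256(json.dumps(messages, sort_keys=True).encode("utf-8")).hexdigest()

messages = [
    {"role": "user", "content": "attack prompt"},
    {"role": "assistant", "content": "refusal"},
]
# Equal message content always produces the same lookup key.
assert conversation_key(messages) == conversation_key([dict(m) for m in messages])
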
@@ -245,7 +257,18 @@ class ResultProcessor:
                                             "risk_assessment": (risk_assessment if risk_assessment else None),
                                             "attack_success_threshold": attack_threshold,
                                         }
+                                        conversation_index = len(conversations)
                                         conversations.append(conversation)
+
+                                        output_item_lookup[conversation_key].append(
+                                            self._build_output_item(
+                                                conversation=conversation,
+                                                eval_row=eval_row,
+                                                raw_conversation=conv_data,
+                                                conversation_key=conversation_key,
+                                                conversation_index=conversation_index,
+                                            )
+                                        )
                                 except json.JSONDecodeError as e:
                                     self.logger.error(f"Error parsing JSON in data file {data_file}: {e}")
                                 except Exception as e:
@@ -259,6 +282,22 @@ class ResultProcessor:
         conversations.sort(key=lambda x: x["attack_technique"])
         self.logger.info(f"Processed {len(conversations)} conversations from all data files")
 
+        ordered_output_items: List[Dict[str, Any]] = []
+        for conversation in conversations:
+            conv_key = hashlib.sha256(
+                json.dumps(conversation["conversation"], sort_keys=True).encode("utf-8")
+            ).hexdigest()
+            items_for_key = output_item_lookup.get(conv_key, [])
+            if items_for_key:
+                ordered_output_items.append(items_for_key.pop(0))
+
+        # Append any remaining items that were not matched (should be uncommon)
+        for remaining_items in output_item_lookup.values():
+            if remaining_items:
+                ordered_output_items.extend(remaining_items)
+
+        self.logger.info(f"Processed {len(ordered_output_items)} output items from all data files")
+
         # Create a DataFrame for analysis
         results_dict = {
             "converter": converters,
@@ -293,11 +332,363 @@ class ResultProcessor:
             scorecard=cast(RedTeamingScorecard, scorecard),
             parameters=cast(RedTeamingParameters, redteaming_parameters),
             attack_details=conversations,
+            output_items=ordered_output_items,
             studio_url=self.ai_studio_url or None,
         )
 
         return red_team_result
 
+    def _build_output_item(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> Dict[str, Any]:
+        """Construct an output item entry for a single conversation."""
+
+        created_time = self._resolve_created_time(eval_row)
+        datasource_item_id = self._resolve_datasource_item_id(eval_row, raw_conversation, conversation_index)
+        datasource_item = self._build_datasource_item(eval_row, raw_conversation, datasource_item_id)
+        sample_payload = self._build_sample_payload(conversation, raw_conversation)
+        results = self._build_output_result(
+            conversation,
+            eval_row,
+            sample_payload=sample_payload,
+        )
+        output_item_id = self._resolve_output_item_id(
+            eval_row, datasource_item_id, conversation_key, conversation_index
+        )
+
+        status = "unknown"
+        if results:
+            if any(isinstance(result, dict) and result.get("passed") is False for result in results):
+                status = "fail"
+            elif any(isinstance(result, dict) and result.get("passed") is True for result in results):
+                status = "pass"
+
+        output_item: Dict[str, Any] = {
+            "object": "eval.run.output_item",
+            "id": output_item_id,
+            "created_time": created_time,
+            "status": status,
+            "results": results,
+        }
+
+        if datasource_item_id is not None:
+            output_item["datasource_item_id"] = datasource_item_id
+        if datasource_item:
+            output_item["datasource_item"] = datasource_item
+
+        return output_item
+
+    def _build_sample_payload(
+        self,
+        conversation: Dict[str, Any],
+        raw_conversation: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Create the sample payload for an output item."""
+
+        conversation_payload = raw_conversation.get("conversation")
+        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
+            messages = conversation_payload.get("messages", [])
+        else:
+            messages = conversation.get("conversation", [])
+
+        normalized_messages: List[Dict[str, Any]] = []
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            normalized = self._normalize_sample_message(message)
+            if not normalized:
+                continue
+            normalized_messages.append(normalized)
+
+        final_assistant_index: Optional[int] = None
+        for index in range(len(normalized_messages) - 1, -1, -1):
+            if normalized_messages[index].get("role") == "assistant":
+                final_assistant_index = index
+                break
+
+        output_messages: List[Dict[str, Any]] = []
+        input_messages: List[Dict[str, Any]]
+
+        if final_assistant_index is not None:
+            output_messages = [normalized_messages[final_assistant_index]]
+            input_messages = normalized_messages[:final_assistant_index]
+        else:
+            input_messages = normalized_messages
+
+        sample_payload: Dict[str, Any] = {
+            "object": "eval.run.output_item.sample",
+            "input": input_messages,
+            "output": output_messages,
+        }
+
+        metadata = {
+            key: value
+            for key, value in raw_conversation.items()
+            if key not in {"conversation"} and not self._is_missing(value)
+        }
+        if metadata:
+            sample_payload["metadata"] = metadata
+
+        return sample_payload
+
+    @staticmethod
+    def _normalize_sample_message(message: Dict[str, Any]) -> Dict[str, Any]:
+        """Return a shallow copy of a message limited to supported fields."""
+
+        allowed_keys = {"role", "content", "name", "tool_calls"}
+        normalized: Dict[str, Any] = {}
+
+        for key, value in message.items():
+            if key not in allowed_keys or value is None:
+                continue
+            if key == "tool_calls" and isinstance(value, list):
+                normalized["tool_calls"] = [call for call in value if isinstance(call, dict)]
+            else:
+                normalized[key] = value
+
+        return normalized
+
+    def _build_datasource_item(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        datasource_item_id: Optional[str],
+    ) -> Optional[Dict[str, Any]]:
+        """Create datasource item metadata if available."""
+        return None
+
+    def _build_output_result(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        sample_payload: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Assemble result entries for each risk category."""
+
+        results: List[Dict[str, Any]] = []
+        risk_assessment = conversation.get("risk_assessment") or {}
+        attack_threshold = conversation.get("attack_success_threshold")
+
+        for risk in self.risk_categories:
+            risk_value = risk.value
+            passed: Optional[bool] = None
+            score: Optional[Union[int, float]] = None
+            threshold: Optional[Union[int, float]] = None
+            reason: Optional[str] = None
+            severity_label: Optional[str] = None
+
+            if isinstance(eval_row, dict):
+                result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
+                if result_value is not None:
+                    passed = str(result_value).lower() == "pass"
+                score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
+                threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
+                reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")
+
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            if isinstance(assessment, dict):
+                severity_label = assessment.get("severity_label")
+                if not reason:
+                    reason = assessment.get("reason")
+
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+
+            if (
+                passed is None
+                and score is None
+                and threshold is None
+                and not reason
+                and risk_value != conversation.get("risk_category")
+            ):
+                continue
+
+            if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
+                threshold = attack_threshold
+
+            result_entry: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_evaluator" if isinstance(eval_row, dict) else "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": passed,
+                "score": score,
+                "threshold": threshold,
+                "reason": reason,
+            }
+
+            if properties:
+                result_entry["properties"] = properties
+
+            if sample_payload:
+                result_entry["sample"] = sample_payload
+
+            results.append(result_entry)
+
+        if not results:
+            risk_value = conversation.get("risk_category")
+
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            fallback_reason: Optional[str] = None
+
+            if isinstance(assessment, dict):
+                fallback_reason = assessment.get("reason")
+
+            fallback_result: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": None,
+                "score": None,
+                "threshold": attack_threshold,
+                "reason": fallback_reason,
+            }
+
+            if properties:
+                fallback_result["properties"] = properties
+
+            if sample_payload:
+                fallback_result["sample"] = sample_payload
+
+            results.append(fallback_result)
+
+        return results
+
+    def _extract_input_data(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Extract input data from evaluation rows or conversation payload."""
+
+        input_data: Dict[str, Any] = {}
+
+        if isinstance(eval_row, dict):
+            for key, value in eval_row.items():
+                if key.startswith("inputs."):
+                    path = key.split(".")[1:]
+                    self._assign_nested_value(input_data, path, value)
+
+        if not input_data:
+            for key, value in raw_conversation.items():
+                if key == "conversation" or value is None:
+                    continue
+                input_data[key] = value
+
+        return input_data
+
+    @staticmethod
+    def _assign_nested_value(container: Dict[str, Any], path: List[str], value: Any) -> None:
+        current = container
+        for part in path[:-1]:
+            current = current.setdefault(part, {})
+        current[path[-1]] = value
+
+    def _resolve_output_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        datasource_item_id: Optional[str],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> str:
+        if isinstance(eval_row, dict):
+            for candidate_key in ["id", "output_item_id", "datasource_item_id"]:
+                candidate_value = eval_row.get(candidate_key)
+                if candidate_value:
+                    return str(candidate_value)
+
+        if datasource_item_id:
+            return datasource_item_id
+
+        return str(uuid.uuid4())
+
+    def _resolve_datasource_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_index: int,
+    ) -> Optional[str]:
+        return None
+
+    def _resolve_created_time(self, eval_row: Optional[Dict[str, Any]]) -> int:
+        if isinstance(eval_row, dict):
+            for key in ["created_time", "created_at", "timestamp"]:
+                value = eval_row.get(key)
+                if value is None:
+                    continue
+                if isinstance(value, (int, float)):
+                    return int(value)
+                if isinstance(value, str):
+                    try:
+                        return int(datetime.fromisoformat(value).timestamp())
+                    except ValueError:
+                        continue
+
+        return int(datetime.utcnow().timestamp())
+
+    def _normalize_numeric(self, value: Any) -> Optional[Union[int, float]]:
+        if value is None:
+            return None
+
+        if isinstance(value, (int, float)):
+            if isinstance(value, float) and math.isnan(value):
+                return None
+            return value
+
+        try:
+            if pd.isna(value):
+                return None
+        except Exception:
+            pass
+
+        if isinstance(value, str):
+            stripped = value.strip()
+            if not stripped:
+                return None
+            try:
+                if "." in stripped:
+                    return float(stripped)
+                return int(stripped)
+            except ValueError:
+                return None
+
+        return None
+
+    def _is_missing(self, value: Any) -> bool:
+        if value is None:
+            return True
+        try:
+            return pd.isna(value)
+        except Exception:
+            return False
+
     def _create_default_scorecard(self, conversations: List, complexity_levels: List, converters: List) -> tuple:
         """Create a default scorecard when no evaluation results are available."""
         scorecard = {
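
Note: taken together, the helpers above emit one output item per conversation. The sketch below shows roughly the shape they produce; every value is an illustrative placeholder, and the status rule mirrors `_build_output_item` ("fail" if any result has `passed` False, "pass" if any has `passed` True, otherwise "unknown"):

# Illustrative shape only; values are placeholders, not real evaluation output.
example_output_item = {
    "object": "eval.run.output_item",
    "id": "eval-row-id-or-generated-uuid4",
    "created_time": 1727900000,  # epoch seconds from the eval row when present, else "now"
    "status": "fail",
    "results": [
        {
            "object": "eval.run.output_item.result",
            "type": "azure_ai_evaluator",  # "azure_ai_red_team" when no eval row exists
            "name": "violence",
            "metric": "violence",
            "passed": False,
            "score": 5,
            "threshold": 3,
            "reason": "placeholder reason",
            "properties": {"attack_technique": "base64", "attack_complexity": "easy"},
            "sample": {"object": "eval.run.output_item.sample", "input": [], "output": []},
        }
    ],
}
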
@@ -305,14 +696,14 @@ class ResultProcessor:
                 {
                     "overall_asr": 0.0,
                     "overall_total": len(conversations),
-                    "overall_attack_successes": 0,
+                    "overall_successful_attacks": 0,
                 }
             ],
             "attack_technique_summary": [
                 {
                     "overall_asr": 0.0,
                     "overall_total": len(conversations),
-                    "overall_attack_successes": 0,
+                    "overall_successful_attacks": 0,
                 }
             ],
             "joint_risk_attack_summary": [],
@@ -320,13 +711,14 @@ class ResultProcessor:
         }
 
         # Create basic parameters
+        attack_objective_generated_from: Dict[str, Any] = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+
         redteaming_parameters = {
-            "attack_objective_generated_from": {
-                "application_scenario": self.application_scenario,
-                "risk_categories": [risk.value for risk in self.risk_categories],
-                "custom_attack_seed_prompts": "",
-                "policy_document": "",
-            },
+            "attack_objective_generated_from": attack_objective_generated_from,
             "attack_complexity": (list(set(complexity_levels)) if complexity_levels else ["baseline", "easy"]),
             "techniques_used": {},
             "attack_success_thresholds": self._format_thresholds_for_output(),
@@ -375,7 +767,7 @@ class ResultProcessor:
             {
                 "overall_asr": overall_asr,
                 "overall_total": overall_total,
-                "overall_attack_successes": int(overall_successful_attacks),
+                "overall_successful_attacks": int(overall_successful_attacks),
             }
         )
 
@@ -445,7 +837,7 @@ class ResultProcessor:
                 {
                     f"{complexity}_asr": asr,
                     f"{complexity}_total": len(complexity_df),
-                    f"{complexity}_attack_successes": (
+                    f"{complexity}_successful_attacks": (
                         sum([s for s in complexity_df["attack_success"].tolist() if not is_none_or_nan(s)])
                         if "attack_success" in complexity_df.columns
                         else 0
@@ -458,7 +850,7 @@ class ResultProcessor:
             {
                 "overall_asr": overall_asr,
                 "overall_total": overall_total,
-                "overall_attack_successes": int(overall_successful_attacks),
+                "overall_successful_attacks": int(overall_successful_attacks),
            }
        )
 
@@ -478,13 +870,14 @@ class ResultProcessor:
         # Create redteaming parameters
         unique_complexities = sorted([c for c in results_df["complexity_level"].unique() if c != "baseline"])
 
+        attack_objective_generated_from = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+
         redteaming_parameters = {
-            "attack_objective_generated_from": {
-                "application_scenario": self.application_scenario,
-                "risk_categories": [risk.value for risk in self.risk_categories],
-                "custom_attack_seed_prompts": "",
-                "policy_document": "",
-            },
+            "attack_objective_generated_from": attack_objective_generated_from,
             "attack_complexity": [c.capitalize() for c in unique_complexities],
             "techniques_used": {},
             "attack_success_thresholds": self._format_thresholds_for_output(),
--- a/azure/ai/evaluation/red_team/_utils/formatting_utils.py
+++ b/azure/ai/evaluation/red_team/_utils/formatting_utils.py
@@ -112,7 +112,7 @@ def format_scorecard(redteam_result: RedTeamResult) -> str:
     overall_asr = risk_summary.get("overall_asr", 0)
 
     output = [f"Overall ASR: {overall_asr}%"]
-    overall_successes = risk_summary.get("overall_attack_successes", 0)
+    overall_successes = risk_summary.get("overall_successful_attacks", 0)
     overall_total = risk_summary.get("overall_total", 0)
    output.append(f"Attack Success: {overall_successes}/{overall_total} attacks were successful")
 
--- a/azure_ai_evaluation-1.11.1.dist-info/METADATA
+++ b/azure_ai_evaluation-1.12.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: azure-ai-evaluation
-Version: 1.11.1
+Version: 1.12.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -413,12 +413,22 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
-## 1.11.1 (2025-09-17)
+
+## 1.12.0 (2025-10-02)
+
+### Features Added
+- AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
+- Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
+
+### Bugs Fixed
+- Support for multi-level nesting in OpenAI grader (experimental)
+
+## 1.11.1 (2025-09-19)
 
 ### Bugs Fixed
 - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
 
-## 1.11.0 (2025-09-02)
+## 1.11.0 (2025-09-03)
 
 ### Features Added
 - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
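
Note: a minimal usage sketch for the `is_reasoning_model` flag called out in the 1.12.0 notes above; the endpoint, deployment, and key values are placeholders, and the query/response pair is only illustrative:

from azure.ai.evaluation import CoherenceEvaluator

# Placeholder model configuration; substitute real Azure OpenAI values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-o3-deployment>",
    "api_key": "<your-api-key>",
}

# is_reasoning_model=True tells the evaluator the deployment is an o1/o3-style reasoning model.
coherence = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)
score = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(score)
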