azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -11,21 +11,40 @@ import hashlib
11
11
  import json
12
12
  import math
13
13
  import os
14
+ import uuid
15
+ from collections import defaultdict
16
+ from datetime import datetime
14
17
  from typing import Any, Dict, List, Optional, Union, cast
15
18
 
16
19
  import pandas as pd
17
20
 
18
21
  # Local imports
19
- from ._red_team_result import RedTeamResult, RedTeamingScorecard, RedTeamingParameters, ScanResult
22
+ from ._red_team_result import (
23
+ RedTeamResult,
24
+ RedTeamingScorecard,
25
+ RedTeamingParameters,
26
+ ScanResult,
27
+ RedTeamRun,
28
+ OutputItemsList,
29
+ )
20
30
  from ._attack_objective_generator import RiskCategory
21
31
  from ._utils.constants import ATTACK_STRATEGY_COMPLEXITY_MAP
32
+ from .._common.utils import get_default_threshold_for_evaluator, get_harm_severity_level
22
33
  from ._utils.formatting_utils import list_mean_nan_safe, is_none_or_nan, get_attack_success
23
34
 
24
35
 
25
36
  class ResultProcessor:
26
37
  """Handles processing and formatting of red team evaluation results."""
27
38
 
28
- def __init__(self, logger, attack_success_thresholds, application_scenario, risk_categories, ai_studio_url=None):
39
+ def __init__(
40
+ self,
41
+ logger,
42
+ attack_success_thresholds,
43
+ application_scenario,
44
+ risk_categories,
45
+ ai_studio_url=None,
46
+ mlflow_integration=None,
47
+ ):
29
48
  """Initialize the result processor.
30
49
 
31
50
  :param logger: Logger instance for logging
@@ -33,18 +52,38 @@ class ResultProcessor:
33
52
  :param application_scenario: Application scenario description
34
53
  :param risk_categories: List of risk categories being evaluated
35
54
  :param ai_studio_url: URL to the AI Studio run
55
+ :param mlflow_integration: MLflow integration instance for reusing payload building logic
36
56
  """
37
57
  self.logger = logger
38
58
  self.attack_success_thresholds = attack_success_thresholds
39
59
  self.application_scenario = application_scenario
40
60
  self.risk_categories = risk_categories
41
61
  self.ai_studio_url = ai_studio_url
42
-
43
- def to_red_team_result(self, red_team_info: Dict) -> RedTeamResult:
62
+ self.mlflow_integration = mlflow_integration
63
+
64
+ def to_red_team_result(
65
+ self,
66
+ red_team_info: Dict,
67
+ eval_run: Optional[Any] = None,
68
+ scan_name: Optional[str] = None,
69
+ run_id_override: Optional[str] = None,
70
+ eval_id_override: Optional[str] = None,
71
+ created_at_override: Optional[int] = None,
72
+ ) -> RedTeamResult:
44
73
  """Convert tracking data from red_team_info to the RedTeamResult format.
45
74
 
46
75
  :param red_team_info: Dictionary containing red team tracking information
47
76
  :type red_team_info: Dict
77
+ :param eval_run: The MLFlow run object (optional)
78
+ :type eval_run: Optional[Any]
79
+ :param scan_name: Name of the scan (optional)
80
+ :type scan_name: Optional[str]
81
+ :param run_id_override: Override for run ID (optional)
82
+ :type run_id_override: Optional[str]
83
+ :param eval_id_override: Override for eval ID (optional)
84
+ :type eval_id_override: Optional[str]
85
+ :param created_at_override: Override for created timestamp (optional)
86
+ :type created_at_override: Optional[int]
48
87
  :return: Structured red team agent results
49
88
  :rtype: RedTeamResult
50
89
  """
@@ -53,6 +92,7 @@ class ResultProcessor:
53
92
  risk_categories = []
54
93
  attack_successes = []
55
94
  conversations = []
95
+ output_item_lookup = defaultdict(list)
56
96
 
57
97
  self.logger.info(f"Building RedTeamResult from red_team_info with {len(red_team_info)} strategies")
58
98
 
@@ -155,20 +195,49 @@ class ResultProcessor:
155
195
  conv_data = json.loads(line)
156
196
  if "conversation" in conv_data and "messages" in conv_data["conversation"]:
157
197
  messages = conv_data["conversation"]["messages"]
198
+ conversation_key = hashlib.sha256(
199
+ json.dumps(messages, sort_keys=True).encode("utf-8")
200
+ ).hexdigest()
158
201
 
159
202
  # Determine attack success based on evaluation results if available
160
203
  attack_success = None
161
204
  risk_assessment = {}
162
205
 
163
- # If we have evaluation results, try to extract attack success and risk assessment
164
206
  eval_row = None
207
+
208
+ # If we have evaluation results, try to extract attack success and risk assessment
165
209
  if eval_result:
166
- key = hashlib.sha256(
167
- json.dumps(messages, sort_keys=True).encode("utf-8")
168
- ).hexdigest()
169
- eval_row = eval_row_lookup.get(key)
210
+ eval_row = eval_row_lookup.get(conversation_key)
170
211
  if eval_row:
171
- if f"outputs.{risk_category}.{risk_category}_result" in eval_row:
212
+ # Check for attack success from _eval_run_output_item first
213
+ eval_run_output_item = eval_row.get("_eval_run_output_item")
214
+ if eval_run_output_item and isinstance(eval_run_output_item, dict):
215
+ results_list = eval_run_output_item.get("results", [])
216
+ for result_item in results_list:
217
+ if isinstance(result_item, dict):
218
+ result_name = result_item.get("name") or result_item.get(
219
+ "metric"
220
+ )
221
+ if result_name == risk_category:
222
+ properties = result_item.get("properties", {})
223
+ if isinstance(properties, dict):
224
+ score_properties = properties.get(
225
+ "scoreProperties", {}
226
+ )
227
+ if isinstance(score_properties, dict):
228
+ safe_value = score_properties.get("safe")
229
+ # safe="false" means attack was successful
230
+ if safe_value is not None:
231
+ attack_success = (
232
+ str(safe_value).lower() == "false"
233
+ )
234
+ break
235
+
236
+ # Fall back to legacy _result field if scoreProperties not available
237
+ if (
238
+ attack_success is None
239
+ and f"outputs.{risk_category}.{risk_category}_result" in eval_row
240
+ ):
172
241
  attack_success = get_attack_success(
173
242
  eval_row[f"outputs.{risk_category}.{risk_category}_result"]
174
243
  )
@@ -234,6 +303,9 @@ class ResultProcessor:
234
303
  attack_threshold = 3
235
304
 
236
305
  # Add conversation object
306
+ # Clean messages for old format - remove context and filter tool_calls
307
+ cleaned_messages = self._clean_attack_detail_messages(messages)
308
+
237
309
  conversation = {
238
310
  "attack_success": attack_success,
239
311
  "attack_technique": strategy_name.replace("Converter", "").replace(
@@ -241,11 +313,27 @@ class ResultProcessor:
241
313
  ),
242
314
  "attack_complexity": complexity_level,
243
315
  "risk_category": risk_category,
244
- "conversation": messages,
316
+ "conversation": cleaned_messages,
245
317
  "risk_assessment": (risk_assessment if risk_assessment else None),
246
318
  "attack_success_threshold": attack_threshold,
247
319
  }
320
+
321
+ # Add risk_sub_type if present in the data
322
+ if "risk_sub_type" in conv_data:
323
+ conversation["risk_sub_type"] = conv_data["risk_sub_type"]
324
+
325
+ conversation_index = len(conversations)
248
326
  conversations.append(conversation)
327
+
328
+ output_item_lookup[conversation_key].append(
329
+ self._build_output_item(
330
+ conversation=conversation,
331
+ eval_row=eval_row,
332
+ raw_conversation=conv_data,
333
+ conversation_key=conversation_key,
334
+ conversation_index=conversation_index,
335
+ )
336
+ )
249
337
  except json.JSONDecodeError as e:
250
338
  self.logger.error(f"Error parsing JSON in data file {data_file}: {e}")
251
339
  except Exception as e:
@@ -259,6 +347,22 @@ class ResultProcessor:
259
347
  conversations.sort(key=lambda x: x["attack_technique"])
260
348
  self.logger.info(f"Processed {len(conversations)} conversations from all data files")
261
349
 
350
+ ordered_output_items: List[Dict[str, Any]] = []
351
+ for conversation in conversations:
352
+ conv_key = hashlib.sha256(
353
+ json.dumps(conversation["conversation"], sort_keys=True).encode("utf-8")
354
+ ).hexdigest()
355
+ items_for_key = output_item_lookup.get(conv_key, [])
356
+ if items_for_key:
357
+ ordered_output_items.append(items_for_key.pop(0))
358
+
359
+ # Append any remaining items that were not matched (should be uncommon)
360
+ for remaining_items in output_item_lookup.values():
361
+ if remaining_items:
362
+ ordered_output_items.extend(remaining_items)
363
+
364
+ self.logger.info(f"Processed {len(ordered_output_items)} output items from all data files")
365
+
262
366
  # Create a DataFrame for analysis
263
367
  results_dict = {
264
368
  "converter": converters,
@@ -289,15 +393,491 @@ class ResultProcessor:
289
393
  self.logger.info("RedTeamResult creation completed")
290
394
 
291
395
  # Create the final result
292
- red_team_result = ScanResult(
396
+ scan_result = ScanResult(
293
397
  scorecard=cast(RedTeamingScorecard, scorecard),
294
398
  parameters=cast(RedTeamingParameters, redteaming_parameters),
295
399
  attack_details=conversations,
296
400
  studio_url=self.ai_studio_url or None,
297
401
  )
298
402
 
403
+ # Build AOAI-compatible summary and row results
404
+ # Create a temporary RedTeamResult to pass to _build_results_payload
405
+ red_team_result = RedTeamResult(
406
+ scan_result=scan_result,
407
+ attack_details=conversations,
408
+ )
409
+
410
+ results_payload = self._build_results_payload(
411
+ redteam_result=red_team_result,
412
+ output_items=ordered_output_items,
413
+ eval_run=eval_run,
414
+ red_team_info=red_team_info,
415
+ scan_name=scan_name,
416
+ run_id_override=run_id_override,
417
+ eval_id_override=eval_id_override,
418
+ created_at_override=created_at_override,
419
+ )
420
+
421
+ # Populate AOAI-compatible fields
422
+ red_team_result.scan_result["AOAI_Compatible_Summary"] = results_payload
423
+
424
+ # Store all output items (entire objects, not just nested results)
425
+ red_team_result.scan_result["AOAI_Compatible_Row_Results"] = (
426
+ ordered_output_items if ordered_output_items else None
427
+ )
428
+
299
429
  return red_team_result
300
430
 
431
+ def _build_output_item(
432
+ self,
433
+ conversation: Dict[str, Any],
434
+ eval_row: Optional[Dict[str, Any]],
435
+ raw_conversation: Dict[str, Any],
436
+ conversation_key: str,
437
+ conversation_index: int,
438
+ ) -> Dict[str, Any]:
439
+ """Construct an output item entry for a single conversation."""
440
+
441
+ created_time = self._resolve_created_time(eval_row)
442
+ datasource_item_id = self._resolve_datasource_item_id(eval_row, raw_conversation, conversation_index)
443
+ datasource_item = self._build_datasource_item(eval_row, raw_conversation, datasource_item_id)
444
+ sample_payload = self._build_sample_payload(conversation, raw_conversation, eval_row)
445
+ results = self._build_output_result(
446
+ conversation,
447
+ eval_row,
448
+ sample_payload=None,
449
+ )
450
+ output_item_id = self._resolve_output_item_id(
451
+ eval_row, datasource_item_id, conversation_key, conversation_index
452
+ )
453
+
454
+ # Status reflects whether attack/evaluation completed successfully (no errors)
455
+ # "pass" = completed without errors
456
+ # "fail" = had errors or incomplete
457
+ # This is independent of attack_success (whether agent was compromised)
458
+ status = "pass" # Default to pass (completed) unless we detect errors
459
+
460
+ # Check if there were any errors in the conversation or evaluation
461
+ if conversation.get("error") or conversation.get("exception"):
462
+ status = "fail"
463
+ elif not results:
464
+ status = "fail" # No results means something went wrong
465
+
466
+ output_item: Dict[str, Any] = {
467
+ "object": "eval.run.output_item",
468
+ "id": output_item_id,
469
+ "created_time": created_time,
470
+ "status": status,
471
+ "sample": sample_payload,
472
+ "results": results,
473
+ }
474
+
475
+ if datasource_item_id is not None:
476
+ output_item["datasource_item_id"] = datasource_item_id
477
+ if datasource_item:
478
+ output_item["datasource_item"] = datasource_item
479
+
480
+ return output_item
481
+
482
    def _build_sample_payload(
        self,
        conversation: Dict[str, Any],
        raw_conversation: Dict[str, Any],
        eval_row: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Create the sample payload for an output item.

        :param conversation: Processed conversation entry; its "conversation" key holds
            the cleaned message list used as a fallback source.
        :param raw_conversation: Original JSON record; when it carries a
            {"conversation": {"messages": [...]}} payload, those raw messages are preferred.
        :param eval_row: Evaluation row for this conversation (accepted for interface
            symmetry; not read in this method).
        :return: An "eval.run.output_item.sample" dict with "input"/"output" message
            lists plus optional "usage" and "metadata" keys.
        """

        # Prefer raw messages (they retain extra fields such as token_usage)
        # over the cleaned copy stored on the processed conversation.
        conversation_payload = raw_conversation.get("conversation")
        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
            messages = conversation_payload.get("messages", [])
        else:
            messages = conversation.get("conversation", [])

        # Reduce each message to the fields the sample schema supports.
        normalized_messages: List[Dict[str, Any]] = []
        for message in messages:
            if not isinstance(message, dict):
                continue
            normalized = self._normalize_sample_message(message)
            if not normalized:
                continue
            normalized_messages.append(normalized)

        # The final assistant turn becomes "output"; everything before it is "input".
        final_assistant_index: Optional[int] = None
        for index in range(len(normalized_messages) - 1, -1, -1):
            if normalized_messages[index].get("role") == "assistant":
                final_assistant_index = index
                break

        output_messages: List[Dict[str, Any]] = []
        input_messages: List[Dict[str, Any]]

        if final_assistant_index is not None:
            output_messages = [normalized_messages[final_assistant_index]]
            input_messages = normalized_messages[:final_assistant_index]
        else:
            # No assistant reply at all: everything is input, output stays empty.
            input_messages = normalized_messages

        sample_payload: Dict[str, Any] = {
            "object": "eval.run.output_item.sample",
            "input": input_messages,
            "output": output_messages,
        }

        # Extract token usage from raw_conversation messages (from callback target only)
        conversation_payload = raw_conversation.get("conversation")
        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
            messages_list = conversation_payload.get("messages", [])
            # Look for token_usage in the assistant (last) message
            for message in reversed(messages_list):
                if isinstance(message, dict) and message.get("role") == "assistant":
                    token_usage_from_msg = message.get("token_usage")
                    if token_usage_from_msg and isinstance(token_usage_from_msg, dict):
                        # Use callback format directly (already has prompt_tokens, completion_tokens, total_tokens, model_name, etc.)
                        usage_dict = {}
                        if "model_name" in token_usage_from_msg:
                            usage_dict["model_name"] = token_usage_from_msg["model_name"]
                        if "prompt_tokens" in token_usage_from_msg:
                            usage_dict["prompt_tokens"] = token_usage_from_msg["prompt_tokens"]
                        if "completion_tokens" in token_usage_from_msg:
                            usage_dict["completion_tokens"] = token_usage_from_msg["completion_tokens"]
                        if "total_tokens" in token_usage_from_msg:
                            usage_dict["total_tokens"] = token_usage_from_msg["total_tokens"]
                        if "cached_tokens" in token_usage_from_msg:
                            usage_dict["cached_tokens"] = token_usage_from_msg["cached_tokens"]
                        if usage_dict:
                            sample_payload["usage"] = usage_dict
                    # NOTE(review): break is placed at this level in the reconstructed
                    # layout, i.e. only the last assistant message is inspected even
                    # when it has no usable token_usage — confirm against the wheel.
                    break

        # Exclude risk_sub_type and _eval_run_output_item from metadata
        metadata = {
            key: value
            for key, value in raw_conversation.items()
            if key not in {"conversation", "risk_sub_type", "_eval_run_output_item"} and not self._is_missing(value)
        }
        if metadata:
            sample_payload["metadata"] = metadata

        return sample_payload
561
+
562
+ @staticmethod
563
+ def _normalize_sample_message(message: Dict[str, Any]) -> Dict[str, Any]:
564
+ """Return a shallow copy of a message limited to supported fields."""
565
+
566
+ allowed_keys = {"role", "content", "name"}
567
+ normalized: Dict[str, Any] = {}
568
+
569
+ for key, value in message.items():
570
+ if key not in allowed_keys or value is None:
571
+ continue
572
+ normalized[key] = value
573
+
574
+ # Only include tool_calls for assistant role messages
575
+ if message.get("role") == "assistant" and "tool_calls" in message:
576
+ tool_calls_value = message["tool_calls"]
577
+ if isinstance(tool_calls_value, list):
578
+ normalized["tool_calls"] = [call for call in tool_calls_value if isinstance(call, dict)]
579
+
580
+ return normalized
581
+
582
+ @staticmethod
583
+ def _clean_attack_detail_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
584
+ """Clean messages for attack_details in old format files.
585
+
586
+ Removes context field and only includes tool_calls in assistant messages.
587
+ """
588
+ cleaned_messages = []
589
+ for message in messages:
590
+ if not isinstance(message, dict):
591
+ continue
592
+
593
+ cleaned = {}
594
+ # Always include role and content
595
+ if "role" in message:
596
+ cleaned["role"] = message["role"]
597
+ if "content" in message:
598
+ cleaned["content"] = message["content"]
599
+ if "name" in message:
600
+ cleaned["name"] = message["name"]
601
+
602
+ # Only include tool_calls for assistant messages
603
+ if message.get("role") == "assistant" and "tool_calls" in message:
604
+ tool_calls_value = message["tool_calls"]
605
+ if isinstance(tool_calls_value, list):
606
+ cleaned["tool_calls"] = [call for call in tool_calls_value if isinstance(call, dict)]
607
+
608
+ # Do NOT include context field in attack_details
609
+
610
+ if cleaned:
611
+ cleaned_messages.append(cleaned)
612
+
613
+ return cleaned_messages
614
+
615
+ def _build_datasource_item(
616
+ self,
617
+ eval_row: Optional[Dict[str, Any]],
618
+ raw_conversation: Dict[str, Any],
619
+ datasource_item_id: Optional[str],
620
+ ) -> Optional[Dict[str, Any]]:
621
+ """Create datasource item metadata if available."""
622
+ return None
623
+
624
+ def _build_output_result(
625
+ self,
626
+ conversation: Dict[str, Any],
627
+ eval_row: Optional[Dict[str, Any]],
628
+ sample_payload: Optional[Dict[str, Any]] = None,
629
+ ) -> Dict[str, Any]:
630
+ """Assemble result entries for each risk category."""
631
+
632
+ results: List[Dict[str, Any]] = []
633
+ risk_assessment = conversation.get("risk_assessment") or {}
634
+ attack_threshold = conversation.get("attack_success_threshold")
635
+
636
+ for risk in self.risk_categories:
637
+ risk_value = risk.value
638
+ passed: Optional[bool] = None
639
+ score: Optional[Union[int, float]] = None
640
+ threshold: Optional[Union[int, float]] = None
641
+ reason: Optional[str] = None
642
+ severity_label: Optional[str] = None
643
+
644
+ if isinstance(eval_row, dict):
645
+ # For red teaming: if we have eval results, the evaluation completed successfully
646
+ # So passed should be True (evaluation ran), regardless of the result value
647
+ result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
648
+ if result_value is not None:
649
+ passed = True # Evaluation completed
650
+ score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
651
+ threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
652
+ reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")
653
+
654
+ assessment = risk_assessment.get(risk_value) if risk_assessment else None
655
+ if isinstance(assessment, dict):
656
+ severity_label = assessment.get("severity_label")
657
+ if not reason:
658
+ reason = assessment.get("reason")
659
+
660
+ properties: Dict[str, Any] = {}
661
+ attack_technique = conversation.get("attack_technique")
662
+ attack_complexity = conversation.get("attack_complexity")
663
+ attack_success = conversation.get("attack_success")
664
+ risk_sub_type = conversation.get("risk_sub_type")
665
+
666
+ if attack_technique is not None:
667
+ properties["attack_technique"] = attack_technique
668
+ if attack_complexity is not None:
669
+ properties["attack_complexity"] = attack_complexity
670
+ if attack_success is not None:
671
+ properties["attack_success"] = attack_success
672
+ if risk_sub_type is not None:
673
+ properties["risk_sub_type"] = risk_sub_type
674
+
675
+ # Extract additional properties from _eval_run_output_item if available
676
+ if isinstance(eval_row, dict):
677
+ eval_run_output_item = eval_row.get("_eval_run_output_item")
678
+ if eval_run_output_item and isinstance(eval_run_output_item, dict):
679
+ results_list = eval_run_output_item.get("results", [])
680
+ for result_item in results_list:
681
+ if isinstance(result_item, dict):
682
+ result_name = result_item.get("name") or result_item.get("metric")
683
+ if result_name == risk_value:
684
+ item_properties = result_item.get("properties", {})
685
+ if isinstance(item_properties, dict):
686
+ # Don't include scoreProperties or outcome in output - only use internally
687
+ # But DO include metrics for token usage aggregation
688
+ metrics = item_properties.get("metrics")
689
+ if metrics:
690
+ properties["metrics"] = metrics
691
+ # Include reasoning if present and not already set as reason
692
+ reasoning = item_properties.get("reasoning")
693
+ if reasoning and not reason:
694
+ reason = reasoning
695
+ break
696
+
697
+ if (
698
+ passed is None
699
+ and score is None
700
+ and threshold is None
701
+ and not reason
702
+ and risk_value != conversation.get("risk_category")
703
+ ):
704
+ continue
705
+
706
+ if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
707
+ threshold = attack_threshold
708
+
709
+ # passed reflects completion status (whether evaluation ran successfully)
710
+ # attack_success (in properties) reflects whether agent was compromised
711
+ # These are independent concepts
712
+
713
+ result_entry: Dict[str, Any] = {
714
+ "object": "eval.run.output_item.result",
715
+ "type": "azure_ai_evaluator" if isinstance(eval_row, dict) else "azure_ai_red_team",
716
+ "name": risk_value,
717
+ "metric": risk_value,
718
+ "passed": passed,
719
+ "label": "pass" if passed is True else ("fail" if passed is False else None),
720
+ "score": score,
721
+ "threshold": threshold,
722
+ "reason": reason,
723
+ }
724
+
725
+ if properties:
726
+ result_entry["properties"] = properties
727
+
728
+ results.append(result_entry)
729
+
730
+ if not results:
731
+ risk_value = conversation.get("risk_category")
732
+
733
+ properties: Dict[str, Any] = {}
734
+ attack_technique = conversation.get("attack_technique")
735
+ attack_complexity = conversation.get("attack_complexity")
736
+ attack_success = conversation.get("attack_success")
737
+ risk_sub_type = conversation.get("risk_sub_type")
738
+
739
+ if attack_technique is not None:
740
+ properties["attack_technique"] = attack_technique
741
+ if attack_complexity is not None:
742
+ properties["attack_complexity"] = attack_complexity
743
+ if attack_success is not None:
744
+ properties["attack_success"] = attack_success
745
+ if risk_sub_type is not None:
746
+ properties["risk_sub_type"] = risk_sub_type
747
+
748
+ assessment = risk_assessment.get(risk_value) if risk_assessment else None
749
+ fallback_reason: Optional[str] = None
750
+
751
+ if isinstance(assessment, dict):
752
+ fallback_reason = assessment.get("reason")
753
+
754
+ fallback_result: Dict[str, Any] = {
755
+ "object": "eval.run.output_item.result",
756
+ "type": "azure_ai_red_team",
757
+ "name": risk_value,
758
+ "metric": risk_value,
759
+ "passed": None,
760
+ "label": None,
761
+ "score": None,
762
+ "threshold": attack_threshold,
763
+ "reason": fallback_reason,
764
+ }
765
+
766
+ if properties:
767
+ fallback_result["properties"] = properties
768
+
769
+ results.append(fallback_result)
770
+
771
+ return results
772
+
773
+ def _extract_input_data(
774
+ self,
775
+ eval_row: Optional[Dict[str, Any]],
776
+ raw_conversation: Dict[str, Any],
777
+ ) -> Dict[str, Any]:
778
+ """Extract input data from evaluation rows or conversation payload."""
779
+
780
+ input_data: Dict[str, Any] = {}
781
+
782
+ if isinstance(eval_row, dict):
783
+ for key, value in eval_row.items():
784
+ if key.startswith("inputs."):
785
+ path = key.split(".")[1:]
786
+ self._assign_nested_value(input_data, path, value)
787
+
788
+ if not input_data:
789
+ for key, value in raw_conversation.items():
790
+ if key == "conversation" or value is None:
791
+ continue
792
+ input_data[key] = value
793
+
794
+ return input_data
795
+
796
+ @staticmethod
797
+ def _assign_nested_value(container: Dict[str, Any], path: List[str], value: Any) -> None:
798
+ current = container
799
+ for part in path[:-1]:
800
+ current = current.setdefault(part, {})
801
+ current[path[-1]] = value
802
+
803
+ def _resolve_output_item_id(
804
+ self,
805
+ eval_row: Optional[Dict[str, Any]],
806
+ datasource_item_id: Optional[str],
807
+ conversation_key: str,
808
+ conversation_index: int,
809
+ ) -> str:
810
+ if isinstance(eval_row, dict):
811
+ for candidate_key in ["id", "output_item_id", "datasource_item_id"]:
812
+ candidate_value = eval_row.get(candidate_key)
813
+ if candidate_value:
814
+ return str(candidate_value)
815
+
816
+ if datasource_item_id:
817
+ return datasource_item_id
818
+
819
+ return str(uuid.uuid4())
820
+
821
+ def _resolve_datasource_item_id(
822
+ self,
823
+ eval_row: Optional[Dict[str, Any]],
824
+ raw_conversation: Dict[str, Any],
825
+ conversation_index: int,
826
+ ) -> Optional[str]:
827
+ return None
828
+
829
+ def _resolve_created_time(self, eval_row: Optional[Dict[str, Any]]) -> int:
830
+ if isinstance(eval_row, dict):
831
+ for key in ["created_time", "created_at", "timestamp"]:
832
+ value = eval_row.get(key)
833
+ if value is None:
834
+ continue
835
+ if isinstance(value, (int, float)):
836
+ return int(value)
837
+ if isinstance(value, str):
838
+ try:
839
+ return int(datetime.fromisoformat(value).timestamp())
840
+ except ValueError:
841
+ continue
842
+
843
+ return int(datetime.utcnow().timestamp())
844
+
845
+ def _normalize_numeric(self, value: Any) -> Optional[Union[int, float]]:
846
+ if value is None:
847
+ return None
848
+
849
+ if isinstance(value, (int, float)):
850
+ if isinstance(value, float) and math.isnan(value):
851
+ return None
852
+ return value
853
+
854
+ try:
855
+ if pd.isna(value):
856
+ return None
857
+ except Exception:
858
+ pass
859
+
860
+ if isinstance(value, str):
861
+ stripped = value.strip()
862
+ if not stripped:
863
+ return None
864
+ try:
865
+ if "." in stripped:
866
+ return float(stripped)
867
+ return int(stripped)
868
+ except ValueError:
869
+ return None
870
+
871
+ return None
872
+
873
+ def _is_missing(self, value: Any) -> bool:
874
+ if value is None:
875
+ return True
876
+ try:
877
+ return pd.isna(value)
878
+ except Exception:
879
+ return False
880
+
301
881
  def _create_default_scorecard(self, conversations: List, complexity_levels: List, converters: List) -> tuple:
302
882
  """Create a default scorecard when no evaluation results are available."""
303
883
  scorecard = {
@@ -305,14 +885,14 @@ class ResultProcessor:
305
885
  {
306
886
  "overall_asr": 0.0,
307
887
  "overall_total": len(conversations),
308
- "overall_attack_successes": 0,
888
+ "overall_successful_attacks": 0,
309
889
  }
310
890
  ],
311
891
  "attack_technique_summary": [
312
892
  {
313
893
  "overall_asr": 0.0,
314
894
  "overall_total": len(conversations),
315
- "overall_attack_successes": 0,
895
+ "overall_successful_attacks": 0,
316
896
  }
317
897
  ],
318
898
  "joint_risk_attack_summary": [],
@@ -320,13 +900,14 @@ class ResultProcessor:
320
900
  }
321
901
 
322
902
  # Create basic parameters
903
+ attack_objective_generated_from: Dict[str, Any] = {
904
+ "application_scenario": self.application_scenario,
905
+ "risk_categories": [risk.value for risk in self.risk_categories],
906
+ "policy_document": "",
907
+ }
908
+
323
909
  redteaming_parameters = {
324
- "attack_objective_generated_from": {
325
- "application_scenario": self.application_scenario,
326
- "risk_categories": [risk.value for risk in self.risk_categories],
327
- "custom_attack_seed_prompts": "",
328
- "policy_document": "",
329
- },
910
+ "attack_objective_generated_from": attack_objective_generated_from,
330
911
  "attack_complexity": (list(set(complexity_levels)) if complexity_levels else ["baseline", "easy"]),
331
912
  "techniques_used": {},
332
913
  "attack_success_thresholds": self._format_thresholds_for_output(),
@@ -375,7 +956,7 @@ class ResultProcessor:
375
956
  {
376
957
  "overall_asr": overall_asr,
377
958
  "overall_total": overall_total,
378
- "overall_attack_successes": int(overall_successful_attacks),
959
+ "overall_successful_attacks": int(overall_successful_attacks),
379
960
  }
380
961
  )
381
962
 
@@ -445,7 +1026,7 @@ class ResultProcessor:
445
1026
  {
446
1027
  f"{complexity}_asr": asr,
447
1028
  f"{complexity}_total": len(complexity_df),
448
- f"{complexity}_attack_successes": (
1029
+ f"{complexity}_successful_attacks": (
449
1030
  sum([s for s in complexity_df["attack_success"].tolist() if not is_none_or_nan(s)])
450
1031
  if "attack_success" in complexity_df.columns
451
1032
  else 0
@@ -458,7 +1039,7 @@ class ResultProcessor:
458
1039
  {
459
1040
  "overall_asr": overall_asr,
460
1041
  "overall_total": overall_total,
461
- "overall_attack_successes": int(overall_successful_attacks),
1042
+ "overall_successful_attacks": int(overall_successful_attacks),
462
1043
  }
463
1044
  )
464
1045
 
@@ -478,13 +1059,14 @@ class ResultProcessor:
478
1059
  # Create redteaming parameters
479
1060
  unique_complexities = sorted([c for c in results_df["complexity_level"].unique() if c != "baseline"])
480
1061
 
1062
+ attack_objective_generated_from = {
1063
+ "application_scenario": self.application_scenario,
1064
+ "risk_categories": [risk.value for risk in self.risk_categories],
1065
+ "policy_document": "",
1066
+ }
1067
+
481
1068
  redteaming_parameters = {
482
- "attack_objective_generated_from": {
483
- "application_scenario": self.application_scenario,
484
- "risk_categories": [risk.value for risk in self.risk_categories],
485
- "custom_attack_seed_prompts": "",
486
- "policy_document": "",
487
- },
1069
+ "attack_objective_generated_from": attack_objective_generated_from,
488
1070
  "attack_complexity": [c.capitalize() for c in unique_complexities],
489
1071
  "techniques_used": {},
490
1072
  "attack_success_thresholds": self._format_thresholds_for_output(),
@@ -604,7 +1186,347 @@ class ResultProcessor:
604
1186
  risk_cat_value = risk_category.value
605
1187
  # Only add default if not already present as a custom threshold
606
1188
  if risk_cat_value not in formatted_thresholds:
607
- # Default threshold is 3 for content safety evaluations
608
- formatted_thresholds[risk_cat_value] = 3
1189
+ # Get pattern-specific default threshold for this evaluator
1190
+ formatted_thresholds[risk_cat_value] = get_default_threshold_for_evaluator(risk_cat_value)
609
1191
 
610
1192
  return formatted_thresholds
1193
+
1194
+ @staticmethod
1195
+ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
1196
+ """Aggregate run-level pass/fail counts from individual output items.
1197
+
1198
+ Counts reflect completion status:
1199
+ - passed: attack/evaluation completed successfully
1200
+ - failed: attack/evaluation had errors
1201
+ - errored: unknown/no results
1202
+ """
1203
+
1204
+ total = len(output_items)
1205
+ passed = failed = errored = 0
1206
+
1207
+ for item in output_items:
1208
+ # Use item-level status which reflects completion
1209
+ item_status_str = item.get("status")
1210
+
1211
+ if item_status_str == "pass":
1212
+ passed += 1
1213
+ elif item_status_str == "fail":
1214
+ failed += 1
1215
+ else:
1216
+ errored += 1
1217
+
1218
+ return {
1219
+ "total": total,
1220
+ "passed": passed,
1221
+ "failed": failed,
1222
+ "errored": errored,
1223
+ }
1224
+
1225
+ @staticmethod
1226
+ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1227
+ """Compute aggregated token usage across all output items.
1228
+
1229
+ :param output_items: List of output items
1230
+ :return: List containing model usage statistics grouped by model_name
1231
+ """
1232
+ # Track usage by model name
1233
+ model_usage: Dict[str, Dict[str, int]] = {}
1234
+ for item in output_items:
1235
+ if not isinstance(item, dict):
1236
+ continue
1237
+
1238
+ # Aggregate usage from sample (callback target)
1239
+ sample = item.get("sample")
1240
+ if isinstance(sample, dict):
1241
+ usage = sample.get("usage")
1242
+ if isinstance(usage, dict):
1243
+ # Get model name from usage if present, otherwise use default
1244
+ model_name = usage.get("model_name", "azure_ai_system_model")
1245
+
1246
+ if model_name not in model_usage:
1247
+ model_usage[model_name] = {
1248
+ "invocation_count": 0,
1249
+ "prompt_tokens": 0,
1250
+ "completion_tokens": 0,
1251
+ "total_tokens": 0,
1252
+ "cached_tokens": 0,
1253
+ }
1254
+
1255
+ model_usage[model_name]["invocation_count"] += 1
1256
+ # Convert to int to handle cases where values come as strings
1257
+ model_usage[model_name]["prompt_tokens"] += int(usage.get("prompt_tokens", 0) or 0)
1258
+ model_usage[model_name]["completion_tokens"] += int(usage.get("completion_tokens", 0) or 0)
1259
+ model_usage[model_name]["total_tokens"] += int(usage.get("total_tokens", 0) or 0)
1260
+ model_usage[model_name]["cached_tokens"] += int(usage.get("cached_tokens", 0) or 0)
1261
+
1262
+ # Always aggregate evaluator usage from results (separate from target usage)
1263
+ results_list = item.get("results", [])
1264
+ for result in results_list:
1265
+ if not isinstance(result, dict):
1266
+ continue
1267
+ properties = result.get("properties", {})
1268
+ if not isinstance(properties, dict):
1269
+ continue
1270
+ metrics = properties.get("metrics", {})
1271
+ if isinstance(metrics, dict) and metrics:
1272
+ # Evaluator usage uses azure_ai_system_model
1273
+ model_name = "azure_ai_system_model"
1274
+
1275
+ if model_name not in model_usage:
1276
+ model_usage[model_name] = {
1277
+ "invocation_count": 0,
1278
+ "prompt_tokens": 0,
1279
+ "completion_tokens": 0,
1280
+ "total_tokens": 0,
1281
+ "cached_tokens": 0,
1282
+ }
1283
+
1284
+ prompt_tokens = metrics.get("promptTokens", 0)
1285
+ completion_tokens = metrics.get("completionTokens", 0)
1286
+
1287
+ if prompt_tokens or completion_tokens:
1288
+ model_usage[model_name]["invocation_count"] += 1
1289
+ # Convert to int to handle cases where values come as strings
1290
+ model_usage[model_name]["prompt_tokens"] += int(prompt_tokens or 0)
1291
+ model_usage[model_name]["completion_tokens"] += int(completion_tokens or 0)
1292
+ model_usage[model_name]["total_tokens"] += int(prompt_tokens or 0) + int(completion_tokens or 0)
1293
+
1294
+ if not model_usage:
1295
+ return []
1296
+
1297
+ # Convert to list format with model_name as a field
1298
+ return [
1299
+ {
1300
+ "model_name": model_name,
1301
+ **stats,
1302
+ }
1303
+ for model_name, stats in sorted(model_usage.items())
1304
+ ]
1305
+
1306
+ @staticmethod
1307
+ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1308
+ """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy)."""
1309
+
1310
+ # Track by risk category (testing_criteria)
1311
+ criteria: Dict[str, Dict[str, int]] = {}
1312
+ # Track by attack strategy
1313
+ strategy_criteria: Dict[str, Dict[str, int]] = {}
1314
+
1315
+ for item in output_items:
1316
+ for result in item.get("results", []):
1317
+ if not isinstance(result, dict):
1318
+ continue
1319
+ name = result.get("name")
1320
+ if not name:
1321
+ continue
1322
+ passed_value = result.get("passed")
1323
+ if passed_value is None:
1324
+ continue
1325
+
1326
+ # Track by risk category
1327
+ bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
1328
+ if passed_value:
1329
+ bucket["passed"] += 1
1330
+ else:
1331
+ bucket["failed"] += 1
1332
+
1333
+ # Track by attack strategy from properties
1334
+ properties = result.get("properties", {})
1335
+ if isinstance(properties, dict):
1336
+ attack_technique = properties.get("attack_technique")
1337
+ if attack_technique:
1338
+ strategy_bucket = strategy_criteria.setdefault(
1339
+ str(attack_technique), {"passed": 0, "failed": 0}
1340
+ )
1341
+ if passed_value:
1342
+ strategy_bucket["passed"] += 1
1343
+ else:
1344
+ strategy_bucket["failed"] += 1
1345
+
1346
+ # Build results list with risk categories
1347
+ results = [
1348
+ {
1349
+ "testing_criteria": criteria_name,
1350
+ "passed": counts["passed"],
1351
+ "failed": counts["failed"],
1352
+ }
1353
+ for criteria_name, counts in sorted(criteria.items())
1354
+ ]
1355
+
1356
+ # Add attack strategy summaries
1357
+ for strategy_name, counts in sorted(strategy_criteria.items()):
1358
+ results.append(
1359
+ {
1360
+ "testing_criteria": strategy_name,
1361
+ "attack_strategy": strategy_name,
1362
+ "passed": counts["passed"],
1363
+ "failed": counts["failed"],
1364
+ }
1365
+ )
1366
+
1367
+ return results
1368
+
1369
+ @staticmethod
1370
+ def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
1371
+ """Build the data_source portion of the run payload for red-team scans."""
1372
+
1373
+ attack_strategies: List[str] = []
1374
+ if isinstance(red_team_info, dict):
1375
+ attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
1376
+
1377
+ item_generation_params: Dict[str, Any] = {"type": "red_team"}
1378
+ if attack_strategies:
1379
+ item_generation_params["attack_strategies"] = attack_strategies
1380
+
1381
+ # Attempt to infer turns from parameters if available
1382
+ num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
1383
+ if isinstance(num_turns, int) and num_turns > 0:
1384
+ item_generation_params["num_turns"] = num_turns
1385
+
1386
+ data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
1387
+ if item_generation_params:
1388
+ data_source["item_generation_params"] = item_generation_params
1389
+
1390
+ return data_source
1391
+
1392
+ def _determine_run_status(
1393
+ self,
1394
+ scan_result: Dict[str, Any],
1395
+ red_team_info: Optional[Dict],
1396
+ output_items: List[Dict[str, Any]],
1397
+ ) -> str:
1398
+ """Determine the run-level status based on red team info status values."""
1399
+
1400
+ # Check if any tasks are still incomplete/failed
1401
+ if isinstance(red_team_info, dict):
1402
+ for risk_data in red_team_info.values():
1403
+ if not isinstance(risk_data, dict):
1404
+ continue
1405
+ for details in risk_data.values():
1406
+ if not isinstance(details, dict):
1407
+ continue
1408
+ status = details.get("status", "").lower()
1409
+ if status in ("incomplete", "failed", "timeout"):
1410
+ return "failed"
1411
+ elif status in ("running", "pending"):
1412
+ return "in_progress"
1413
+
1414
+ return "completed"
1415
+
1416
+ def _build_results_payload(
1417
+ self,
1418
+ redteam_result: RedTeamResult,
1419
+ output_items: List[Dict[str, Any]],
1420
+ eval_run: Optional[Any] = None,
1421
+ red_team_info: Optional[Dict] = None,
1422
+ scan_name: Optional[str] = None,
1423
+ run_id_override: Optional[str] = None,
1424
+ eval_id_override: Optional[str] = None,
1425
+ created_at_override: Optional[int] = None,
1426
+ ) -> RedTeamRun:
1427
+ """Assemble the new structure for results.json with eval.run format.
1428
+
1429
+ :param redteam_result: The red team result containing scan data
1430
+ :param output_items: List of output items containing results for each conversation
1431
+ :param eval_run: The MLFlow run object (optional)
1432
+ :param red_team_info: Red team tracking information (optional)
1433
+ :param scan_name: Name of the scan (optional)
1434
+ :param run_id_override: Override for run ID (optional)
1435
+ :param eval_id_override: Override for eval ID (optional)
1436
+ :param created_at_override: Override for created timestamp (optional)
1437
+ :return: RedTeamRun payload
1438
+ """
1439
+
1440
+ scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
1441
+ scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
1442
+ parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
1443
+
1444
+ run_id = run_id_override
1445
+ eval_id = eval_id_override
1446
+ run_name: Optional[str] = None
1447
+ created_at = created_at_override
1448
+
1449
+ if eval_run is not None:
1450
+ run_info = getattr(eval_run, "info", None)
1451
+
1452
+ if run_id is None:
1453
+ candidate_run_id = (
1454
+ getattr(run_info, "run_id", None)
1455
+ or getattr(eval_run, "run_id", None)
1456
+ or getattr(eval_run, "id", None)
1457
+ )
1458
+ if candidate_run_id is not None:
1459
+ run_id = str(candidate_run_id)
1460
+
1461
+ if eval_id is None:
1462
+ candidate_eval_id = (
1463
+ getattr(run_info, "experiment_id", None)
1464
+ or getattr(eval_run, "experiment_id", None)
1465
+ or getattr(eval_run, "eval_id", None)
1466
+ )
1467
+ if candidate_eval_id is not None:
1468
+ eval_id = str(candidate_eval_id)
1469
+
1470
+ if run_name is None:
1471
+ candidate_run_name = (
1472
+ getattr(run_info, "run_name", None)
1473
+ or getattr(eval_run, "run_name", None)
1474
+ or getattr(eval_run, "display_name", None)
1475
+ or getattr(eval_run, "name", None)
1476
+ )
1477
+ if candidate_run_name is not None:
1478
+ run_name = str(candidate_run_name)
1479
+
1480
+ if created_at is None:
1481
+ raw_created = (
1482
+ getattr(run_info, "created_time", None)
1483
+ or getattr(eval_run, "created_at", None)
1484
+ or getattr(eval_run, "created_time", None)
1485
+ )
1486
+ if isinstance(raw_created, datetime):
1487
+ created_at = int(raw_created.timestamp())
1488
+ elif isinstance(raw_created, (int, float)):
1489
+ created_at = int(raw_created)
1490
+ elif isinstance(raw_created, str):
1491
+ try:
1492
+ created_at = int(float(raw_created))
1493
+ except ValueError:
1494
+ created_at = None
1495
+
1496
+ if run_id is None:
1497
+ run_id = str(uuid.uuid4())
1498
+ if eval_id is None:
1499
+ eval_id = str(uuid.uuid4())
1500
+ if created_at is None:
1501
+ created_at = int(datetime.now().timestamp())
1502
+ if run_name is None:
1503
+ run_name = scan_name or f"redteam-run-{run_id[:8]}"
1504
+
1505
+ result_count = self._compute_result_count(output_items)
1506
+ per_testing_results = self._compute_per_testing_criteria(output_items)
1507
+ data_source = self._build_data_source_section(parameters, red_team_info)
1508
+ status = self._determine_run_status(scan_result, red_team_info, output_items)
1509
+ per_model_usage = self._compute_per_model_usage(output_items)
1510
+
1511
+ list_wrapper: OutputItemsList = {
1512
+ "object": "list",
1513
+ "data": output_items,
1514
+ }
1515
+
1516
+ run_payload: RedTeamRun = {
1517
+ "object": "eval.run",
1518
+ "id": run_id,
1519
+ "eval_id": eval_id,
1520
+ "created_at": created_at,
1521
+ "status": status,
1522
+ "name": run_name,
1523
+ "report_url": scan_result.get("studio_url") or self.ai_studio_url,
1524
+ "data_source": data_source,
1525
+ "metadata": {},
1526
+ "result_counts": result_count,
1527
+ "per_model_usage": per_model_usage,
1528
+ "per_testing_criteria_results": per_testing_results,
1529
+ "output_items": list_wrapper,
1530
+ }
1531
+
1532
+ return run_payload