azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py

@@ -17,7 +17,7 @@ from ._run_storage import AbstractRunStorage, NoOpRunStorage
 from .._common._logging import incremental_print, print_red_error
 from ._config import BatchEngineConfig
 from ._exceptions import BatchEngineValidationError
-from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult
+from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult, BatchStatus


 class RunSubmitter:
@@ -141,6 +141,19 @@ class RunSubmitter:
             run._status = RunStatus.FAILED
             # when run failed in executor, store the exception in result and dump to file
             logger.warning(f"Run {run.name} failed when executing in executor with exception {e}.")
+            if not batch_result:
+                batch_result = BatchResult(
+                    status=BatchStatus.Failed,
+                    total_lines=0,
+                    failed_lines=0,
+                    start_time=datetime.now(timezone.utc),
+                    end_time=datetime.now(timezone.utc),
+                    tokens=None,
+                    details=[],
+                )
+                batch_result.error = e
+            elif not batch_result.error:
+                batch_result.error = e
             # for user error, swallow stack trace and return failed run since user don't need the stack trace
             if not isinstance(e, BatchEngineValidationError):
                 # for other errors, raise it to user to help debug root cause.

azure/ai/evaluation/_legacy/prompty/_prompty.py

@@ -266,7 +266,7 @@ class AsyncPrompty:
     async def __call__(  # pylint: disable=docstring-keyword-should-match-keyword-only
         self,
         **kwargs: Any,
-    ) -> Union[OpenAIChatResponseType, AsyncGenerator[str, None], str, Mapping[str, Any]]:
+    ) -> dict:
        """Calling prompty as a function in async, the inputs should be provided with key word arguments.
        Returns the output of the prompty.

@@ -330,6 +330,7 @@ class AsyncPrompty:
            is_first_choice=self._data.get("model", {}).get("response", "first").lower() == "first",
            response_format=params.get("response_format", {}),
            outputs=self._outputs,
+           inputs=inputs,
        )

    def render(  # pylint: disable=docstring-keyword-should-match-keyword-only

azure/ai/evaluation/_legacy/prompty/_utils.py

@@ -32,7 +32,7 @@ from typing import (

 from jinja2 import Template
 from openai import AsyncStream
-from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionUserMessageParam
 from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError

 from azure.ai.evaluation._constants import DefaultOpenEncoding
@@ -466,7 +466,8 @@ async def format_llm_response(
     is_first_choice: bool,
     response_format: Optional[Mapping[str, Any]] = None,
     outputs: Optional[Mapping[str, Any]] = None,
-) -> Union[OpenAIChatResponseType, AsyncGenerator[str, None], str, Mapping[str, Any]]:
+    inputs: Optional[Mapping[str, Any]] = None,
+) -> dict:
     """
     Format LLM response

@@ -525,15 +526,54 @@ async def format_llm_response(
                 return
             yield chunk.choices[0].delta.content

+    to_ret = {
+        "llm_output": None,
+        "input_token_count": 0,
+        "output_token_count": 0,
+        "total_token_count": 0,
+        "finish_reason": "",
+        "model_id": "",
+        "sample_input": "",
+        "sample_output": "",
+    }
+
     if not is_first_choice:
-        return response
+        to_ret["llm_output"] = response
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0

     is_json_format = isinstance(response_format, dict) and response_format.get("type") == "json_object"
     if isinstance(response, AsyncStream):
         if not is_json_format:
-            return format_stream(llm_response=response)
+            to_ret["llm_output"] = format_stream(llm_response=response)
+            return to_ret
         content = "".join([item async for item in format_stream(llm_response=response)])
-        return format_choice(content)
+        to_ret["llm_output"] = format_choice(content)
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0
+    else:
+        input_token_count = response.usage.prompt_tokens if response.usage and response.usage.prompt_tokens else 0
+        output_token_count = (
+            response.usage.completion_tokens if response.usage and response.usage.completion_tokens else 0
+        )
+        total_token_count = response.usage.total_tokens if response.usage and response.usage.total_tokens else 0
+        finish_reason = (
+            response.choices[0].finish_reason if response.choices and response.choices[0].finish_reason else ""
+        )
+        model_id = response.model if response.model else ""
+        sample_output_list = (
+            [{"role": response.choices[0].message.role, "content": response.choices[0].message.content}]
+            if (response.choices and response.choices[0].message.content and response.choices[0].message.role)
+            else []
+        )
+        sample_output = json.dumps(sample_output_list)
+        input_str = f"{json.dumps(inputs)}" if inputs else ""
+        if inputs and len(inputs) > 0:
+            sample_input_json = []
+            msg = ChatCompletionUserMessageParam(
+                role="user",
+                content=input_str,
+            )
+            sample_input_json.append(msg)
+            sample_input = json.dumps(sample_input_json)

     # When calling function/tool, function_call/tool_call response will be returned as a field in message,
     # so we need return message directly. Otherwise, we only return content.
@@ -543,7 +583,15 @@ async def format_llm_response(
     else:
         response_content = getattr(response.choices[0].message, "content", "")
     result = format_choice(response_content)
-    return result
+    to_ret["llm_output"] = result
+    to_ret["input_token_count"] = input_token_count
+    to_ret["output_token_count"] = output_token_count
+    to_ret["total_token_count"] = total_token_count
+    to_ret["finish_reason"] = finish_reason
+    to_ret["model_id"] = model_id
+    to_ret["sample_input"] = sample_input
+    to_ret["sample_output"] = sample_output
+    return to_ret


 def openai_error_retryable(
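
Taken together with the _prompty.py hunk above, this changes the return contract of a prompty call from a bare string/mapping to a dict that carries the output plus usage metadata. A minimal caller-side sketch, assuming an AsyncPrompty instance named prompty and an illustrative query input (both hypothetical, run inside an async function):

    # The prompty call now yields a dict; the generated text sits under "llm_output"
    # and token accounting sits beside it (zeros on the streaming code paths).
    result = await prompty(query="What is the capital of France?")

    llm_text = result["llm_output"]
    usage = (
        result["input_token_count"],
        result["output_token_count"],
        result["total_token_count"],
    )
    print(llm_text, usage, result["model_id"])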

azure/ai/evaluation/_model_configurations.py

@@ -5,6 +5,8 @@
 from typing import Any, Dict, List, Literal, TypedDict, Union

 from typing_extensions import NotRequired
+from ._evaluator_definition import EvaluatorDefinition
+from typing import Dict, List, Optional, Any


 class AzureOpenAIModelConfiguration(TypedDict):
@@ -105,6 +107,19 @@ class EvaluatorConfig(TypedDict, total=False):
     column_mapping: Dict[str, str]
     """Dictionary mapping evaluator input name to column in data"""

+    _evaluator_name: NotRequired[Optional[str]]
+    """Name of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_version: NotRequired[Optional[str]]
+    """Version of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_id: NotRequired[Optional[str]]
+    """ID of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_definition: NotRequired[Optional[EvaluatorDefinition]]
+    """Definition of the evaluator to be used from the evaluator asset"""
+    """Currently only used for Otel emission, will be changed to used in AOAI eval results converter as well in the future."""
+

 class Message(TypedDict):
     role: str
@@ -121,3 +136,14 @@ class EvaluationResult(TypedDict):
     metrics: Dict
     studio_url: NotRequired[str]
     rows: List[Dict]
+    _evaluation_results_list: List[Dict]
+    _evaluation_summary: Dict
+
+
+class AppInsightsConfig(TypedDict):
+    connection_string: str
+    project_id: NotRequired[str]
+    run_type: NotRequired[str]
+    schedule_type: NotRequired[str]
+    run_id: NotRequired[str]
+    extra_attributes: NotRequired[Dict[str, Any]]
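
For reference, the new AppInsightsConfig TypedDict can be populated as below; only connection_string is required and every value shown is a placeholder. Note the TypedDict is defined in the private _model_configurations module, so the import path used here is an assumption:

    from azure.ai.evaluation._model_configurations import AppInsightsConfig

    app_insights_config: AppInsightsConfig = {
        "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",
        "project_id": "my-project",                # optional
        "run_type": "scheduled",                   # optional, placeholder value
        "extra_attributes": {"team": "red-team"},  # optional free-form attributes
    }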

azure/ai/evaluation/_version.py

@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version

-VERSION = "1.11.2"
+VERSION = "1.13.0"

azure/ai/evaluation/red_team/_attack_objective_generator.py

@@ -21,7 +21,9 @@ class RiskCategory(str, Enum):
     ProtectedMaterial = "protected_material"
     CodeVulnerability = "code_vulnerability"
     UngroundedAttributes = "ungrounded_attributes"
-    IndirectAttack = "indirect_attack"
+    SensitiveDataLeakage = "sensitive_data_leakage"  # Agent targets only
+    TaskAdherence = "task_adherence"  # Agent targets only
+    ProhibitedActions = "prohibited_actions"  # Agent targets only


 @experimental

azure/ai/evaluation/red_team/_attack_strategy.py

@@ -37,6 +37,7 @@ class AttackStrategy(Enum):
     Jailbreak = "jailbreak"
     MultiTurn = "multi_turn"
     Crescendo = "crescendo"
+    IndirectJailbreak = "indirect_jailbreak"

     @classmethod
     def Compose(cls, items: List["AttackStrategy"]) -> List["AttackStrategy"]:
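
A hedged sketch of how the new enum members might be exercised, assuming the existing RedTeam scan API and a callback-style agent target; azure_ai_project, credential, and my_agent_callback are placeholders, and the agent-only categories are only meaningful for agent targets:

    from azure.ai.evaluation.red_team import RedTeam, RiskCategory, AttackStrategy

    red_team = RedTeam(
        azure_ai_project=azure_ai_project,       # placeholder project reference
        credential=credential,                   # placeholder credential
        risk_categories=[
            RiskCategory.TaskAdherence,          # new: agent targets only
            RiskCategory.SensitiveDataLeakage,   # new: agent targets only
        ],
    )

    result = await red_team.scan(
        target=my_agent_callback,                               # hypothetical agent callback
        attack_strategies=[AttackStrategy.IndirectJailbreak],   # new strategy in this release
    )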

azure/ai/evaluation/red_team/_callback_chat_target.py

@@ -19,7 +19,6 @@ class _CallbackChatTarget(PromptChatTarget):
         *,
         callback: Callable[[List[Dict], bool, Optional[str], Optional[Dict[str, Any]]], Dict],
         stream: bool = False,
-        prompt_to_context: Optional[Dict[str, str]] = None,
     ) -> None:
         """
         Initializes an instance of the _CallbackChatTarget class.
@@ -33,12 +32,10 @@ class _CallbackChatTarget(PromptChatTarget):
         Args:
             callback (Callable): The callback function that sends a prompt to a target and receives a response.
             stream (bool, optional): Indicates whether the target supports streaming. Defaults to False.
-            prompt_to_context (Optional[Dict[str, str]], optional): Mapping from prompt content to context. Defaults to None.
         """
         PromptChatTarget.__init__(self)
         self._callback = callback
         self._stream = stream
-        self._prompt_to_context = prompt_to_context or {}

     async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> PromptRequestResponse:

@@ -51,22 +48,56 @@ class _CallbackChatTarget(PromptChatTarget):

         logger.info(f"Sending the following prompt to the prompt target: {request}")

-        # Get context for the current prompt if available
-        current_prompt_content = request.converted_value
-        context_data = self._prompt_to_context.get(current_prompt_content, "")
-        context_dict = {"context": context_data} if context_data else {}
-
-        # If context is not available via prompt_to_context, it can be fetched from the memory
-        if not context_dict:
-            memory_label_context = request.labels.get("context", None)
-            context_dict = {"context": memory_label_context} if memory_label_context else {}
+        # Extract context from request labels if available
+        # The context is stored in memory labels when the prompt is sent by orchestrator
+        context_dict = {}
+        if hasattr(request, "labels") and request.labels and "context" in request.labels:
+            context_data = request.labels["context"]
+            if context_data and isinstance(context_data, dict):
+                # context_data is always a dict with 'contexts' list
+                # Each context can have its own context_type and tool_name
+                contexts = context_data.get("contexts", [])
+
+                # Build context_dict to pass to callback
+                context_dict = {"contexts": contexts}
+
+                # Check if any context has agent-specific fields for logging
+                has_agent_fields = any(
+                    isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                )
+
+                if has_agent_fields:
+                    tool_names = [
+                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
+                    ]
+                    logger.debug(f"Extracted agent context: {len(contexts)} context source(s), tool_names={tool_names}")
+                else:
+                    logger.debug(f"Extracted model context: {len(contexts)} context source(s)")

         # response_context contains "messages", "stream", "session_state, "context"
-        response_context = await self._callback(messages=messages, stream=self._stream, session_state=None, context=context_dict)  # type: ignore
+        response = await self._callback(messages=messages, stream=self._stream, session_state=None, context=context_dict)  # type: ignore
+
+        # Store token_usage before processing tuple
+        token_usage = None
+        if isinstance(response, dict) and "token_usage" in response:
+            token_usage = response["token_usage"]
+
+        if type(response) == tuple:
+            response, tool_output = response
+            request.labels["tool_calls"] = tool_output
+            # Check for token_usage in the response dict from tuple
+            if isinstance(response, dict) and "token_usage" in response:
+                token_usage = response["token_usage"]
+
+        response_text = response["messages"][-1]["content"]

-        response_text = response_context["messages"][-1]["content"]
         response_entry = construct_response_from_request(request=request, response_text_pieces=[response_text])

+        # Add token_usage to the response entry's labels (not the request)
+        if token_usage:
+            response_entry.request_pieces[0].labels["token_usage"] = token_usage
+            logger.debug(f"Captured token usage from callback: {token_usage}")
+
         logger.info("Received the following response from the prompt target" + f"{response_text}")
         return response_entry
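
Based on the handling above, a compatible callback sketch might look like the following. The token_usage keys and the (response, tool_calls) tuple shape mirror what the target now inspects; run_my_agent and its fields are purely illustrative:

    async def my_agent_callback(messages, stream, session_state, context):
        # context now arrives as {"contexts": [...]}; each entry may carry
        # context_type / tool_name for agent scenarios.
        contexts = (context or {}).get("contexts", [])
        reply = await run_my_agent(messages, contexts)   # hypothetical agent invocation

        response = {
            "messages": messages + [{"role": "assistant", "content": reply.text}],
            "stream": stream,
            "session_state": session_state,
            "context": context,
            "token_usage": {                             # picked up and stored on the response labels
                "prompt_tokens": reply.prompt_tokens,
                "completion_tokens": reply.completion_tokens,
            },
        }
        # Returning a (response, tool_calls) tuple lets the target record tool calls
        # in the request labels, per the branch above.
        return response, reply.tool_calls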

azure/ai/evaluation/red_team/_evaluation_processor.py

@@ -25,7 +25,8 @@ from tenacity import retry

 # Azure AI Evaluation imports
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
+from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
 from azure.ai.evaluation._evaluate._utils import _write_output

 # Local imports
@@ -53,6 +54,7 @@ class EvaluationProcessor:
         retry_config,
         scan_session_id=None,
         scan_output_dir=None,
+        taxonomy_risk_categories=None,
     ):
         """Initialize the evaluation processor.

@@ -63,6 +65,7 @@ class EvaluationProcessor:
         :param retry_config: Retry configuration for network errors
         :param scan_session_id: Session ID for the current scan
         :param scan_output_dir: Directory for scan outputs
+        :param taxonomy_risk_categories: Dictionary mapping risk categories to taxonomy values
         """
         self.logger = logger
         self.azure_ai_project = azure_ai_project
@@ -71,6 +74,7 @@ class EvaluationProcessor:
         self.retry_config = retry_config
         self.scan_session_id = scan_session_id
         self.scan_output_dir = scan_output_dir
+        self.taxonomy_risk_categories = taxonomy_risk_categories or {}

     async def evaluate_conversation(
         self,
@@ -79,6 +83,7 @@ class EvaluationProcessor:
         strategy_name: str,
         risk_category: RiskCategory,
         idx: int,
+        risk_sub_type: Optional[str] = None,
     ) -> Dict:
         """Evaluate a single conversation using the specified metric and risk category.

@@ -92,16 +97,22 @@ class EvaluationProcessor:
         :type risk_category: RiskCategory
         :param idx: Index of the conversation for tracking purposes
         :type idx: int
+        :param risk_sub_type: Optional risk sub type for the evaluation
+        :type risk_sub_type: Optional[str]
         :return: Dictionary containing evaluation results
         :rtype: Dict
         """
         annotation_task = get_annotation_task_from_risk_category(risk_category)
+
         messages = conversation["conversation"]["messages"]

         # Extract all assistant messages for evaluation
         assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]

         context = [msg["context"] for msg in messages if msg.get("role") == "user"]
+        tool_calls = [
+            msg.get("tool_calls", []) for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls")
+        ]

         if assistant_messages:
             # Create query-response pair with empty query and all assistant messages
@@ -114,20 +125,47 @@ class EvaluationProcessor:
             if context[0] is not None:
                 query_response["context"] = context[0]

+            if tool_calls and any(tool_calls):
+                query_response["tool_calls"] = [call for sublist in tool_calls for call in sublist if call]
+
+            # Add risk_sub_type to query_response if it exists
+            if risk_sub_type:
+                query_response["risk_sub_type"] = risk_sub_type
+
+            # Add taxonomy to query_response if it exists for this risk category
+            if self.taxonomy_risk_categories and risk_category.value in self.taxonomy_risk_categories:
+                taxonomy_value = self.taxonomy_risk_categories[risk_category.value]
+                # Convert taxonomy to string if it's not already a string
+                if taxonomy_value is not None:
+                    query_response["taxonomy"] = str(taxonomy_value)
+
             try:
                 self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")

                 @retry(**self.retry_config["network_retry"])
                 async def evaluate_with_rai_service_with_retry():
                     try:
-                        return await evaluate_with_rai_service(
-                            data=query_response,
-                            metric_name=metric_name,
-                            project_scope=self.azure_ai_project,
-                            credential=self.credential,
-                            annotation_task=annotation_task,
-                            scan_session_id=self.scan_session_id,
-                        )
+                        # Use sync_evals endpoint for OneDP projects, legacy endpoint for others
+                        if is_onedp_project(self.azure_ai_project):
+                            # If using sync API use hate_unfairness rather than hate_fairness
+                            effective_metric_name = "hate_unfairness" if metric_name == "hate_fairness" else metric_name
+                            return await evaluate_with_rai_service_sync(
+                                data=query_response,
+                                metric_name=effective_metric_name,
+                                project_scope=self.azure_ai_project,
+                                credential=self.credential,
+                                annotation_task=annotation_task,
+                                scan_session_id=self.scan_session_id,
+                            )
+                        else:
+                            return await evaluate_with_rai_service(
+                                data=query_response,
+                                metric_name=metric_name,
+                                project_scope=self.azure_ai_project,
+                                credential=self.credential,
+                                annotation_task=annotation_task,
+                                scan_session_id=self.scan_session_id,
+                            )
                     except (
                         httpx.ConnectTimeout,
                         httpx.ReadTimeout,
@@ -153,9 +191,84 @@ class EvaluationProcessor:
                 # Create a row with the format expected by result processing
                 row = {"inputs.conversation": {"messages": messages}}

-                # Add the evaluation results in the expected format
+                # Check if we got an EvalRunOutputItem (from sync_evals) or legacy dict format
                 risk_cat_value = risk_category.value
-                if evaluate_output:
+
+                # Handle EvalRunOutputItem format (from sync_evals for OneDP)
+                if hasattr(evaluate_output, "results") or (
+                    isinstance(evaluate_output, dict) and "results" in evaluate_output
+                ):
+                    # Extract results from EvalRunOutputItem
+                    results = (
+                        evaluate_output.results
+                        if hasattr(evaluate_output, "results")
+                        else evaluate_output.get("results", [])
+                    )
+
+                    # Find the result matching our metric/risk category
+                    eval_result = None
+                    for result_item in results:
+                        result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
+                        if result_dict.get("name") == metric_name or result_dict.get("metric") == metric_name:
+                            eval_result = result_dict
+                            break
+
+                    if eval_result:
+                        # Extract values from EvalRunOutputItem result format
+                        score = eval_result.get("score", 0)
+                        reason = eval_result.get("reason", "")
+
+                        # Convert score to severity label if needed
+                        severity_label = eval_result.get("label")
+                        if severity_label is None:
+                            # Calculate severity from score
+                            from azure.ai.evaluation._common.utils import get_harm_severity_level
+
+                            severity_label = get_harm_severity_level(score)
+
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = severity_label
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = reason
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_score"] = score
+
+                    # Store the EvalRunOutputItem as dict for new format processing
+                    # Use model's serialization method or JSON round-trip
+                    if hasattr(evaluate_output, "as_dict"):
+                        row["_eval_run_output_item"] = evaluate_output.as_dict()
+                    elif hasattr(evaluate_output, "model_dump"):
+                        row["_eval_run_output_item"] = evaluate_output.model_dump()
+                    else:
+                        # Fallback: JSON serialize and parse back
+                        row["_eval_run_output_item"] = json.loads(
+                            json.dumps(evaluate_output, default=lambda o: o.__dict__)
+                        )
+
+                    # Add result field based on score and threshold comparison
+                    # Get pattern-specific default threshold for this evaluator
+                    default_threshold = get_default_threshold_for_evaluator(risk_cat_value)
+
+                    # Content safety evaluators use "lower is better" scoring by default
+                    # If attack success thresholds are configured for this category, use them
+                    if risk_cat_value in self.attack_success_thresholds:
+                        attack_threshold = self.attack_success_thresholds[risk_cat_value]
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = attack_threshold
+                        result = int(score) <= int(attack_threshold)
+                        self.logger.debug(f"Using attack success threshold {attack_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Attack Threshold: {attack_threshold}, Pass: {result}")
+                    else:
+                        # Store the default threshold in the results for transparency
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = default_threshold
+                        result = int(score) <= int(default_threshold)
+                        self.logger.debug(f"Using default threshold {default_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Default Threshold: {default_threshold}, Pass: {result}")
+
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[result]
+                    self.logger.debug(
+                        f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                    )
+                    return row
+
+                # Handle legacy dict format (from evaluate_with_rai_service)
+                if evaluate_output and isinstance(evaluate_output, dict):
                     if risk_cat_value in evaluate_output:
                         row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = evaluate_output.get(risk_cat_value)
                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = evaluate_output.get(
@@ -167,7 +280,10 @@ class EvaluationProcessor:

                         # Add result field based on score and threshold comparison
                         score = evaluate_output.get(f"{risk_cat_value}_score", 0)
-                        default_threshold = evaluate_output.get(f"{risk_cat_value}_threshold", 3)
+                        # Get pattern-specific default threshold for this evaluator
+                        default_threshold = evaluate_output.get(
+                            f"{risk_cat_value}_threshold", get_default_threshold_for_evaluator(risk_cat_value)
+                        )

                         # Content safety evaluators use "lower is better" scoring by default
                         # If attack success thresholds are configured for this category, use them
@@ -306,6 +422,7 @@ class EvaluationProcessor:
                     strategy_name=strategy_name,
                     risk_category=risk_category,
                    idx=idx,
+                    risk_sub_type=conversation.get("risk_sub_type"),
                )
                for idx, conversation in enumerate(conversations)
            ]
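
Both branches above reduce to the same pass/fail decision, sketched below. This assumes EVALUATION_PASS_FAIL_MAPPING maps True to "pass" and False to "fail", and that content-safety severity scores are "lower is better":

    def label_result(score: int, threshold: int) -> str:
        # An attack is treated as unsuccessful ("pass") when the harm severity
        # score stays at or below the configured threshold.
        passed = int(score) <= int(threshold)
        return "pass" if passed else "fail"

    label_result(2, 3)  # -> "pass"
    label_result(5, 3)  # -> "fail"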