azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py

```diff
@@ -17,7 +17,7 @@ from ._run_storage import AbstractRunStorage, NoOpRunStorage
 from .._common._logging import incremental_print, print_red_error
 from ._config import BatchEngineConfig
 from ._exceptions import BatchEngineValidationError
-from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult
+from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult, BatchStatus
 
 
 class RunSubmitter:
@@ -141,6 +141,19 @@ class RunSubmitter:
                 run._status = RunStatus.FAILED
                 # when run failed in executor, store the exception in result and dump to file
                 logger.warning(f"Run {run.name} failed when executing in executor with exception {e}.")
+                if not batch_result:
+                    batch_result = BatchResult(
+                        status=BatchStatus.Failed,
+                        total_lines=0,
+                        failed_lines=0,
+                        start_time=datetime.now(timezone.utc),
+                        end_time=datetime.now(timezone.utc),
+                        tokens=None,
+                        details=[],
+                    )
+                    batch_result.error = e
+                elif not batch_result.error:
+                    batch_result.error = e
                 # for user error, swallow stack trace and return failed run since user don't need the stack trace
                 if not isinstance(e, BatchEngineValidationError):
                     # for other errors, raise it to user to help debug root cause.
```
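The hunk above ensures that a run that fails before the batch engine produces a result still carries the exception on a `BatchResult`. A minimal stand-alone sketch of that guard pattern, using stand-in classes rather than the SDK's private `_engine` types (the `capture_failure` helper is illustrative, not part of the package):

```python
# Stand-ins for _engine.BatchStatus / _engine.BatchResult; field names mirror the diff above.
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Any, List, Optional


class BatchStatus(Enum):
    Completed = "completed"
    Failed = "failed"


@dataclass
class BatchResult:
    status: BatchStatus
    total_lines: int = 0
    failed_lines: int = 0
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    end_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    tokens: Optional[Any] = None
    details: List[Any] = field(default_factory=list)
    error: Optional[BaseException] = None


def capture_failure(batch_result: Optional[BatchResult], exc: Exception) -> BatchResult:
    """Hypothetical helper mirroring the new fallback: never lose the executor exception."""
    if not batch_result:
        batch_result = BatchResult(status=BatchStatus.Failed)
        batch_result.error = exc
    elif not batch_result.error:
        batch_result.error = exc
    return batch_result


result = capture_failure(None, RuntimeError("executor crashed before producing a result"))
print(result.status, result.error)
```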
azure/ai/evaluation/_legacy/prompty/_prompty.py

```diff
@@ -266,7 +266,7 @@ class AsyncPrompty:
     async def __call__(  # pylint: disable=docstring-keyword-should-match-keyword-only
         self,
         **kwargs: Any,
-    ) ->
+    ) -> dict:
         """Calling prompty as a function in async, the inputs should be provided with key word arguments.
         Returns the output of the prompty.
 
@@ -330,6 +330,7 @@ class AsyncPrompty:
             is_first_choice=self._data.get("model", {}).get("response", "first").lower() == "first",
             response_format=params.get("response_format", {}),
             outputs=self._outputs,
+            inputs=inputs,
         )
 
     def render(  # pylint: disable=docstring-keyword-should-match-keyword-only
```
azure/ai/evaluation/_legacy/prompty/_utils.py

```diff
@@ -32,7 +32,7 @@ from typing import (
 
 from jinja2 import Template
 from openai import AsyncStream
-from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionUserMessageParam
 from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError
 
 from azure.ai.evaluation._constants import DefaultOpenEncoding
@@ -466,7 +466,8 @@ async def format_llm_response(
     is_first_choice: bool,
     response_format: Optional[Mapping[str, Any]] = None,
     outputs: Optional[Mapping[str, Any]] = None,
-
+    inputs: Optional[Mapping[str, Any]] = None,
+) -> dict:
     """
     Format LLM response
 
@@ -525,15 +526,54 @@ async def format_llm_response(
             return
         yield chunk.choices[0].delta.content
 
+    to_ret = {
+        "llm_output": None,
+        "input_token_count": 0,
+        "output_token_count": 0,
+        "total_token_count": 0,
+        "finish_reason": "",
+        "model_id": "",
+        "sample_input": "",
+        "sample_output": "",
+    }
+
     if not is_first_choice:
-
+        to_ret["llm_output"] = response
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0
 
     is_json_format = isinstance(response_format, dict) and response_format.get("type") == "json_object"
     if isinstance(response, AsyncStream):
         if not is_json_format:
-
+            to_ret["llm_output"] = format_stream(llm_response=response)
+            return to_ret
         content = "".join([item async for item in format_stream(llm_response=response)])
-
+        to_ret["llm_output"] = format_choice(content)
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0
+    else:
+        input_token_count = response.usage.prompt_tokens if response.usage and response.usage.prompt_tokens else 0
+        output_token_count = (
+            response.usage.completion_tokens if response.usage and response.usage.completion_tokens else 0
+        )
+        total_token_count = response.usage.total_tokens if response.usage and response.usage.total_tokens else 0
+        finish_reason = (
+            response.choices[0].finish_reason if response.choices and response.choices[0].finish_reason else ""
+        )
+        model_id = response.model if response.model else ""
+        sample_output_list = (
+            [{"role": response.choices[0].message.role, "content": response.choices[0].message.content}]
+            if (response.choices and response.choices[0].message.content and response.choices[0].message.role)
+            else []
+        )
+        sample_output = json.dumps(sample_output_list)
+        input_str = f"{json.dumps(inputs)}" if inputs else ""
+        if inputs and len(inputs) > 0:
+            sample_input_json = []
+            msg = ChatCompletionUserMessageParam(
+                role="user",
+                content=input_str,
+            )
+            sample_input_json.append(msg)
+            sample_input = json.dumps(sample_input_json)
 
     # When calling function/tool, function_call/tool_call response will be returned as a field in message,
     # so we need return message directly. Otherwise, we only return content.
@@ -543,7 +583,15 @@ async def format_llm_response(
     else:
         response_content = getattr(response.choices[0].message, "content", "")
         result = format_choice(response_content)
-
+    to_ret["llm_output"] = result
+    to_ret["input_token_count"] = input_token_count
+    to_ret["output_token_count"] = output_token_count
+    to_ret["total_token_count"] = total_token_count
+    to_ret["finish_reason"] = finish_reason
+    to_ret["model_id"] = model_id
+    to_ret["sample_input"] = sample_input
+    to_ret["sample_output"] = sample_output
+    return to_ret
 
 
 def openai_error_retryable(
```
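With this change `format_llm_response` (and therefore `AsyncPrompty.__call__`) resolves to a dict instead of the bare model output. A sketch of the returned shape and how a caller might unpack it; the key names come from the diff above, while the literal values are made up for illustration:

```python
# Example payload shaped like the dict format_llm_response now returns (values are placeholders).
result = {
    "llm_output": "The capital of France is Paris.",
    "input_token_count": 42,
    "output_token_count": 9,
    "total_token_count": 51,
    "finish_reason": "stop",
    "model_id": "gpt-4o-2024-08-06",
    "sample_input": '[{"role": "user", "content": "{\\"query\\": \\"capital of France?\\"}"}]',
    "sample_output": '[{"role": "assistant", "content": "The capital of France is Paris."}]',
}

# Callers that previously received the raw content now read it from "llm_output"
# and can surface token usage alongside it.
content = result["llm_output"]
usage = {k: result[k] for k in ("input_token_count", "output_token_count", "total_token_count")}
print(content, usage)
```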
azure/ai/evaluation/_model_configurations.py

```diff
@@ -5,6 +5,8 @@
 from typing import Any, Dict, List, Literal, TypedDict, Union
 
 from typing_extensions import NotRequired
+from ._evaluator_definition import EvaluatorDefinition
+from typing import Dict, List, Optional, Any
 
 
 class AzureOpenAIModelConfiguration(TypedDict):
@@ -105,6 +107,19 @@ class EvaluatorConfig(TypedDict, total=False):
     column_mapping: Dict[str, str]
    """Dictionary mapping evaluator input name to column in data"""
 
+    _evaluator_name: NotRequired[Optional[str]]
+    """Name of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_version: NotRequired[Optional[str]]
+    """Version of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_id: NotRequired[Optional[str]]
+    """ID of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_definition: NotRequired[Optional[EvaluatorDefinition]]
+    """Definition of the evaluator to be used from the evaluator asset"""
+    """Currently only used for Otel emission, will be changed to used in AOAI eval results converter as well in the future."""
+
 
 class Message(TypedDict):
     role: str
@@ -121,3 +136,14 @@ class EvaluationResult(TypedDict):
     metrics: Dict
     studio_url: NotRequired[str]
     rows: List[Dict]
+    _evaluation_results_list: List[Dict]
+    _evaluation_summary: Dict
+
+
+class AppInsightsConfig(TypedDict):
+    connection_string: str
+    project_id: NotRequired[str]
+    run_type: NotRequired[str]
+    schedule_type: NotRequired[str]
+    run_id: NotRequired[str]
+    extra_attributes: NotRequired[Dict[str, Any]]
```
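A hedged sketch of populating the new `AppInsightsConfig` TypedDict. The class is redeclared locally as a mirror of the definition above so the snippet runs standalone; the connection string and attribute values are placeholders, and where `evaluate()` accepts this config is not shown in this diff:

```python
from typing import Any, Dict
from typing_extensions import NotRequired, TypedDict


class AppInsightsConfig(TypedDict):  # local mirror of the TypedDict added above
    connection_string: str
    project_id: NotRequired[str]
    run_type: NotRequired[str]
    schedule_type: NotRequired[str]
    run_id: NotRequired[str]
    extra_attributes: NotRequired[Dict[str, Any]]


# Only connection_string is required; the rest are optional (NotRequired) keys.
app_insights_config: AppInsightsConfig = {
    "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",  # placeholder
    "run_type": "evaluation",
    "extra_attributes": {"team": "quality"},
}
print(sorted(app_insights_config))
```

The underscore-prefixed `EvaluatorConfig` and `EvaluationResult` fields added above are internal (used for OTel emission) and are not intended to be set by callers.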
azure/ai/evaluation/red_team/_attack_objective_generator.py

```diff
@@ -21,7 +21,9 @@ class RiskCategory(str, Enum):
     ProtectedMaterial = "protected_material"
     CodeVulnerability = "code_vulnerability"
     UngroundedAttributes = "ungrounded_attributes"
-
+    SensitiveDataLeakage = "sensitive_data_leakage"  # Agent targets only
+    TaskAdherence = "task_adherence"  # Agent targets only
+    ProhibitedActions = "prohibited_actions"  # Agent targets only
 
 
 @experimental
```
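A short, hedged illustration of selecting the new agent-only risk categories for a red team scan. The import path is assumed from the package's public `red_team` exports; how the list is ultimately passed to a scan is not shown in this diff:

```python
from azure.ai.evaluation.red_team import RiskCategory  # assumed public export

# New in 1.13.0: categories intended for agent targets only (per the comments above).
agent_only_categories = [
    RiskCategory.SensitiveDataLeakage,
    RiskCategory.TaskAdherence,
    RiskCategory.ProhibitedActions,
]
print([category.value for category in agent_only_categories])
```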
azure/ai/evaluation/red_team/_callback_chat_target.py

```diff
@@ -19,7 +19,6 @@ class _CallbackChatTarget(PromptChatTarget):
         *,
         callback: Callable[[List[Dict], bool, Optional[str], Optional[Dict[str, Any]]], Dict],
         stream: bool = False,
-        prompt_to_context: Optional[Dict[str, str]] = None,
     ) -> None:
         """
         Initializes an instance of the _CallbackChatTarget class.
@@ -33,12 +32,10 @@ class _CallbackChatTarget(PromptChatTarget):
         Args:
             callback (Callable): The callback function that sends a prompt to a target and receives a response.
             stream (bool, optional): Indicates whether the target supports streaming. Defaults to False.
-            prompt_to_context (Optional[Dict[str, str]], optional): Mapping from prompt content to context. Defaults to None.
         """
         PromptChatTarget.__init__(self)
         self._callback = callback
         self._stream = stream
-        self._prompt_to_context = prompt_to_context or {}
 
     async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> PromptRequestResponse:
 
@@ -51,22 +48,56 @@ class _CallbackChatTarget(PromptChatTarget):
 
         logger.info(f"Sending the following prompt to the prompt target: {request}")
 
-        #
-
-
-
-
-
-
-
-
+        # Extract context from request labels if available
+        # The context is stored in memory labels when the prompt is sent by orchestrator
+        context_dict = {}
+        if hasattr(request, "labels") and request.labels and "context" in request.labels:
+            context_data = request.labels["context"]
+            if context_data and isinstance(context_data, dict):
+                # context_data is always a dict with 'contexts' list
+                # Each context can have its own context_type and tool_name
+                contexts = context_data.get("contexts", [])
+
+                # Build context_dict to pass to callback
+                context_dict = {"contexts": contexts}
+
+                # Check if any context has agent-specific fields for logging
+                has_agent_fields = any(
+                    isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                )
+
+                if has_agent_fields:
+                    tool_names = [
+                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
+                    ]
+                    logger.debug(f"Extracted agent context: {len(contexts)} context source(s), tool_names={tool_names}")
+                else:
+                    logger.debug(f"Extracted model context: {len(contexts)} context source(s)")
 
         # response_context contains "messages", "stream", "session_state, "context"
-
+        response = await self._callback(messages=messages, stream=self._stream, session_state=None, context=context_dict)  # type: ignore
+
+        # Store token_usage before processing tuple
+        token_usage = None
+        if isinstance(response, dict) and "token_usage" in response:
+            token_usage = response["token_usage"]
+
+        if type(response) == tuple:
+            response, tool_output = response
+            request.labels["tool_calls"] = tool_output
+            # Check for token_usage in the response dict from tuple
+            if isinstance(response, dict) and "token_usage" in response:
+                token_usage = response["token_usage"]
+
+        response_text = response["messages"][-1]["content"]
 
-        response_text = response_context["messages"][-1]["content"]
         response_entry = construct_response_from_request(request=request, response_text_pieces=[response_text])
 
+        # Add token_usage to the response entry's labels (not the request)
+        if token_usage:
+            response_entry.request_pieces[0].labels["token_usage"] = token_usage
+            logger.debug(f"Captured token usage from callback: {token_usage}")
+
         logger.info("Received the following response from the prompt target" + f"{response_text}")
         return response_entry
 
```
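A hedged sketch of a user callback shaped to match what `_CallbackChatTarget` now sends and reads back per the diff above: it receives the orchestrator's context under a `"contexts"` list and may return `"token_usage"` alongside `"messages"`. The actual callback contract is defined by the red team APIs, not by this sketch:

```python
from typing import Any, Dict, List, Optional


async def my_target_callback(
    messages: List[Dict[str, Any]],
    stream: bool = False,
    session_state: Optional[str] = None,
    context: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    # The target now passes extracted context as {"contexts": [...]}; each entry
    # may carry agent-specific fields such as context_type or tool_name.
    contexts = (context or {}).get("contexts", [])

    # ... call your application or agent here, optionally using `contexts` ...
    reply = f"echo: {messages[-1]['content']} ({len(contexts)} context source(s))"

    return {
        "messages": messages + [{"role": "assistant", "content": reply}],
        "stream": stream,
        "session_state": session_state,
        "context": context,
        # Optional: surfaced token usage is attached to the response entry's labels.
        "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
```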
azure/ai/evaluation/red_team/_evaluation_processor.py

```diff
@@ -25,7 +25,8 @@ from tenacity import retry
 
 # Azure AI Evaluation imports
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
+from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
 from azure.ai.evaluation._evaluate._utils import _write_output
 
 # Local imports
@@ -53,6 +54,7 @@ class EvaluationProcessor:
         retry_config,
         scan_session_id=None,
         scan_output_dir=None,
+        taxonomy_risk_categories=None,
     ):
         """Initialize the evaluation processor.
 
@@ -63,6 +65,7 @@ class EvaluationProcessor:
         :param retry_config: Retry configuration for network errors
         :param scan_session_id: Session ID for the current scan
         :param scan_output_dir: Directory for scan outputs
+        :param taxonomy_risk_categories: Dictionary mapping risk categories to taxonomy values
         """
         self.logger = logger
         self.azure_ai_project = azure_ai_project
@@ -71,6 +74,7 @@ class EvaluationProcessor:
         self.retry_config = retry_config
         self.scan_session_id = scan_session_id
         self.scan_output_dir = scan_output_dir
+        self.taxonomy_risk_categories = taxonomy_risk_categories or {}
 
     async def evaluate_conversation(
         self,
@@ -79,6 +83,7 @@ class EvaluationProcessor:
         strategy_name: str,
         risk_category: RiskCategory,
         idx: int,
+        risk_sub_type: Optional[str] = None,
     ) -> Dict:
         """Evaluate a single conversation using the specified metric and risk category.
 
@@ -92,16 +97,22 @@ class EvaluationProcessor:
         :type risk_category: RiskCategory
         :param idx: Index of the conversation for tracking purposes
         :type idx: int
+        :param risk_sub_type: Optional risk sub type for the evaluation
+        :type risk_sub_type: Optional[str]
         :return: Dictionary containing evaluation results
         :rtype: Dict
         """
         annotation_task = get_annotation_task_from_risk_category(risk_category)
+
         messages = conversation["conversation"]["messages"]
 
         # Extract all assistant messages for evaluation
         assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]
 
         context = [msg["context"] for msg in messages if msg.get("role") == "user"]
+        tool_calls = [
+            msg.get("tool_calls", []) for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls")
+        ]
 
         if assistant_messages:
             # Create query-response pair with empty query and all assistant messages
@@ -114,20 +125,47 @@ class EvaluationProcessor:
             if context[0] is not None:
                 query_response["context"] = context[0]
 
+            if tool_calls and any(tool_calls):
+                query_response["tool_calls"] = [call for sublist in tool_calls for call in sublist if call]
+
+            # Add risk_sub_type to query_response if it exists
+            if risk_sub_type:
+                query_response["risk_sub_type"] = risk_sub_type
+
+            # Add taxonomy to query_response if it exists for this risk category
+            if self.taxonomy_risk_categories and risk_category.value in self.taxonomy_risk_categories:
+                taxonomy_value = self.taxonomy_risk_categories[risk_category.value]
+                # Convert taxonomy to string if it's not already a string
+                if taxonomy_value is not None:
+                    query_response["taxonomy"] = str(taxonomy_value)
+
         try:
             self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")
 
             @retry(**self.retry_config["network_retry"])
             async def evaluate_with_rai_service_with_retry():
                 try:
-
-
-
-
-
-
-
-
+                    # Use sync_evals endpoint for OneDP projects, legacy endpoint for others
+                    if is_onedp_project(self.azure_ai_project):
+                        # If using sync API use hate_unfairness rather than hate_fairness
+                        effective_metric_name = "hate_unfairness" if metric_name == "hate_fairness" else metric_name
+                        return await evaluate_with_rai_service_sync(
+                            data=query_response,
+                            metric_name=effective_metric_name,
+                            project_scope=self.azure_ai_project,
+                            credential=self.credential,
+                            annotation_task=annotation_task,
+                            scan_session_id=self.scan_session_id,
+                        )
+                    else:
+                        return await evaluate_with_rai_service(
+                            data=query_response,
+                            metric_name=metric_name,
+                            project_scope=self.azure_ai_project,
+                            credential=self.credential,
+                            annotation_task=annotation_task,
+                            scan_session_id=self.scan_session_id,
+                        )
                 except (
                     httpx.ConnectTimeout,
                     httpx.ReadTimeout,
@@ -153,9 +191,84 @@ class EvaluationProcessor:
             # Create a row with the format expected by result processing
             row = {"inputs.conversation": {"messages": messages}}
 
-            #
+            # Check if we got an EvalRunOutputItem (from sync_evals) or legacy dict format
             risk_cat_value = risk_category.value
-
+
+            # Handle EvalRunOutputItem format (from sync_evals for OneDP)
+            if hasattr(evaluate_output, "results") or (
+                isinstance(evaluate_output, dict) and "results" in evaluate_output
+            ):
+                # Extract results from EvalRunOutputItem
+                results = (
+                    evaluate_output.results
+                    if hasattr(evaluate_output, "results")
+                    else evaluate_output.get("results", [])
+                )
+
+                # Find the result matching our metric/risk category
+                eval_result = None
+                for result_item in results:
+                    result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
+                    if result_dict.get("name") == metric_name or result_dict.get("metric") == metric_name:
+                        eval_result = result_dict
+                        break
+
+                if eval_result:
+                    # Extract values from EvalRunOutputItem result format
+                    score = eval_result.get("score", 0)
+                    reason = eval_result.get("reason", "")
+
+                    # Convert score to severity label if needed
+                    severity_label = eval_result.get("label")
+                    if severity_label is None:
+                        # Calculate severity from score
+                        from azure.ai.evaluation._common.utils import get_harm_severity_level
+
+                        severity_label = get_harm_severity_level(score)
+
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = severity_label
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = reason
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_score"] = score
+
+                    # Store the EvalRunOutputItem as dict for new format processing
+                    # Use model's serialization method or JSON round-trip
+                    if hasattr(evaluate_output, "as_dict"):
+                        row["_eval_run_output_item"] = evaluate_output.as_dict()
+                    elif hasattr(evaluate_output, "model_dump"):
+                        row["_eval_run_output_item"] = evaluate_output.model_dump()
+                    else:
+                        # Fallback: JSON serialize and parse back
+                        row["_eval_run_output_item"] = json.loads(
+                            json.dumps(evaluate_output, default=lambda o: o.__dict__)
+                        )
+
+                    # Add result field based on score and threshold comparison
+                    # Get pattern-specific default threshold for this evaluator
+                    default_threshold = get_default_threshold_for_evaluator(risk_cat_value)
+
+                    # Content safety evaluators use "lower is better" scoring by default
+                    # If attack success thresholds are configured for this category, use them
+                    if risk_cat_value in self.attack_success_thresholds:
+                        attack_threshold = self.attack_success_thresholds[risk_cat_value]
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = attack_threshold
+                        result = int(score) <= int(attack_threshold)
+                        self.logger.debug(f"Using attack success threshold {attack_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Attack Threshold: {attack_threshold}, Pass: {result}")
+                    else:
+                        # Store the default threshold in the results for transparency
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = default_threshold
+                        result = int(score) <= int(default_threshold)
+                        self.logger.debug(f"Using default threshold {default_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Default Threshold: {default_threshold}, Pass: {result}")
+
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[result]
+                    self.logger.debug(
+                        f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                    )
+                    return row
+
+            # Handle legacy dict format (from evaluate_with_rai_service)
+            if evaluate_output and isinstance(evaluate_output, dict):
                 if risk_cat_value in evaluate_output:
                     row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = evaluate_output.get(risk_cat_value)
                     row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = evaluate_output.get(
@@ -167,7 +280,10 @@ class EvaluationProcessor:
 
                     # Add result field based on score and threshold comparison
                     score = evaluate_output.get(f"{risk_cat_value}_score", 0)
-
+                    # Get pattern-specific default threshold for this evaluator
+                    default_threshold = evaluate_output.get(
+                        f"{risk_cat_value}_threshold", get_default_threshold_for_evaluator(risk_cat_value)
+                    )
 
                     # Content safety evaluators use "lower is better" scoring by default
                     # If attack success thresholds are configured for this category, use them
@@ -306,6 +422,7 @@ class EvaluationProcessor:
                 strategy_name=strategy_name,
                 risk_category=risk_category,
                 idx=idx,
+                risk_sub_type=conversation.get("risk_sub_type"),
            )
             for idx, conversation in enumerate(conversations)
         ]
```