azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py

```diff
@@ -17,7 +17,7 @@ from ._run_storage import AbstractRunStorage, NoOpRunStorage
 from .._common._logging import incremental_print, print_red_error
 from ._config import BatchEngineConfig
 from ._exceptions import BatchEngineValidationError
-from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult
+from ._engine import DEFAULTS_KEY, BatchEngine, BatchEngineError, BatchResult, BatchStatus
 
 
 class RunSubmitter:
@@ -141,6 +141,19 @@ class RunSubmitter:
                 run._status = RunStatus.FAILED
                 # when run failed in executor, store the exception in result and dump to file
                 logger.warning(f"Run {run.name} failed when executing in executor with exception {e}.")
+                if not batch_result:
+                    batch_result = BatchResult(
+                        status=BatchStatus.Failed,
+                        total_lines=0,
+                        failed_lines=0,
+                        start_time=datetime.now(timezone.utc),
+                        end_time=datetime.now(timezone.utc),
+                        tokens=None,
+                        details=[],
+                    )
+                    batch_result.error = e
+                elif not batch_result.error:
+                    batch_result.error = e
                 # for user error, swallow stack trace and return failed run since user don't need the stack trace
                 if not isinstance(e, BatchEngineValidationError):
                     # for other errors, raise it to user to help debug root cause.
```
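The hunk above ensures that a run that fails before the batch engine produces a result still carries the exception on a `BatchResult`. A minimal stand-alone sketch of that guard pattern, using stand-in classes rather than the SDK's private `_engine` types (the `capture_failure` helper is illustrative, not part of the package):

```python
# Stand-ins for _engine.BatchStatus / _engine.BatchResult; field names mirror the diff above.
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Any, List, Optional


class BatchStatus(Enum):
    Completed = "completed"
    Failed = "failed"


@dataclass
class BatchResult:
    status: BatchStatus
    total_lines: int = 0
    failed_lines: int = 0
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    end_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    tokens: Optional[Any] = None
    details: List[Any] = field(default_factory=list)
    error: Optional[BaseException] = None


def capture_failure(batch_result: Optional[BatchResult], exc: Exception) -> BatchResult:
    """Hypothetical helper mirroring the new fallback: never lose the executor exception."""
    if not batch_result:
        batch_result = BatchResult(status=BatchStatus.Failed)
        batch_result.error = exc
    elif not batch_result.error:
        batch_result.error = exc
    return batch_result


result = capture_failure(None, RuntimeError("executor crashed before producing a result"))
print(result.status, result.error)
```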
azure/ai/evaluation/_legacy/prompty/_prompty.py

```diff
@@ -266,7 +266,7 @@ class AsyncPrompty:
     async def __call__(  # pylint: disable=docstring-keyword-should-match-keyword-only
         self,
         **kwargs: Any,
-    ) ->
+    ) -> dict:
         """Calling prompty as a function in async, the inputs should be provided with key word arguments.
         Returns the output of the prompty.
 
@@ -330,6 +330,7 @@ class AsyncPrompty:
             is_first_choice=self._data.get("model", {}).get("response", "first").lower() == "first",
             response_format=params.get("response_format", {}),
             outputs=self._outputs,
+            inputs=inputs,
         )
 
     def render(  # pylint: disable=docstring-keyword-should-match-keyword-only
```
azure/ai/evaluation/_legacy/prompty/_utils.py

```diff
@@ -32,7 +32,7 @@ from typing import (
 
 from jinja2 import Template
 from openai import AsyncStream
-from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat import ChatCompletion, ChatCompletionChunk, ChatCompletionUserMessageParam
 from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError
 
 from azure.ai.evaluation._constants import DefaultOpenEncoding
@@ -466,7 +466,8 @@ async def format_llm_response(
     is_first_choice: bool,
     response_format: Optional[Mapping[str, Any]] = None,
     outputs: Optional[Mapping[str, Any]] = None,
-
+    inputs: Optional[Mapping[str, Any]] = None,
+) -> dict:
     """
     Format LLM response
 
@@ -525,15 +526,54 @@ async def format_llm_response(
             return
         yield chunk.choices[0].delta.content
 
+    to_ret = {
+        "llm_output": None,
+        "input_token_count": 0,
+        "output_token_count": 0,
+        "total_token_count": 0,
+        "finish_reason": "",
+        "model_id": "",
+        "sample_input": "",
+        "sample_output": "",
+    }
+
     if not is_first_choice:
-
+        to_ret["llm_output"] = response
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0
 
     is_json_format = isinstance(response_format, dict) and response_format.get("type") == "json_object"
     if isinstance(response, AsyncStream):
         if not is_json_format:
-
+            to_ret["llm_output"] = format_stream(llm_response=response)
+            return to_ret
         content = "".join([item async for item in format_stream(llm_response=response)])
-
+        to_ret["llm_output"] = format_choice(content)
+        return to_ret  # we don't actually use this code path since streaming is not used, so set token counts to 0
+    else:
+        input_token_count = response.usage.prompt_tokens if response.usage and response.usage.prompt_tokens else 0
+        output_token_count = (
+            response.usage.completion_tokens if response.usage and response.usage.completion_tokens else 0
+        )
+        total_token_count = response.usage.total_tokens if response.usage and response.usage.total_tokens else 0
+        finish_reason = (
+            response.choices[0].finish_reason if response.choices and response.choices[0].finish_reason else ""
+        )
+        model_id = response.model if response.model else ""
+        sample_output_list = (
+            [{"role": response.choices[0].message.role, "content": response.choices[0].message.content}]
+            if (response.choices and response.choices[0].message.content and response.choices[0].message.role)
+            else []
+        )
+        sample_output = json.dumps(sample_output_list)
+        input_str = f"{json.dumps(inputs)}" if inputs else ""
+        if inputs and len(inputs) > 0:
+            sample_input_json = []
+            msg = ChatCompletionUserMessageParam(
+                role="user",
+                content=input_str,
+            )
+            sample_input_json.append(msg)
+            sample_input = json.dumps(sample_input_json)
 
     # When calling function/tool, function_call/tool_call response will be returned as a field in message,
     # so we need return message directly. Otherwise, we only return content.
@@ -543,7 +583,15 @@ async def format_llm_response(
     else:
         response_content = getattr(response.choices[0].message, "content", "")
         result = format_choice(response_content)
-
+    to_ret["llm_output"] = result
+    to_ret["input_token_count"] = input_token_count
+    to_ret["output_token_count"] = output_token_count
+    to_ret["total_token_count"] = total_token_count
+    to_ret["finish_reason"] = finish_reason
+    to_ret["model_id"] = model_id
+    to_ret["sample_input"] = sample_input
+    to_ret["sample_output"] = sample_output
+    return to_ret
 
 
 def openai_error_retryable(
```
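With this change `format_llm_response` (and therefore `AsyncPrompty.__call__`) resolves to a dict instead of the bare model output. A sketch of the returned shape and how a caller might unpack it; the key names come from the diff above, while the literal values are made up for illustration:

```python
# Example payload shaped like the dict format_llm_response now returns (values are placeholders).
result = {
    "llm_output": "The capital of France is Paris.",
    "input_token_count": 42,
    "output_token_count": 9,
    "total_token_count": 51,
    "finish_reason": "stop",
    "model_id": "gpt-4o-2024-08-06",
    "sample_input": '[{"role": "user", "content": "{\\"query\\": \\"capital of France?\\"}"}]',
    "sample_output": '[{"role": "assistant", "content": "The capital of France is Paris."}]',
}

# Callers that previously received the raw content now read it from "llm_output"
# and can surface token usage alongside it.
content = result["llm_output"]
usage = {k: result[k] for k in ("input_token_count", "output_token_count", "total_token_count")}
print(content, usage)
```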
azure/ai/evaluation/_model_configurations.py

```diff
@@ -5,6 +5,8 @@
 from typing import Any, Dict, List, Literal, TypedDict, Union
 
 from typing_extensions import NotRequired
+from ._evaluator_definition import EvaluatorDefinition
+from typing import Dict, List, Optional, Any
 
 
 class AzureOpenAIModelConfiguration(TypedDict):
@@ -105,6 +107,19 @@ class EvaluatorConfig(TypedDict, total=False):
     column_mapping: Dict[str, str]
    """Dictionary mapping evaluator input name to column in data"""
 
+    _evaluator_name: NotRequired[Optional[str]]
+    """Name of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_version: NotRequired[Optional[str]]
+    """Version of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_id: NotRequired[Optional[str]]
+    """ID of the evaluator from the evaluator asset, currently only used for Otel emission"""
+
+    _evaluator_definition: NotRequired[Optional[EvaluatorDefinition]]
+    """Definition of the evaluator to be used from the evaluator asset"""
+    """Currently only used for Otel emission, will be changed to used in AOAI eval results converter as well in the future."""
+
 
 class Message(TypedDict):
     role: str
@@ -121,3 +136,14 @@ class EvaluationResult(TypedDict):
     metrics: Dict
     studio_url: NotRequired[str]
     rows: List[Dict]
+    _evaluation_results_list: List[Dict]
+    _evaluation_summary: Dict
+
+
+class AppInsightsConfig(TypedDict):
+    connection_string: str
+    project_id: NotRequired[str]
+    run_type: NotRequired[str]
+    schedule_type: NotRequired[str]
+    run_id: NotRequired[str]
+    extra_attributes: NotRequired[Dict[str, Any]]
```
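A hedged sketch of populating the new `AppInsightsConfig` TypedDict. The class is redeclared locally as a mirror of the definition above so the snippet runs standalone; the connection string and attribute values are placeholders, and where `evaluate()` accepts this config is not shown in this diff:

```python
from typing import Any, Dict
from typing_extensions import NotRequired, TypedDict


class AppInsightsConfig(TypedDict):  # local mirror of the TypedDict added above
    connection_string: str
    project_id: NotRequired[str]
    run_type: NotRequired[str]
    schedule_type: NotRequired[str]
    run_id: NotRequired[str]
    extra_attributes: NotRequired[Dict[str, Any]]


# Only connection_string is required; the rest are optional (NotRequired) keys.
app_insights_config: AppInsightsConfig = {
    "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",  # placeholder
    "run_type": "evaluation",
    "extra_attributes": {"team": "quality"},
}
print(sorted(app_insights_config))
```

The underscore-prefixed `EvaluatorConfig` and `EvaluationResult` fields added above are internal (used for OTel emission) and are not intended to be set by callers.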
azure/ai/evaluation/red_team/_attack_objective_generator.py

```diff
@@ -21,7 +21,9 @@ class RiskCategory(str, Enum):
     ProtectedMaterial = "protected_material"
     CodeVulnerability = "code_vulnerability"
     UngroundedAttributes = "ungrounded_attributes"
-
+    SensitiveDataLeakage = "sensitive_data_leakage"  # Agent targets only
+    TaskAdherence = "task_adherence"  # Agent targets only
+    ProhibitedActions = "prohibited_actions"  # Agent targets only
 
 
 @experimental
```
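A short, hedged illustration of selecting the new agent-only risk categories for a red team scan. The import path is assumed from the package's public `red_team` exports; how the list is ultimately passed to a scan is not shown in this diff:

```python
from azure.ai.evaluation.red_team import RiskCategory  # assumed public export

# New in 1.13.0: categories intended for agent targets only (per the comments above).
agent_only_categories = [
    RiskCategory.SensitiveDataLeakage,
    RiskCategory.TaskAdherence,
    RiskCategory.ProhibitedActions,
]
print([category.value for category in agent_only_categories])
```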
azure/ai/evaluation/red_team/_callback_chat_target.py

```diff
@@ -19,7 +19,6 @@ class _CallbackChatTarget(PromptChatTarget):
         *,
         callback: Callable[[List[Dict], bool, Optional[str], Optional[Dict[str, Any]]], Dict],
         stream: bool = False,
-        prompt_to_context: Optional[Dict[str, str]] = None,
     ) -> None:
         """
         Initializes an instance of the _CallbackChatTarget class.
@@ -33,12 +32,10 @@ class _CallbackChatTarget(PromptChatTarget):
         Args:
             callback (Callable): The callback function that sends a prompt to a target and receives a response.
             stream (bool, optional): Indicates whether the target supports streaming. Defaults to False.
-            prompt_to_context (Optional[Dict[str, str]], optional): Mapping from prompt content to context. Defaults to None.
         """
         PromptChatTarget.__init__(self)
         self._callback = callback
         self._stream = stream
-        self._prompt_to_context = prompt_to_context or {}
 
     async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> PromptRequestResponse:
 
@@ -51,22 +48,56 @@ class _CallbackChatTarget(PromptChatTarget):
 
         logger.info(f"Sending the following prompt to the prompt target: {request}")
 
-        #
-
-
-
-
-
-
-
-
+        # Extract context from request labels if available
+        # The context is stored in memory labels when the prompt is sent by orchestrator
+        context_dict = {}
+        if hasattr(request, "labels") and request.labels and "context" in request.labels:
+            context_data = request.labels["context"]
+            if context_data and isinstance(context_data, dict):
+                # context_data is always a dict with 'contexts' list
+                # Each context can have its own context_type and tool_name
+                contexts = context_data.get("contexts", [])
+
+                # Build context_dict to pass to callback
+                context_dict = {"contexts": contexts}
+
+                # Check if any context has agent-specific fields for logging
+                has_agent_fields = any(
+                    isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
+                )
+
+                if has_agent_fields:
+                    tool_names = [
+                        ctx.get("tool_name") for ctx in contexts if isinstance(ctx, dict) and "tool_name" in ctx
+                    ]
+                    logger.debug(f"Extracted agent context: {len(contexts)} context source(s), tool_names={tool_names}")
+                else:
+                    logger.debug(f"Extracted model context: {len(contexts)} context source(s)")
 
         # response_context contains "messages", "stream", "session_state, "context"
-
+        response = await self._callback(messages=messages, stream=self._stream, session_state=None, context=context_dict)  # type: ignore
+
+        # Store token_usage before processing tuple
+        token_usage = None
+        if isinstance(response, dict) and "token_usage" in response:
+            token_usage = response["token_usage"]
+
+        if type(response) == tuple:
+            response, tool_output = response
+            request.labels["tool_calls"] = tool_output
+            # Check for token_usage in the response dict from tuple
+            if isinstance(response, dict) and "token_usage" in response:
+                token_usage = response["token_usage"]
+
+        response_text = response["messages"][-1]["content"]
 
-        response_text = response_context["messages"][-1]["content"]
         response_entry = construct_response_from_request(request=request, response_text_pieces=[response_text])
 
+        # Add token_usage to the response entry's labels (not the request)
+        if token_usage:
+            response_entry.request_pieces[0].labels["token_usage"] = token_usage
+            logger.debug(f"Captured token usage from callback: {token_usage}")
+
         logger.info("Received the following response from the prompt target" + f"{response_text}")
         return response_entry
 
```
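A hedged sketch of a user callback shaped to match what `_CallbackChatTarget` now sends and reads back per the diff above: it receives the orchestrator's context under a `"contexts"` list and may return `"token_usage"` alongside `"messages"`. The actual callback contract is defined by the red team APIs, not by this sketch:

```python
from typing import Any, Dict, List, Optional


async def my_target_callback(
    messages: List[Dict[str, Any]],
    stream: bool = False,
    session_state: Optional[str] = None,
    context: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    # The target now passes extracted context as {"contexts": [...]}; each entry
    # may carry agent-specific fields such as context_type or tool_name.
    contexts = (context or {}).get("contexts", [])

    # ... call your application or agent here, optionally using `contexts` ...
    reply = f"echo: {messages[-1]['content']} ({len(contexts)} context source(s))"

    return {
        "messages": messages + [{"role": "assistant", "content": reply}],
        "stream": stream,
        "session_state": session_state,
        "context": context,
        # Optional: surfaced token usage is attached to the response entry's labels.
        "token_usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
```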
azure/ai/evaluation/red_team/_evaluation_processor.py

```diff
@@ -25,7 +25,8 @@ from tenacity import retry
 
 # Azure AI Evaluation imports
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
+from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
 from azure.ai.evaluation._evaluate._utils import _write_output
 
 # Local imports
@@ -53,6 +54,7 @@ class EvaluationProcessor:
         retry_config,
         scan_session_id=None,
         scan_output_dir=None,
+        taxonomy_risk_categories=None,
     ):
         """Initialize the evaluation processor.
 
@@ -63,6 +65,7 @@ class EvaluationProcessor:
         :param retry_config: Retry configuration for network errors
         :param scan_session_id: Session ID for the current scan
         :param scan_output_dir: Directory for scan outputs
+        :param taxonomy_risk_categories: Dictionary mapping risk categories to taxonomy values
         """
         self.logger = logger
         self.azure_ai_project = azure_ai_project
@@ -71,6 +74,7 @@ class EvaluationProcessor:
         self.retry_config = retry_config
         self.scan_session_id = scan_session_id
         self.scan_output_dir = scan_output_dir
+        self.taxonomy_risk_categories = taxonomy_risk_categories or {}
 
     async def evaluate_conversation(
         self,
@@ -79,6 +83,7 @@ class EvaluationProcessor:
         strategy_name: str,
         risk_category: RiskCategory,
         idx: int,
+        risk_sub_type: Optional[str] = None,
     ) -> Dict:
         """Evaluate a single conversation using the specified metric and risk category.
 
@@ -92,16 +97,22 @@ class EvaluationProcessor:
         :type risk_category: RiskCategory
         :param idx: Index of the conversation for tracking purposes
         :type idx: int
+        :param risk_sub_type: Optional risk sub type for the evaluation
+        :type risk_sub_type: Optional[str]
         :return: Dictionary containing evaluation results
         :rtype: Dict
         """
         annotation_task = get_annotation_task_from_risk_category(risk_category)
+
         messages = conversation["conversation"]["messages"]
 
         # Extract all assistant messages for evaluation
         assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]
 
         context = [msg["context"] for msg in messages if msg.get("role") == "user"]
+        tool_calls = [
+            msg.get("tool_calls", []) for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls")
+        ]
 
         if assistant_messages:
             # Create query-response pair with empty query and all assistant messages
@@ -114,20 +125,47 @@ class EvaluationProcessor:
             if context[0] is not None:
                 query_response["context"] = context[0]
 
+            if tool_calls and any(tool_calls):
+                query_response["tool_calls"] = [call for sublist in tool_calls for call in sublist if call]
+
+            # Add risk_sub_type to query_response if it exists
+            if risk_sub_type:
+                query_response["risk_sub_type"] = risk_sub_type
+
+            # Add taxonomy to query_response if it exists for this risk category
+            if self.taxonomy_risk_categories and risk_category.value in self.taxonomy_risk_categories:
+                taxonomy_value = self.taxonomy_risk_categories[risk_category.value]
+                # Convert taxonomy to string if it's not already a string
+                if taxonomy_value is not None:
+                    query_response["taxonomy"] = str(taxonomy_value)
+
         try:
             self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")
 
             @retry(**self.retry_config["network_retry"])
             async def evaluate_with_rai_service_with_retry():
                 try:
-
-
-
-
-
-
-
-
+                    # Use sync_evals endpoint for OneDP projects, legacy endpoint for others
+                    if is_onedp_project(self.azure_ai_project):
+                        # If using sync API use hate_unfairness rather than hate_fairness
+                        effective_metric_name = "hate_unfairness" if metric_name == "hate_fairness" else metric_name
+                        return await evaluate_with_rai_service_sync(
+                            data=query_response,
+                            metric_name=effective_metric_name,
+                            project_scope=self.azure_ai_project,
+                            credential=self.credential,
+                            annotation_task=annotation_task,
+                            scan_session_id=self.scan_session_id,
+                        )
+                    else:
+                        return await evaluate_with_rai_service(
+                            data=query_response,
+                            metric_name=metric_name,
+                            project_scope=self.azure_ai_project,
+                            credential=self.credential,
+                            annotation_task=annotation_task,
+                            scan_session_id=self.scan_session_id,
+                        )
                 except (
                     httpx.ConnectTimeout,
                     httpx.ReadTimeout,
@@ -153,9 +191,84 @@ class EvaluationProcessor:
             # Create a row with the format expected by result processing
             row = {"inputs.conversation": {"messages": messages}}
 
-            #
+            # Check if we got an EvalRunOutputItem (from sync_evals) or legacy dict format
             risk_cat_value = risk_category.value
-
+
+            # Handle EvalRunOutputItem format (from sync_evals for OneDP)
+            if hasattr(evaluate_output, "results") or (
+                isinstance(evaluate_output, dict) and "results" in evaluate_output
+            ):
+                # Extract results from EvalRunOutputItem
+                results = (
+                    evaluate_output.results
+                    if hasattr(evaluate_output, "results")
+                    else evaluate_output.get("results", [])
+                )
+
+                # Find the result matching our metric/risk category
+                eval_result = None
+                for result_item in results:
+                    result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
+                    if result_dict.get("name") == metric_name or result_dict.get("metric") == metric_name:
+                        eval_result = result_dict
+                        break
+
+                if eval_result:
+                    # Extract values from EvalRunOutputItem result format
+                    score = eval_result.get("score", 0)
+                    reason = eval_result.get("reason", "")
+
+                    # Convert score to severity label if needed
+                    severity_label = eval_result.get("label")
+                    if severity_label is None:
+                        # Calculate severity from score
+                        from azure.ai.evaluation._common.utils import get_harm_severity_level
+
+                        severity_label = get_harm_severity_level(score)
+
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = severity_label
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = reason
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_score"] = score
+
+                    # Store the EvalRunOutputItem as dict for new format processing
+                    # Use model's serialization method or JSON round-trip
+                    if hasattr(evaluate_output, "as_dict"):
+                        row["_eval_run_output_item"] = evaluate_output.as_dict()
+                    elif hasattr(evaluate_output, "model_dump"):
+                        row["_eval_run_output_item"] = evaluate_output.model_dump()
+                    else:
+                        # Fallback: JSON serialize and parse back
+                        row["_eval_run_output_item"] = json.loads(
+                            json.dumps(evaluate_output, default=lambda o: o.__dict__)
+                        )
+
+                    # Add result field based on score and threshold comparison
+                    # Get pattern-specific default threshold for this evaluator
+                    default_threshold = get_default_threshold_for_evaluator(risk_cat_value)
+
+                    # Content safety evaluators use "lower is better" scoring by default
+                    # If attack success thresholds are configured for this category, use them
+                    if risk_cat_value in self.attack_success_thresholds:
+                        attack_threshold = self.attack_success_thresholds[risk_cat_value]
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = attack_threshold
+                        result = int(score) <= int(attack_threshold)
+                        self.logger.debug(f"Using attack success threshold {attack_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Attack Threshold: {attack_threshold}, Pass: {result}")
+                    else:
+                        # Store the default threshold in the results for transparency
+                        row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = default_threshold
+                        result = int(score) <= int(default_threshold)
+                        self.logger.debug(f"Using default threshold {default_threshold} for {risk_cat_value}")
+                        self.logger.debug(f"Score: {score}, Default Threshold: {default_threshold}, Pass: {result}")
+
+                    row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[result]
+                    self.logger.debug(
+                        f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                    )
+                    return row
+
+            # Handle legacy dict format (from evaluate_with_rai_service)
+            if evaluate_output and isinstance(evaluate_output, dict):
                 if risk_cat_value in evaluate_output:
                     row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = evaluate_output.get(risk_cat_value)
                     row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = evaluate_output.get(
@@ -167,7 +280,10 @@ class EvaluationProcessor:
 
                     # Add result field based on score and threshold comparison
                     score = evaluate_output.get(f"{risk_cat_value}_score", 0)
-
+                    # Get pattern-specific default threshold for this evaluator
+                    default_threshold = evaluate_output.get(
+                        f"{risk_cat_value}_threshold", get_default_threshold_for_evaluator(risk_cat_value)
+                    )
 
                     # Content safety evaluators use "lower is better" scoring by default
                     # If attack success thresholds are configured for this category, use them
@@ -306,6 +422,7 @@ class EvaluationProcessor:
                 strategy_name=strategy_name,
                 risk_category=risk_category,
                 idx=idx,
+                risk_sub_type=conversation.get("risk_sub_type"),
            )
             for idx, conversation in enumerate(conversations)
         ]
```