azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

```diff
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
+from ._evaluators._service_groundedness import GroundednessProEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )
 
@@ -37,6 +49,7 @@ __all__ = [
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
+    "GroundednessProEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
    "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
```
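Note: b5 surfaces the new multimodal evaluators and the service-backed "Groundedness Pro" evaluator at the package root. A minimal usage sketch, assuming an Azure AI project and a signed-in credential (project values are placeholders; check the release docs for exact constructor signatures):

```python
# Illustrative only: project details below are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator, ViolenceMultimodalEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",       # placeholder
    "resource_group_name": "<resource-group>",    # placeholder
    "project_name": "<project-name>",             # placeholder
}
credential = DefaultAzureCredential()

# Service-backed groundedness over a query/response/context triple.
groundedness_pro = GroundednessProEvaluator(
    azure_ai_project=azure_ai_project, credential=credential
)
print(groundedness_pro(
    query="What shape is the Earth?",
    response="The Earth is roughly spherical.",
    context="The Earth is an oblate spheroid.",
))

# Multimodal content safety over a conversation that mixes image and text content.
violence_mm = ViolenceMultimodalEvaluator(
    credential=credential, azure_ai_project=azure_ai_project
)
conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            {"type": "text", "text": "Describe this image."},
        ]},
        {"role": "assistant", "content": "A quiet landscape."},
    ]
}
print(violence_mm(conversation=conversation))
```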
azure/ai/evaluation/_common/constants.py
CHANGED

```diff
@@ -6,6 +6,9 @@ from enum import Enum
 from azure.core import CaseInsensitiveEnumMeta
 
 
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
 class CommonConstants:
     """Define common constants."""
 
@@ -35,6 +38,7 @@ class Tasks:
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
     XPIA = "xpia"
+    GROUNDEDNESS = "groundedness"
 
 
 class _InternalAnnotationTasks:
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
     XPIA = "xpia"
+    GROUNDEDNESS = "generic_groundedness"
 
 
 class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
```
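Note: judging by the name, `PROMPT_BASED_REASON_EVALUATORS` flags the prompt-based evaluators that emit a reason alongside their score, and the two `GROUNDEDNESS` entries wire up the service-side groundedness task and metric names. A standalone sketch of the enum semantics used here (requires azure-core; the class below is a re-creation, not an import of the real module):

```python
from enum import Enum

from azure.core import CaseInsensitiveEnumMeta

PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]

class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    XPIA = "xpia"
    GROUNDEDNESS = "generic_groundedness"

assert EvaluationMetrics["xpia"] is EvaluationMetrics.XPIA       # name lookup is case-insensitive
assert EvaluationMetrics.GROUNDEDNESS == "generic_groundedness"  # str subclass compares equal to its value
assert "groundedness" in PROMPT_BASED_REASON_EVALUATORS
```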
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
import math
|
|
6
6
|
from typing import List
|
|
7
7
|
|
|
8
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def list_sum(lst: List[float]) -> float:
|
|
10
12
|
return sum(lst)
|
|
@@ -15,4 +17,13 @@ def list_mean(lst: List[float]) -> float:
|
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
def list_mean_nan_safe(lst: List[float]) -> float:
|
|
20
|
+
msg = "All score values are NaN. The mean cannot be calculated."
|
|
21
|
+
if all(math.isnan(l) for l in lst):
|
|
22
|
+
raise EvaluationException(
|
|
23
|
+
message=msg,
|
|
24
|
+
internal_message=msg,
|
|
25
|
+
blame=ErrorBlame.USER_ERROR,
|
|
26
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
27
|
+
target=ErrorTarget.CONVERSATION,
|
|
28
|
+
)
|
|
18
29
|
return list_mean([l for l in lst if not math.isnan(l)])
|
|
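Note: `list_mean_nan_safe` now rejects all-NaN input with a user-facing `EvaluationException` instead of failing downstream when the NaN filter leaves nothing to average. A standalone sketch of the new behavior (re-implemented here with a stand-in exception so it runs without the package):

```python
import math
from typing import List

class EvaluationException(Exception):
    """Stand-in for azure.ai.evaluation._exceptions.EvaluationException."""

def list_mean(lst: List[float]) -> float:
    return sum(lst) / len(lst)

def list_mean_nan_safe(lst: List[float]) -> float:
    # All-NaN input is now rejected explicitly rather than failing downstream.
    if all(math.isnan(v) for v in lst):
        raise EvaluationException("All score values are NaN. The mean cannot be calculated.")
    return list_mean([v for v in lst if not math.isnan(v)])

print(list_mean_nan_safe([4.0, math.nan, 2.0]))  # 3.0 -- NaNs are dropped
try:
    list_mean_nan_safe([math.nan, math.nan])
except EvaluationException as exc:
    print(exc)  # All score values are NaN. The mean cannot be calculated.
```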
azure/ai/evaluation/_common/rai_service.py
CHANGED

```diff
@@ -9,13 +9,16 @@ import time
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template
 
 import jwt
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
@@ -23,10 +26,9 @@ from .constants import (
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +36,11 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+}
+
 
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
```
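Note: the new `USER_TEXT_TEMPLATE_DICT` keys the user-text payload format by annotation task: the groundedness task sends a JSON-shaped string with question/answer/context, while every other task keeps the legacy `<Human>`/`<System>` wrapper. The substitution is plain `string.Template`, as this standalone sketch shows:

```python
from string import Template

# Same templates as above, keyed by task name ("groundedness" is Tasks.GROUNDEDNESS).
USER_TEXT_TEMPLATE_DICT = {
    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}

data = {"query": "What shape is the Earth?", "response": "Round.",
        "context": "The Earth is an oblate spheroid."}

print(USER_TEXT_TEMPLATE_DICT["groundedness"].substitute(**data))
# {"question": "What shape is the Earth?", "answer": "Round.", "context": "The Earth is an oblate spheroid."}

print(USER_TEXT_TEMPLATE_DICT["DEFAULT"].substitute(query="hi", response="hello"))
# <Human>{hi}</><System>{hello}</>
```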
```diff
@@ -99,7 +106,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     )
 
 
-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -107,19 +114,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -135,25 +141,25 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{query}</><System>{response}</>"
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
     normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
@@ -164,7 +170,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     if http_response.status_code != 202:
         print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
         http_response.raise_for_status()
-
     result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
```
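Note: `submit_request` now templates a caller-supplied `data` dict into user text instead of taking `query`/`response` positionally, then normalizes it by rewriting single quotes as escaped double quotes before building the payload. A quick standalone check of those two steps:

```python
from string import Template

# Same DEFAULT template as in the diff above.
template = Template("<Human>{$query}</><System>{$response}</>")
data = {"query": "What's the capital of France?", "response": "It's Paris."}

user_text = template.substitute(**data)
normalized_user_text = user_text.replace("'", '\\"')
print(normalized_user_text)
# <Human>{What\"s the capital of France?}</><System>{It\"s Paris.}</>
```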
```diff
@@ -208,7 +213,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
 
 
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -216,11 +221,20 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
@@ -230,42 +244,50 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     result = {}
     # Use label instead of score since these are assumed to be boolean results.
     # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
-    result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-    result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+    result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+    result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
     if metric_name == EvaluationMetrics.XPIA:
         # Add "manipulated_content", "intrusion" and "information_gathering" to the result
         # if present else set them to math.nan
-        result[metric_name + "_manipulated_content"] = (
+        result[metric_display_name + "_manipulated_content"] = (
             parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
         )
-        result[metric_name + "_intrusion"] = (
+        result[metric_display_name + "_intrusion"] = (
             parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
         )
-        result[metric_name + "_information_gathering"] = (
+        result[metric_display_name + "_information_gathering"] = (
             parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
         )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
 
-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": math.nan}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
```
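Note: the new `metric_display_name` parameter lets a caller (for example, the service-groundedness evaluator) report results under a friendlier key prefix than the raw service metric name, so a response parsed with `metric_name="generic_groundedness"` can come back keyed as, say, `groundedness_pro`/`_score`/`_reason`. A toy illustration of the key-seeding logic (hypothetical display name, standalone):

```python
import math

def build_default_result(key: str) -> dict:
    # Mirrors the seeded result dict in _parse_content_harm_response:
    # every metric starts as NaN until the service response fills it in.
    return {key: math.nan, f"{key}_score": math.nan, f"{key}_reason": math.nan}

print(build_default_result("groundedness_pro"))
# {'groundedness_pro': nan, 'groundedness_pro_score': nan, 'groundedness_pro_reason': nan}
```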
```diff
@@ -283,7 +305,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
 
     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
         metric_value = 0 if harm_response["valid"] else math.nan
     else:
@@ -314,6 +336,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
         reason = ""
 
     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -412,14 +436,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
 
 
 async def evaluate_with_rai_service(
-    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
 ) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +454,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential:
         ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +465,117 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
 
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
     return result
```