azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/rai_service.py +30 -21
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +9 -34
- azure/ai/evaluation/_evaluate/_utils.py +66 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +71 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +33 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +3 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +3 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +15 -10
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +33 -29
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_eval_run.py

@@ -22,29 +22,12 @@ from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
 from azure.core.exceptions import HttpResponseError
+from azure.storage.blob import BlobServiceClient
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)


-# Handle optional import. The azure libraries are only present if
-# promptflow-azure is installed.
-try:
-    from azure.ai.ml import MLClient
-    from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
-    from azure.ai.ml.entities._datastore.datastore import Datastore
-    from azure.storage.blob import BlobServiceClient
-except (ModuleNotFoundError, ImportError):
-    raise EvaluationException(  # pylint: disable=raise-missing-from
-        message=(
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        ),
-        target=ErrorTarget.EVALUATE,
-        category=ErrorCategory.MISSING_PACKAGE,
-        blame=ErrorBlame.USER_ERROR,
-    )
-
-
 @dataclasses.dataclass
 class RunInfo:
     """
@@ -93,15 +76,16 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     :type group_name: str
     :param workspace_name: The name of workspace/project used to track run.
     :type workspace_name: str
-    :param
-    :type
+    :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+    :type management_client:
+        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
+    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
     """

     _MAX_RETRIES = 5
     _BACKOFF_FACTOR = 2
     _TIMEOUT = 5
-    _SCOPE = "https://management.azure.com/.default"

     EVALUATION_ARTIFACT = "instance_results.jsonl"

@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         subscription_id: str,
         group_name: str,
         workspace_name: str,
-
+        management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
         self._resource_group_name: str = group_name
         self._workspace_name: str = workspace_name
-        self.
+        self._management_client: LiteMLClient = management_client
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run
@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if self._promptflow_run is not None:
             self._info = RunInfo(
                 self._promptflow_run.name,
-                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run._experiment_name or "",  # pylint: disable=protected-access
                 self._promptflow_run.name,
             )
         else:
@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"

-    def _get_token(self):
-
-        # is an optional dependency.
-        from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
-
-        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
+    def _get_token(self) -> str:
+        return self._management_client.get_token()

     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -441,9 +421,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
            local_paths.append(local_file_path)

        # We will write the artifacts to the workspaceblobstore
-        datastore = self.
+        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-
+
+        svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
         try:
             for local, remote in zip(local_paths, remote_paths["paths"]):
                 blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
@@ -515,16 +496,6 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)

-    def _get_datastore_credential(self, datastore: "Datastore"):
-        # Reference the logic in azure.ai.ml._artifact._artifact_utilities
-        # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
-        credential = datastore.credentials
-        if isinstance(credential, AccountKeyConfiguration):
-            return credential.account_key
-        if hasattr(credential, "sas_token"):
-            return credential.sas_token
-        return self._ml_client.datastores._credential  # pylint: disable=protected-access
-
     def log_metric(self, key: str, value: float) -> None:
         """
         Log the metric to azure similar to how it is done by mlflow.
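
Taken together, the _eval_run.py hunks swap the optional azure-ai-ml / promptflow-azure dependency for the package's new internal LiteMLClient. Below is a minimal sketch of how EvalRun is now wired up, based only on the signatures visible in this diff; the workspace coordinates are placeholders, and the modules involved are private, so this is illustrative rather than a supported public API.

    import logging

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation._azure._clients import LiteMLClient
    from azure.ai.evaluation._evaluate._eval_run import EvalRun

    logger = logging.getLogger(__name__)

    # Placeholder workspace coordinates; substitute real values.
    subscription_id = "<subscription-id>"
    resource_group = "<resource-group>"
    workspace_name = "<workspace-name>"

    # LiteMLClient constructor arguments as they appear in the _utils.py hunks below.
    management_client = LiteMLClient(
        subscription_id=subscription_id,
        resource_group=resource_group,
        logger=logger,
        credential=DefaultAzureCredential(),  # optional; the client can resolve credentials itself
    )

    # The MLflow tracking URI is now read from the workspace info instead of promptflow-azure.
    tracking_uri = management_client.workspace_get_info(workspace_name).ml_flow_tracking_uri

    with EvalRun(
        run_name="example-eval-run",
        tracking_uri=tracking_uri,
        subscription_id=subscription_id,
        group_name=resource_group,
        workspace_name=workspace_name,
        management_client=management_client,
    ) as ev_run:
        ev_run.log_metric("example_metric", 1.0)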

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -10,7 +10,6 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, T

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run

@@ -21,7 +20,6 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -468,33 +466,14 @@ def _apply_target_to_data(
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
     _run_name = kwargs.get("_run_name")
-
-
-
-
-
-
-
-
-                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                stream=True,
-                name=_run_name,
-            )
-        except (UserAuthenticationError, UploadInternalError) as ex:
-            if "Failed to upload run" in ex.message:
-                msg = (
-                    "Failed to upload the target run to the cloud. "
-                    "This may be caused by insufficient permission to access storage or other errors."
-                )
-                raise EvaluationException(
-                    message=msg,
-                    target=ErrorTarget.EVALUATE,
-                    category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                    blame=ErrorBlame.USER_ERROR,
-                    tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-                ) from ex
-
-            raise ex
+    with TargetRunContext():
+        run: Run = pf_client.run(
+            flow=target,
+            display_name=evaluation_name,
+            data=data,
+            stream=True,
+            name=_run_name,
+        )

     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
@@ -834,11 +813,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     studio_url = None
     if trace_destination:
         studio_url = _log_metrics_and_instance_results(
-            metrics,
-            result_df,
-            trace_destination,
-            target_run,
-            evaluation_name,
+            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
         )

     result_df_dict = result_df.to_dict("records")
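
The _evaluate.py hunks above drop the promptflow upload-error handling around the target run and forward **kwargs down into _log_metrics_and_instance_results, where (per the _utils.py hunks below) an optional credential is read. A hedged sketch of the resulting call pattern, assuming the public evaluate entry point forwards extra keyword arguments the same way _evaluate does; the data file, project values, and column contents are placeholders.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    # Hypothetical call site: the explicit credential is what
    # _log_metrics_and_instance_results picks up via kwargs.get("credential")
    # and hands to LiteMLClient for remote tracking.
    results = evaluate(
        data="eval_data.jsonl",  # placeholder; must contain the columns the evaluator expects
        evaluators={"f1_score": F1ScoreEvaluator()},
        azure_ai_project={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        credential=DefaultAzureCredential(),
    )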

azure/ai/evaluation/_evaluate/_utils.py

@@ -7,12 +7,6 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional,
+from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64

 import pandas as pd
-from promptflow.client import PFClient
 from promptflow.entities import Run

 from azure.ai.evaluation._constants import (
@@ -23,6 +22,8 @@ from azure.ai.evaluation._constants import (
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)

@@ -45,6 +46,8 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
+    from promptflow._cli._utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -58,10 +61,20 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")


 def load_jsonl(path):
@@ -69,19 +82,6 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]


-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
-    from promptflow.azure._cli._utils import _get_azure_pf_client
-
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
-    )
-
-    return azure_pf_client, ws_triad
-
-
 def _store_multimodal_content(messages, tmpdir: str):
     # verify if images folder exists
     images_folder_path = os.path.join(tmpdir, "images")
@@ -91,23 +91,40 @@ def _store_multimodal_content(messages, tmpdir: str):
     for message in messages:
         if isinstance(message.get("content", []), list):
             for content in message.get("content", []):
-
-
-                if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                    # Extract the base64 string
-                    base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+                process_message_content(content, images_folder_path)
+

-
-
-
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")

-
-
+        if not image_url or "url" not in image_url:
+            return None

-
-
-
-
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None


 def _log_metrics_and_instance_results(
@@ -116,6 +133,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Run,
     evaluation_name: Optional[str],
+    **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun

@@ -123,19 +141,26 @@
         LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None

-
-
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri

     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values

     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-
+        management_client=management_client,
         promptflow_run=run,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
@@ -166,9 +191,16 @@
                 properties={
                     EvaluationRunProperties.RUN_TYPE: "eval_run",
                     EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                 }
             )
+        else:
+            ev_run.write_properties_to_run_history(
+                properties={
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )

         for metric_name, metric_value in metrics.items():
             ev_run.log_metric(metric_name, metric_value)
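
The rewritten _store_multimodal_content delegates per-item handling to the new module-level process_message_content helper, which accepts any data:image/<ext>;base64 payload instead of only data:image/jpg. A small sketch of calling the helper directly; it lives in a private module, so this is illustration only, and the payload bytes are arbitrary because the helper simply base64-decodes and writes them.

    import base64
    import os
    import tempfile

    from azure.ai.evaluation._evaluate._utils import process_message_content

    # Any bytes will do: the helper only base64-decodes the payload and writes it to disk.
    payload = base64.b64encode(b"fake image bytes").decode()
    content = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{payload}"}}

    with tempfile.TemporaryDirectory() as tmpdir:
        images_folder = os.path.join(tmpdir, "images")
        os.makedirs(images_folder, exist_ok=True)
        process_message_content(content, images_folder)
        print(content["image_url"]["url"])  # rewritten to e.g. "images/<uuid>.png"
        print(os.listdir(images_folder))    # the decoded file on disk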

azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -12,6 +12,7 @@ from typing_extensions import ParamSpec, TypeAlias, get_overloads
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -202,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
@@ -210,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.

         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self.
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.

         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 target=ErrorTarget.CONVERSATION,
             )

+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.

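
For reference, this is the shape of conversation that the new _is_multi_modal_conversation check classifies as multi-modal and that multi_modal_converter splits into per-turn inputs. The payload below is an illustrative example, not taken from the package documentation.

    # One user/assistant pair with an image part in the user turn.
    multi_modal_conversation = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
                ],
            },
            {"role": "assistant", "content": "The image shows a placeholder graphic."},
        ]
    }
    # _is_multi_modal_conversation returns True because a list-valued "content"
    # contains an item with type == "image_url" and an image_url["url"] entry.
    # multi_modal_converter then yields one {"conversation": Conversation(...)} input
    # per user/assistant pair, enforcing exactly one assistant message per conversation.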

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -10,6 +10,7 @@ from promptflow.core import AsyncPrompty
 from typing_extensions import override

 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase

@@ -47,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)

+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-
+            user_agent,
         )

         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -69,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -11,9 +11,10 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential

 from . import EvaluatorBase
@@ -81,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
@@ -111,6 +142,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             project_scope=self._azure_ai_project,
             credential=self._credential,
             annotation_task=self._get_task(),
+            evaluator_name=self.__class__.__name__,
         )

     def _get_task(self):

azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -20,7 +20,7 @@ from ._violence import ViolenceEvaluator
 @experimental
 class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -47,7 +47,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
         super().__init__()
-        self._parallel = kwargs.pop("_parallel",
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -130,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
         with ThreadPoolExecutor() as executor:
             # pylint: disable=no-value-for-parameter
             futures = {
-                executor.submit(query=query, response=response, conversation=conversation): evaluator
+                executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                 for evaluator in self._evaluators
             }

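
A hedged usage sketch tying these hunks together: ContentSafetyEvaluator (and the RAI-service evaluators it wraps) can be called either with query/response strings or with a conversation, and conversations containing image_url content are routed through the new evaluate_with_rai_service_multimodal path. The project values, credential, and conversation below are placeholders.

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation import ContentSafetyEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    safety_eval = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    # Text-only QA input.
    text_result = safety_eval(query="What is the capital of France?", response="Paris.")

    # Multi-modal conversation input (same shape as the payload shown earlier).
    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
                ],
            },
            {"role": "assistant", "content": "It is a placeholder graphic."},
        ]
    }
    conversation_result = safety_eval(conversation=conversation)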

azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -80,7 +80,7 @@ class _AsyncSimilarityEvaluator:

 class SimilarityEvaluator:
     """
-    Evaluates similarity score for a given query, response, and ground truth
+    Evaluates similarity score for a given query, response, and ground truth.

     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both

azure/ai/evaluation/_http_utils.py

@@ -448,19 +448,21 @@ class AsyncHttpPipeline(AsyncPipeline):
         return cast(Self, await super().__aenter__())


-def get_http_client() -> HttpPipeline:
+def get_http_client(**kwargs: Any) -> HttpPipeline:
     """Get an HttpPipeline configured with common policies.

     :returns: An HttpPipeline with a set of applied policies:
     :rtype: HttpPipeline
     """
-
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return HttpPipeline(**kwargs)


-def get_async_http_client() -> AsyncHttpPipeline:
+def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
     """Get an AsyncHttpPipeline configured with common policies.

     :returns: An AsyncHttpPipeline with a set of applied policies:
     :rtype: AsyncHttpPipeline
     """
-
+    kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+    return AsyncHttpPipeline(**kwargs)
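
get_http_client and get_async_http_client now accept arbitrary pipeline keyword arguments while defaulting the user-agent policy via kwargs.setdefault. A short sketch, assuming direct use of the private azure.ai.evaluation._http_utils module (illustration only):

    from azure.core.pipeline.policies import UserAgentPolicy

    from azure.ai.evaluation._http_utils import get_http_client

    # Default behaviour: the package's USER_AGENT is applied via kwargs.setdefault(...).
    default_client = get_http_client()

    # Overriding the default: a policy passed explicitly wins over the setdefault call.
    custom_client = get_http_client(
        user_agent_policy=UserAgentPolicy(base_user_agent="my-app/1.0")
    )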

azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py

@@ -32,10 +32,6 @@ ROUGE-1.5.5.pl -m -e data -n 2 -a settings.xml
 In these examples settings.xml lists input files and formats.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import collections
 import re


azure/ai/evaluation/_vendor/rouge_score/scoring.py

@@ -21,10 +21,6 @@ Aggregation functions use bootstrap resampling to compute confidence intervals
 as per the original ROUGE perl implementation.
 """

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import abc
 import collections
 from typing import Dict