azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (43)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/utils.py +24 -9
  3. azure/ai/evaluation/_constants.py +4 -0
  4. azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
  5. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
  6. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  7. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  8. azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  9. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  10. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  11. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  12. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  13. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  14. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  15. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  16. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  17. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
  18. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  19. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
  20. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  21. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  22. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
  23. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  24. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  25. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
  26. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  27. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
  28. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  29. azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
  30. azure/ai/evaluation/_exceptions.py +0 -1
  31. azure/ai/evaluation/_model_configurations.py +36 -8
  32. azure/ai/evaluation/_version.py +1 -1
  33. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  34. azure/ai/evaluation/simulator/_simulator.py +19 -8
  35. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
  36. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
  37. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  38. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  39. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  40. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  41. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  42. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
  43. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py

@@ -4,10 +4,8 @@
 
 from ._evaluate._evaluate import evaluate
 from ._evaluators._bleu import BleuScoreEvaluator
-from ._evaluators._chat import ChatEvaluator
 from ._evaluators._coherence import CoherenceEvaluator
 from ._evaluators._content_safety import (
-    ContentSafetyChatEvaluator,
     ContentSafetyEvaluator,
     HateUnfairnessEvaluator,
     SelfHarmEvaluator,
@@ -22,10 +20,16 @@ from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
 from ._evaluators._relevance import RelevanceEvaluator
+from ._evaluators._retrieval import RetrievalEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+    EvaluatorConfig,
+)
 
 __all__ = [
     "evaluate",
@@ -36,21 +40,21 @@ __all__ = [
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
-    "ChatEvaluator",
     "ViolenceEvaluator",
     "SexualEvaluator",
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
     "IndirectAttackEvaluator",
     "BleuScoreEvaluator",
     "GleuScoreEvaluator",
     "MeteorScoreEvaluator",
+    "RetrievalEvaluator",
     "RougeScoreEvaluator",
     "RougeType",
     "ProtectedMaterialEvaluator",
     "AzureAIProject",
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
+    "EvaluatorConfig",
 ]
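
Net effect of these `__init__.py` hunks: `ChatEvaluator` and `ContentSafetyChatEvaluator` are no longer exported, while `RetrievalEvaluator` (lifted out of the former `_chat/retrieval` sub-package, per the renames in the file list) and `EvaluatorConfig` become top-level exports. A minimal import sketch against the 1.0.0b3 surface shown in the updated `__all__`:

```python
# Symbols below come straight from the new __all__; ChatEvaluator and
# ContentSafetyChatEvaluator are gone in 1.0.0b3 and would raise ImportError.
from azure.ai.evaluation import (
    evaluate,
    CoherenceEvaluator,
    RetrievalEvaluator,   # newly exported; previously lived under _chat/retrieval
    EvaluatorConfig,      # new export used to type evaluate()'s evaluator_config
)
```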
azure/ai/evaluation/_common/utils.py

@@ -3,12 +3,13 @@
 # ---------------------------------------------------------
 
 import threading
-from typing import List, Optional, Union
+from typing import List, Union
 
 import nltk
 import numpy as np
 
 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 
 from . import constants
 
@@ -70,18 +71,32 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def ensure_api_version_in_aoai_model_config(
+def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-    default_api_version: str,
 ) -> None:
     if "azure_endpoint" in model_config or "azure_deployment" in model_config:
-        model_config["api_version"] = model_config.get("api_version", default_api_version)
+        model_config["type"] = AZURE_OPENAI_TYPE
+    else:
+        model_config["type"] = OPENAI_TYPE
 
 
-def ensure_user_agent_in_aoai_model_config(
+def construct_prompty_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-    prompty_model_config: dict,
-    user_agent: Optional[str] = None,
-) -> None:
-    if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
+    default_api_version: str,
+    user_agent: str,
+) -> dict:
+    parse_model_config_type(model_config)
+
+    if model_config["type"] == AZURE_OPENAI_TYPE:
+        model_config["api_version"] = model_config.get("api_version", default_api_version)
+
+    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+    # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+    # https://github.com/encode/httpx/discussions/2959
+    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+    if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+
+    return prompty_model_config
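
A hedged sketch of how the reworked helper can be exercised, based only on the signature and body shown above. Note that `_common.utils` is a private module, the endpoint/deployment/user-agent values are placeholders, and `"2024-02-15-preview"` simply mirrors the default API version that appears elsewhere in this diff:

```python
from azure.ai.evaluation._common.utils import construct_prompty_model_config

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}

prompty_config = construct_prompty_model_config(
    model_config,
    default_api_version="2024-02-15-preview",  # default taken from the evaluators in this diff
    user_agent="my-app/0.1",                   # illustrative user-agent string
)

# Per the function body above, model_config now carries "type": "azure_openai" and an
# "api_version", and prompty_config has the shape:
# {"configuration": model_config,
#  "parameters": {"extra_headers": {"Connection": "close", "x-ms-useragent": "my-app/0.1"}}}
```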
azure/ai/evaluation/_constants.py

@@ -57,3 +57,7 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
+
+AZURE_OPENAI_TYPE = "azure_openai"
+
+OPENAI_TYPE = "openai"
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -19,7 +19,7 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
@@ -158,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]
 
     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
@@ -273,7 +279,7 @@
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -284,8 +290,8 @@
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +312,7 @@
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -372,11 +378,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -385,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -439,7 +445,7 @@
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +464,10 @@
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +488,7 @@
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +503,19 @@
            },
            evaluator_config={
                "coherence": {
-                   "response": "${data.response}",
-                   "query": "${data.query}"
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "query": "${data.query}",
+                   },
                },
                "relevance": {
-                   "response": "${data.response}",
-                   "context": "${data.context}",
-                   "query": "${data.query}"
-               }
-           }
+                   "column_mapping": {
+                       "response": "${data.response}",
+                       "context": "${data.context}",
+                       "query": "${data.query}",
+                   },
+               },
+           },
        )
 
    """
@@ -544,13 +554,13 @@
         raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
    data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -560,8 +570,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -577,8 +592,8 @@
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-    evaluator_config = evaluator_config or {}
-    evaluator_config.setdefault("default", {})
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
@@ -586,21 +601,21 @@
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
     # After we have generated all columns we can check if we have
     # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -610,13 +625,16 @@
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-            evaluator_config["default"][col] = f"${{data.{col}}}"
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
@@ -632,7 +650,7 @@
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
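
The `evaluate()` changes above amount to a breaking change in `evaluator_config`: per-evaluator column mappings now sit under a nested `"column_mapping"` key, which `_evaluate` extracts before calling `_process_column_mappings`. A minimal before/after sketch (evaluator names and columns are illustrative):

```python
# 1.0.0b2 style: column mappings were passed directly per evaluator.
evaluator_config_b2 = {
    "coherence": {
        "response": "${data.response}",
        "query": "${data.query}",
    },
}

# 1.0.0b3 style: the same mappings are nested under "column_mapping",
# matching the updated docstring example above.
evaluator_config_b3 = {
    "coherence": {
        "column_mapping": {
            "response": "${data.response}",
            "query": "${data.query}",
        },
    },
}
```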
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -1,77 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncCoherenceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "coherence.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+from typing import Optional
+from typing_extensions import override
 
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-
-        if not (query.strip() and response.strip()):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.INVALID_VALUE,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.COHERENCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_coherence": float(score)}
-
-
-class CoherenceEvaluator:
+class CoherenceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a coherence evaluator configured for a specific Azure OpenAI model.
 
@@ -97,21 +34,37 @@ class CoherenceEvaluator:
     }
     """
 
-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
+    PROMPTY_FILE = "coherence.prompty"
+    RESULT_KEY = "gpt_coherence"
 
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluate coherence.
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate coherence. Accepts either a query and response for a single evaluation,
+        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+        turns, the evaluator will aggregate the results of each turn.
 
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The coherence score.
-        :rtype: Dict[str, float]
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
+        :rtype: dict
         """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
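
Based on the new `__call__` signature and docstring, the refactored evaluator keeps the b2-style single-turn call and adds a `conversation` form whose per-turn results are aggregated. A short usage sketch (the model configuration values are placeholders):

```python
from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}
coherence = CoherenceEvaluator(model_config=model_config)

# Single-turn evaluation, unchanged from 1.0.0b2; returns {"gpt_coherence": <score>}.
single = coherence(query="What is the capital of France?", response="Paris.")

# New in 1.0.0b3: pass a conversation with "messages" made of "role"/"content" dicts;
# per the docstring, results are aggregated across turns.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]
}
multi = coherence(conversation=conversation)
```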
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

@@ -3,11 +3,6 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_common/__init__.py (new file)

@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]
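
The new `_common` package collects the shared machinery (`EvaluatorBase`, `PromptyEvaluatorBase`, `RaiServiceEvaluatorBase`) that the concrete evaluators in this release now subclass. As an illustration only, and assuming the base-class keywords inferred from the `CoherenceEvaluator` refactor earlier in this diff, a prompty-backed evaluator reduces to roughly:

```python
import os

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class MyMetricEvaluator(PromptyEvaluatorBase):  # hypothetical evaluator, not part of the package
    PROMPTY_FILE = "my_metric.prompty"  # assumed to exist next to this module
    RESULT_KEY = "gpt_my_metric"

    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self.PROMPTY_FILE)
        # Keyword names mirror CoherenceEvaluator.__init__ as shown earlier in this diff.
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
```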