azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of azure-ai-evaluation has been flagged as a potentially problematic release; details are available on the registry page.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py:

@@ -1,64 +1,22 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
 
-class _AsyncProtectedMaterialEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result
-
-
-class ProtectedMaterialEvaluator:
+class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a protected material evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]
 
@@ -84,21 +42,16 @@ class ProtectedMaterialEvaluator:
        }
    """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
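The protected material evaluator above now inherits from the shared RaiServiceEvaluatorBase instead of wrapping its own async helper, and credential becomes the first, required constructor argument. A minimal usage sketch of the new shape, assuming the class is exported from the package root and that the azure_ai_project dict uses the field names implied by its docstring; the result keys are not shown in this diff:

    # Sketch only: dict keys and result fields below are assumptions, not taken from this diff.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    protected_material = ProtectedMaterialEvaluator(
        credential=DefaultAzureCredential(),  # credential is now required and listed first
        azure_ai_project=azure_ai_project,
    )

    # The base class drives the call against the RAI service; the evaluator is invoked synchronously.
    result = protected_material(
        query="Print the full lyrics of a popular song.",
        response="Sorry, I can't reproduce copyrighted lyrics.",
    )
    print(result)  # expected: a label plus AI-generated reasoning, per the class docstring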
azure/ai/evaluation/_evaluators/_qa/_qa.py:

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
@@ -53,7 +54,7 @@ class QAEvaluator:
     def __init__(self, model_config: dict, parallel: bool = True):
         self._parallel = parallel
 
-        self._evaluators = [
+        self._evaluators: List[Callable[..., Dict[str, float]]] = [
             GroundednessEvaluator(model_config),
             RelevanceEvaluator(model_config),
             CoherenceEvaluator(model_config),
@@ -77,9 +78,9 @@ class QAEvaluator:
         :keyword parallel: Whether to evaluate in parallel. Defaults to True.
         :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype:
+        :rtype: Dict[str, float]
         """
-        results = {}
+        results: Dict[str, float] = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
                 futures = {
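The QAEvaluator hunks above only touch typing, but for orientation: its __call__ fans the listed sub-evaluators out over a thread pool and merges their per-metric dictionaries. A self-contained sketch of that pattern, using the standard-library executor and stand-in evaluator functions rather than promptflow's ThreadPoolExecutorWithContext and the real evaluator classes:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import Callable, Dict, List

    # Stand-ins for GroundednessEvaluator, RelevanceEvaluator, etc.
    def fake_groundedness(**kwargs) -> Dict[str, float]:
        return {"gpt_groundedness": 4.0}

    def fake_relevance(**kwargs) -> Dict[str, float]:
        return {"gpt_relevance": 5.0}

    evaluators: List[Callable[..., Dict[str, float]]] = [fake_groundedness, fake_relevance]

    results: Dict[str, float] = {}
    with ThreadPoolExecutor() as executor:
        # Submit every evaluator with the same inputs, then merge results as they finish.
        futures = {executor.submit(ev, query="q", response="r", context="c"): ev for ev in evaluators}
        for future in as_completed(futures):
            results.update(future.result())

    print(results)  # {'gpt_groundedness': 4.0, 'gpt_relevance': 5.0}, in completion order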
azure/ai/evaluation/_evaluators/_relevance/_relevance.py:

@@ -3,78 +3,14 @@
 # ---------------------------------------------------------
 
 import os
-import re
+from typing import Optional
 
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import override
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
 
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
-
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
-
-
-class RelevanceEvaluator:
+class RelevanceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a relevance evaluator configured for a specific Azure OpenAI model.
 
@@ -102,25 +38,41 @@ class RelevanceEvaluator:
        }
    """
 
-
-
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    RESULT_KEY = "gpt_relevance"
 
-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate relevance. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
 
         :keyword query: The query to be evaluated.
-        :paramtype query: str
+        :paramtype query: Optional[str]
         :keyword response: The response to be evaluated.
-        :paramtype response: str
+        :paramtype response: Optional[str]
        :keyword context: The context to be evaluated.
-        :paramtype context: str
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
        :return: The relevance score.
-        :rtype:
+        :rtype: Dict[str, float]
        """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
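A hedged usage sketch of the reworked RelevanceEvaluator: per the new signature and docstring it accepts either query/response/context for a single turn, or a conversation whose turns sit under "messages", with per-turn results aggregated. The model_config keys below are placeholders in the Azure OpenAI configuration shape used elsewhere in the package:

    from azure.ai.evaluation import RelevanceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment>",
    }

    relevance = RelevanceEvaluator(model_config)

    # Single-turn evaluation: pass query, response, and context directly.
    single = relevance(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is the capital of Japan.",
    )

    # Multi-turn evaluation: pass a conversation; turns live under "messages".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is the capital of Japan."},
        ]
    }
    aggregated = relevance(conversation=conversation)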
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty:

@@ -3,11 +3,6 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
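With the configuration block gone, the prompty no longer resolves its Azure OpenAI connection from AZURE_* environment-variable placeholders; the connection is injected when the prompty is loaded, as the Python hunks above show. A rough sketch of that flow with placeholder values (the configuration shape mirrors the removed block):

    from promptflow.core import AsyncPrompty

    # Shape taken from the removed inline code: a "configuration" section plus extra headers.
    prompty_model_config = {
        "configuration": {
            "type": "azure_openai",
            "azure_deployment": "<deployment>",
            "api_key": "<api-key>",
            "azure_endpoint": "https://<resource>.openai.azure.com",
        },
        "parameters": {"extra_headers": {"Connection": "close"}},
    }

    # The evaluators load their .prompty next to the module and await the returned flow.
    flow = AsyncPrompty.load(source="relevance.prompty", model=prompty_model_config)
    # llm_output = await flow(query=..., response=..., context=..., timeout=600)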
azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py:

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._chat import ChatEvaluator
+from ._retrieval import RetrievalEvaluator
 
 __all__ = [
-    "ChatEvaluator",
+    "RetrievalEvaluator",
 ]
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py:

@@ -4,41 +4,37 @@
 
 import json
 import logging
+import math
 import os
 import re
+from typing import Union
 
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
-from
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+
+from ..._common.math import list_mean_nan_safe
+from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 logger = logging.getLogger(__name__)
 
 try:
-    from
+    from .._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
+    USER_AGENT = "None"
 
 
-class _AsyncRetrievalChatEvaluator:
+class _AsyncRetrievalScoreEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
     PROMPTY_FILE = "retrieval.prompty"
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+    def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-            prompty_model_config,
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -76,7 +72,7 @@ class _AsyncRetrievalChatEvaluator:
             llm_output = await self._flow(
                 query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
             )
-            score = np.nan
+            score = math.nan
             if llm_output:
                 parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
                 if len(parsed_score_response) > 0:
@@ -89,10 +85,10 @@ class _AsyncRetrievalChatEvaluator:
                 "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
             )
 
-            per_turn_scores.append(np.nan)
+            per_turn_scores.append(math.nan)
 
         return {
-            "gpt_retrieval": np.nanmean(per_turn_scores),
+            "gpt_retrieval": list_mean_nan_safe(per_turn_scores),
             "evaluation_per_turn": {
                 "gpt_retrieval": {
                     "score": per_turn_scores,
@@ -101,7 +97,7 @@ class _AsyncRetrievalChatEvaluator:
        }
 
 
-class RetrievalChatEvaluator:
+class RetrievalEvaluator:
     """
     Initialize an evaluator configured for a specific Azure OpenAI model.
 
@@ -110,11 +106,12 @@ class RetrievalChatEvaluator:
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
+
     **Usage**
 
     .. code-block:: python
 
-        chat_eval = RetrievalChatEvaluator(model_config)
+        chat_eval = RetrievalScoreEvaluator(model_config)
         conversation = [
             {"role": "user", "content": "What is the value of 2 + 2?"},
             {"role": "assistant", "content": "2 + 2 = 4", "context": {
@@ -130,18 +127,18 @@ class RetrievalChatEvaluator:
 
     .. code-block:: python
 
-
-
-
-
-
+        {
+            "gpt_retrieval": 3.0
+            "evaluation_per_turn": {
+                "gpt_retrieval": {
+                    "score": [1.0, 2.0, 3.0]
+                }
            }
        }
-    }
    """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncRetrievalChatEvaluator(model_config)
+        self._async_evaluator = _AsyncRetrievalScoreEvaluator(validate_model_config(model_config))
 
     def __call__(self, *, conversation, **kwargs):
         """Evaluates retrieval score chat scenario.
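The per-turn aggregation now goes through list_mean_nan_safe from the new _common/math.py, whose body is not part of this diff. A sketch of what a NaN-safe list mean plausibly looks like; the empty-input behavior here is an assumption:

    import math
    from typing import List

    def list_mean_nan_safe(values: List[float]) -> float:
        """Average the non-NaN entries of a list; NaN when nothing usable remains (assumed behavior)."""
        usable = [v for v in values if not math.isnan(v)]
        if not usable:
            return math.nan
        return sum(usable) / len(usable)

    print(list_mean_nan_safe([1.0, 2.0, math.nan, 3.0]))  # 2.0
    print(list_mean_nan_safe([math.nan]))                 # nan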
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty:

@@ -3,11 +3,6 @@ name: Retrieval
 description: Evaluates retrieval score for Chat scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     top_p: 1.0
azure/ai/evaluation/_evaluators/_similarity/_similarity.py:

@@ -2,21 +2,23 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import math
 import os
 import re
+from typing import Union
 
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
+from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 try:
     from ..._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
+    USER_AGENT = "None"
 
 
 class _AsyncSimilarityEvaluator:
@@ -25,18 +27,10 @@ class _AsyncSimilarityEvaluator:
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+    def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-            prompty_model_config,
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -45,6 +39,18 @@ class _AsyncSimilarityEvaluator:
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
 
     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
         # Validate input parameters
         query = str(query or "")
         response = str(response or "")
@@ -65,7 +71,7 @@ class _AsyncSimilarityEvaluator:
             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
         )
 
-        score = np.nan
+        score = math.nan
         if llm_output:
             match = re.search(r"\d", llm_output)
             if match:
@@ -102,7 +108,7 @@ class SimilarityEvaluator:
     """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
+        self._async_evaluator = _AsyncSimilarityEvaluator(validate_model_config(model_config))
 
     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
         """
@@ -115,7 +121,7 @@ class SimilarityEvaluator:
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
         :return: The similarity score.
-        :rtype:
+        :rtype: Dict[str, float]
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
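Like the relevance and retrieval evaluators, this file now delegates its prompty setup to construct_prompty_model_config, whose implementation is not shown here. The sketch below simply reassembles the inline logic it replaces: default the API version, wrap the config, force Connection: close to dodge httpx's closed-event-loop error, and attach a user agent. Parameter names and the user-agent header key are assumptions:

    from typing import Any, Dict, Optional

    def construct_prompty_model_config(
        model_config: Dict[str, Any],
        default_api_version: str,
        user_agent: Optional[str],
    ) -> Dict[str, Any]:
        # Assumed equivalent of the removed ensure_api_version_in_aoai_model_config call.
        model_config.setdefault("api_version", default_api_version)

        # Wrap the config the way the removed inline code did, keeping "Connection: close"
        # as a workaround for httpx's "Event loop is closed" issue.
        prompty_model_config: Dict[str, Any] = {
            "configuration": model_config,
            "parameters": {"extra_headers": {"Connection": "close"}},
        }

        # Header name is an assumption; the old helper only applied it to Azure OpenAI configs.
        if user_agent:
            prompty_model_config["parameters"]["extra_headers"]["x-ms-useragent"] = user_agent
        return prompty_model_config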
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty:

@@ -3,11 +3,6 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
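To close out the similarity changes, a short usage sketch of SimilarityEvaluator following the keyword arguments documented above; the model_config values are placeholders and the exact result key is not shown in this diff:

    from azure.ai.evaluation import SimilarityEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment>",
    }

    similarity = SimilarityEvaluator(model_config)
    result = similarity(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is the capital of Japan.",
    )
    print(result)  # a Dict[str, float] with the similarity score, per the docstring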
|