azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/utils.py +24 -9
- azure/ai/evaluation/_constants.py +4 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +34 -81
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +29 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +33 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +34 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +16 -22
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +16 -90
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_model_configurations.py +36 -8
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
- azure/ai/evaluation/simulator/_simulator.py +19 -8
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/METADATA +59 -1
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/RECORD +38 -39
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_relevance/_relevance.py CHANGED

@@ -3,78 +3,13 @@
 # ---------------------------------------------------------
 
 import os
-import
+from typing import Optional
+from typing_extensions import override
 
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
-
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
-
-
-class RelevanceEvaluator:
+class RelevanceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a relevance evaluator configured for a specific Azure OpenAI model.
 

@@ -102,25 +37,41 @@ class RelevanceEvaluator:
     }
     """
 
-
-
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "relevance.prompty"
+    RESULT_KEY = "gpt_relevance"
 
-
-
-
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs
+    ):
+        """Evaluate relevance. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
 
         :keyword query: The query to be evaluated.
-        :paramtype query: str
+        :paramtype query: Optional[str]
         :keyword response: The response to be evaluated.
-        :paramtype response: str
+        :paramtype response: Optional[str]
        :keyword context: The context to be evaluated.
-        :paramtype context: str
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
         :return: The relevance score.
         :rtype: dict
         """
-        return
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
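For context, a minimal usage sketch of the refactored RelevanceEvaluator, based only on the new __init__ and __call__ signatures shown in the hunks above; the endpoint, deployment, key, and conversation values are placeholders, and the exact conversation/context shapes are assumptions drawn from the docstring added in this diff:

from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "type": "azure_openai",
    "azure_deployment": "<deployment-name>",              # placeholder
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
}

relevance = RelevanceEvaluator(model_config)

# Single-turn call, unchanged from 1.0.0b2: pass query, response, and context.
single_turn = relevance(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
)

# New in 1.0.0b3: pass a conversation dict instead; per-turn results are aggregated.
multi_turn = relevance(
    conversation={
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": "France's capital city is Paris."},
        ]
    }
)

print(single_turn, multi_turn)  # each returns a dict keyed by "gpt_relevance"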
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty CHANGED

@@ -3,11 +3,6 @@ name: Relevance
 description: Evaluates relevance score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py CHANGED

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from .
+from ._retrieval import RetrievalEvaluator
 
 __all__ = [
-    "
+    "RetrievalEvaluator",
 ]
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py CHANGED

@@ -11,34 +11,27 @@ import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty
 
-
+
+from ..._common.utils import construct_prompty_model_config
 
 logger = logging.getLogger(__name__)
 
 try:
-    from
+    from .._user_agent import USER_AGENT
 except ImportError:
     USER_AGENT = None
 
 
-class
+class _AsyncRetrievalScoreEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
     PROMPTY_FILE = "retrieval.prompty"
     LLM_CALL_TIMEOUT = 600
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 

@@ -101,7 +94,7 @@ class _AsyncRetrievalChatEvaluator:
     }
 
 
-class
+class RetrievalEvaluator:
     """
     Initialize an evaluator configured for a specific Azure OpenAI model.
 

@@ -110,11 +103,12 @@ class RetrievalChatEvaluator:
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
+
     **Usage**
 
     .. code-block:: python
 
-        chat_eval =
+        chat_eval = RetrievalScoreEvaluator(model_config)
         conversation = [
            {"role": "user", "content": "What is the value of 2 + 2?"},
            {"role": "assistant", "content": "2 + 2 = 4", "context": {

@@ -130,18 +124,18 @@ class RetrievalChatEvaluator:
 
     .. code-block:: python
 
-
-
-
-
-
+        {
+            "gpt_retrieval": 3.0
+            "evaluation_per_turn": {
+                "gpt_retrieval": {
+                    "score": [1.0, 2.0, 3.0]
+                }
             }
         }
-    }
     """
 
     def __init__(self, model_config: dict):
-        self._async_evaluator =
+        self._async_evaluator = _AsyncRetrievalScoreEvaluator(model_config)
 
     def __call__(self, *, conversation, **kwargs):
         """Evaluates retrieval score chat scenario.
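A usage sketch for the renamed evaluator, assembled from the truncated usage block above; the import path follows the new _retrieval package shown in this diff, the model_config shape matches the relevance example earlier, and the citation structure inside "context" is an assumption since the original example is cut off:

from azure.ai.evaluation._evaluators._retrieval import RetrievalEvaluator

retrieval = RetrievalEvaluator(model_config)  # same model_config shape as the relevance example above
conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},  # assumed shape
    },
]
result = retrieval(conversation=conversation)
# Per the docstring above, the result carries an aggregate "gpt_retrieval" score
# plus an "evaluation_per_turn" breakdown.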
azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty CHANGED

@@ -3,11 +3,6 @@ name: Retrieval
 description: Evaluates retrieval score for Chat scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     top_p: 1.0
azure/ai/evaluation/_evaluators/_similarity/_similarity.py CHANGED

@@ -11,7 +11,7 @@ from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-from ..._common.utils import
+from ..._common.utils import construct_prompty_model_config
 
 try:
     from ..._user_agent import USER_AGENT

@@ -26,17 +26,9 @@ class _AsyncSimilarityEvaluator:
     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
     def __init__(self, model_config: dict):
-
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
+        prompty_model_config = construct_prompty_model_config(
             model_config,
-
+            self.DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
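The similarity and retrieval hunks both replace the same inline plumbing with a shared construct_prompty_model_config helper from _common/utils.py (which also changed in this release). The following is only a sketch of what that helper plausibly consolidates, reconstructed from the inline code it replaces; the real implementation may differ, and the x-ms-useragent header name is an assumption:

from typing import Optional


def construct_prompty_model_config(model_config: dict, default_api_version: str, user_agent: Optional[str]) -> dict:
    # Sketch only: fall back to the evaluator's default API version when none is given.
    model_config.setdefault("api_version", default_api_version)

    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

    # Work around "RuntimeError: Event loop is closed" from httpx AsyncClient
    # (https://github.com/encode/httpx/discussions/2959), as the removed inline code did.
    prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

    # The removed ensure_user_agent_in_aoai_model_config call attached the SDK user agent
    # for Azure OpenAI configurations; the header name used here is assumed.
    if user_agent and model_config.get("type") == "azure_openai":
        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

    return prompty_model_config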
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty CHANGED

@@ -3,11 +3,6 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED

@@ -2,18 +2,17 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
-from typing import Optional
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
 
+from typing import Optional
+from typing_extensions import override
 from azure.ai.evaluation._common.constants import EvaluationMetrics
-from azure.ai.evaluation._common
-
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
 
 logger = logging.getLogger(__name__)
 
 
-class IndirectAttackEvaluator:
+class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator.
 
     Detect whether cross domain injected attacks are present in your AI system's response.

@@ -25,7 +24,7 @@ class IndirectAttackEvaluator:
         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
     :type eval_last_turn: bool
     :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
+    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: A function that evaluates and generates metrics for XPIA chat scenario. Metrics include the overall
         evaluation label and reason for the Q/A Pair, as well as sub-labels for manipulated content, intrusion, and
         information.

@@ -51,89 +50,16 @@ class IndirectAttackEvaluator:
     }
     """
 
-
-
-        self._eval_last_turn = eval_last_turn
-
-    def __call__(
+    @override
+    def __init__(
         self,
-
-
-
-        **kwargs,
+        azure_ai_project: dict,
+        credential: Optional[dict] = None,
+        eval_last_turn: bool = False,
     ):
-
-
-
-
-
-        :paramtype query: Optional[str]
-        :keyword response: The response to be evaluated. Mutually exclusive with 'conversation'.
-        :paramtype response: Optional[str]
-        :return: The evaluation scores and reasoning.
-        :rtype: dict
-        """
-
-        return self._evaluator(query=query, response=response, **kwargs)
-
-
-class _AsyncIndirectAttackEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates content according to this evaluator's metric.
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The evaluation score computation based on the metric (self.metric).
-        :rtype: Any
-        """
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.INDIRECT_ATTACK_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=EvaluationMetrics.XPIA,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
+        super().__init__(
+            eval_metric=EvaluationMetrics.XPIA,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
         )
-        return result
-
-
-class _IndirectAttackEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncIndirectAttackEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates XPIA content.
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword context: The context to be evaluated.
-        :paramtype context: str
-        :return: The XPIA score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
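A constructor sketch matching the new __init__ signature above; the project values are placeholders, and the assumption is that the inherited __call__ from RaiServiceEvaluatorBase still accepts the query/response keywords the removed wrapper used:

from azure.ai.evaluation import IndirectAttackEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

xpia = IndirectAttackEvaluator(
    azure_ai_project,
    credential=DefaultAzureCredential(),
    eval_last_turn=True,
)

result = xpia(query="<user query>", response="<assistant response>")
# result carries the overall XPIA label and reason plus sub-labels for
# manipulated content, intrusion, and information gathering (per the docstring above).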
azure/ai/evaluation/_exceptions.py CHANGED

@@ -54,7 +54,6 @@ class ErrorTarget(Enum):
     EVAL_RUN = "EvalRun"
     CODE_CLIENT = "CodeClient"
     RAI_CLIENT = "RAIClient"
-    CHAT_EVALUATOR = "ChatEvaluator"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
azure/ai/evaluation/_model_configurations.py CHANGED

@@ -2,26 +2,54 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import TypedDict
+from typing import Dict, Literal, TypedDict
 
+from typing_extensions import NotRequired
 
-
+
+class AzureOpenAIModelConfiguration(TypedDict, total=False):
+    """Model Configuration for Azure OpenAI Model"""
+
+    type: Literal["azure_openai"]
+    """The type of the model configuration. Should be 'azure_openai' for AzureOpenAIModelConfiguration"""
     azure_deployment: str
+    """Name of Azure OpenAI deployment to make request to"""
     azure_endpoint: str
+    """Endpoint of Azure OpenAI resource to make request to"""
     api_key: str
+    """API key of Azure OpenAI resource"""
+    api_version: NotRequired[str]
+    """(Optional) API version to use in request to Azure OpenAI deployment"""
 
 
-class
-
+class OpenAIModelConfiguration(TypedDict, total=False):
+    """Model Configuration for OpenAI Model"""
 
-
-
+    type: Literal["openai"]
+    """The type of the model configuration. Should be 'openai' for OpenAIModelConfiguration"""
     api_key: str
-
-
+    "API key needed to make request to model"
+    model: str
+    """Name of model to be used in OpenAI request"""
+    base_url: NotRequired[str]
+    """(Optional) Base URL to be used in OpenAI request"""
+    organization: NotRequired[str]
+    """(Optional) OpenAI organization"""
 
 
 class AzureAIProject(TypedDict):
+    """Azure AI Project Information"""
+
     subscription_id: str
+    """Azure subscription id of the project"""
     resource_group_name: str
+    """Azure resource group name of the project"""
     project_name: str
+    """Azure project name"""
+
+
+class EvaluatorConfig(TypedDict, total=False):
+    """Configuration for an evaluator"""
+
+    column_mapping: Dict[str, str]
+    """Dictionary mapping evaluator input name to column in data"""
azure/ai/evaluation/_version.py CHANGED

azure/ai/evaluation/simulator/_prompty/task_query_response.prompty CHANGED

@@ -33,7 +33,8 @@ Answer must not be more than 5 words
 Answer must be picked from Text as is
 Question should be as descriptive as possible and must include as much context as possible from Text
 Output must always have the provided number of QnAs
-Output must be in JSON format
+Output must be in JSON format.
+Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
 Text:
 <|text_start|>
 On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
azure/ai/evaluation/simulator/_simulator.py CHANGED

@@ -41,7 +41,7 @@ class Simulator:
         """
         self._validate_project_config(azure_ai_project)
         self.azure_ai_project = azure_ai_project
-        self.azure_ai_project["api_version"] = "2024-
+        self.azure_ai_project["api_version"] = "2024-06-01"
         self.credential = credential
 
     @staticmethod

@@ -129,7 +129,6 @@ class Simulator:
         max_conversation_turns *= 2  # account for both user and assistant turns
 
         prompty_model_config = self._build_prompty_model_config()
-
         if conversation_turns:
             return await self._simulate_with_predefined_turns(
                 target=target,

@@ -234,8 +233,16 @@ class Simulator:
                 target=target,
                 progress_bar=progress_bar,
             )
-
-
+            simulated_conversations.append(
+                JsonLineChatProtocol(
+                    {
+                        "messages": current_simulation.to_list(),
+                        "finish_reason": ["stop"],
+                        "context": {},
+                        "$schema": "http://azureml/sdk-2-0/ChatConversation.json",
+                    }
+                )
+            )
 
         progress_bar.close()
         return simulated_conversations

@@ -280,7 +287,9 @@ class Simulator:
 
             while len(current_simulation) < max_conversation_turns:
                 user_response_content = user_flow(
-                    task="Continue the conversation",
+                    task="Continue the conversation",
+                    conversation_history=current_simulation.to_list(),
+                    **user_simulator_prompty_kwargs,
                 )
                 user_response = self._parse_prompty_response(response=user_response_content)
                 user_turn = Turn(role=ConversationRole.USER, content=user_response["content"])

@@ -317,6 +326,7 @@ class Simulator:
         resource_name = "task_simulate.prompty"
         try:
             # Access the resource as a file path
+            # pylint: disable=deprecated-method
             with pkg_resources.path(package, resource_name) as prompty_path:
                 return load_flow(source=str(prompty_path), model=prompty_model_config)
         except FileNotFoundError as e:

@@ -398,7 +408,6 @@ class Simulator:
             prompty_model_config=prompty_model_config,
             query_response_generating_prompty_kwargs=query_response_generating_prompty_kwargs,
         )
-
         try:
             query_responses = query_flow(text=text, num_queries=num_queries)
             if isinstance(query_responses, dict):

@@ -432,6 +441,7 @@ class Simulator:
         resource_name = "task_query_response.prompty"
         try:
             # Access the resource as a file path
+            # pylint: disable=deprecated-method
             with pkg_resources.path(package, resource_name) as prompty_path:
                 return load_flow(source=str(prompty_path), model=prompty_model_config)
         except FileNotFoundError as e:

@@ -612,9 +622,10 @@ class Simulator:
             prompty_model_config=self._build_prompty_model_config(),
             user_simulator_prompty_kwargs=user_simulator_prompty_kwargs,
        )
-
         try:
-            response_content = user_flow(
+            response_content = user_flow(
+                task=task, conversation_history=conversation_history, **user_simulator_prompty_kwargs
+            )
             user_response = self._parse_prompty_response(response=response_content)
             return user_response["content"]
         except Exception as e: