azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
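
The hunks reproduced below cover a representative subset of these files: the relevance, ROUGE, and similarity evaluators and their prompty templates. For readers who want to reproduce this kind of comparison locally, a minimal sketch follows; it assumes both wheels have already been downloaded, and the local filenames are illustrative:

    # Minimal sketch: compare the file lists of two locally downloaded wheels.
    # The filenames are illustrative; point them at wherever the wheels were saved.
    import zipfile

    old_whl = "azure_ai_evaluation-1.0.0-py3-none-any.whl"
    new_whl = "azure_ai_evaluation-1.0.0b1-py3-none-any.whl"

    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_files, new_files = set(old.namelist()), set(new.namelist())

    print("only in 1.0.0:", sorted(old_files - new_files))
    print("only in 1.0.0b1:", sorted(new_files - old_files))

Extracting both archives and running a recursive text diff over the two trees then yields per-file changes like those summarized above.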
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -3,112 +3,129 @@
  # ---------------------------------------------------------

  import os
- from typing import Dict, Union, List
+ import re
+ from typing import Union

- from typing_extensions import overload, override
+ import numpy as np

- from azure.ai.evaluation._model_configurations import Conversation
- from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from promptflow.core import AsyncPrompty

+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ..._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )

- class RelevanceEvaluator(PromptyEvaluatorBase):
-     """
-     Evaluates relevance score for a given query and response or a multi-turn conversation, including reasoning.
+ try:
+     from ..._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = None
+
+
+ class _AsyncRelevanceEvaluator:
+     # Constants must be defined within eval's directory to be save/loadable
+     PROMPTY_FILE = "relevance.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, model_config: dict):
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

-     The relevance measure assesses the ability of answers to capture the key points of the context.
-     High relevance scores signify the AI system's understanding of the input and its capability to produce coherent
-     and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might
-     be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Use the relevance
-     metric when evaluating the AI system's performance in understanding the input and generating contextually
-     appropriate responses.
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

-     Relevance scores range from 1 to 5, with 1 being the worst and 5 being the best.
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
+             USER_AGENT,
+         )
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+     async def __call__(self, *, query: str, response: str, context: str, **kwargs):
+         # Validate input parameters
+         query = str(query or "")
+         response = str(response or "")
+         context = str(context or "")
+
+         if not (query.strip() and response.strip() and context.strip()):
+             msg = "'query', 'response' and 'context' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.RELEVANCE_EVALUATOR,
+             )
+
+         # Run the evaluation flow
+         llm_output = await self._flow(
+             query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+         )
+
+         score = np.nan
+         if llm_output:
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+
+         return {"gpt_relevance": float(score)}
+
+
+ class RelevanceEvaluator:
+     """
+     Initialize a relevance evaluator configured for a specific Azure OpenAI model.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START relevance_evaluator]
-             :end-before: [END relevance_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
+     .. code-block:: python

-     .. note::
+         eval_fn = RelevanceEvaluator(model_config)
+         result = eval_fn(
+             query="What is the capital of Japan?",
+             response="The capital of Japan is Tokyo.",
+             context="Tokyo is Japan's capital, known for its blend of traditional culture \
+                 and technological advancements.")

-         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-     """
+     **Output format**

-     # Constants must be defined within eval's directory to be save/loadable
-     _PROMPTY_FILE = "relevance.prompty"
-     _RESULT_KEY = "relevance"
+     .. code-block:: python

-     id = "azureml://registries/azureml/models/Relevance-Evaluator/versions/4"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+         {
+             "gpt_relevance": 3.0
+         }
+     """

-     @override
-     def __init__(self, model_config):
-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
-
-     @overload
-     def __call__(
-         self,
-         *,
-         query: str,
-         response: str,
-     ) -> Dict[str, Union[str, float]]:
-         """Evaluate groundedness for given input of query, response, context
+     def __init__(self, model_config: dict):
+         self._async_evaluator = _AsyncRelevanceEvaluator(model_config)
+
+     def __call__(self, *, query: str, response: str, context: str, **kwargs):
+         """
+         Evaluate relevance.

          :keyword query: The query to be evaluated.
          :paramtype query: str
          :keyword response: The response to be evaluated.
          :paramtype response: str
+         :keyword context: The context to be evaluated.
+         :paramtype context: str
          :return: The relevance score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, query=query, response=response, context=context, **kwargs
+         )

-     @overload
-     def __call__(
-         self,
-         *,
-         conversation: Conversation,
-     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
-         """Evaluate relevance for a conversation
-
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The relevance score.
-         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
-         """
-
-     @override
-     def __call__(  # pylint: disable=docstring-missing-param
-         self,
-         *args,
-         **kwargs,
-     ):
-         """Evaluate relevance. Accepts either a query and response for a single evaluation,
-         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
-         the evaluator will aggregate the results of each turn.
-
-         :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
-         :paramtype query: Optional[str]
-         :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
-         :paramtype response: Optional[str]
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The relevance score.
-         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
-         """
-         return super().__call__(*args, **kwargs)
+     def _to_async(self):
+         return self._async_evaluator
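
In 1.0.0 (the left-hand side of this hunk), RelevanceEvaluator is built on PromptyEvaluatorBase, accepts either a query/response pair or a whole conversation, and reports both a `relevance` key and the legacy `gpt_relevance` key. In 1.0.0b1 it wraps `_AsyncRelevanceEvaluator`, requires `query`, `response`, and `context`, and reports only `gpt_relevance`. A minimal usage sketch of the 1.0.0b1 shape, with placeholder configuration values that are not taken from this diff:

    # Usage sketch for the 1.0.0b1 RelevanceEvaluator shown above.
    # The model_config values are placeholders, not values from this diff.
    from azure.ai.evaluation import RelevanceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "api_key": "<api-key>",  # placeholder
        "azure_deployment": "<deployment-name>",  # placeholder
    }

    evaluator = RelevanceEvaluator(model_config)
    result = evaluator(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is Japan's capital.",
    )
    print(result)  # e.g. {"gpt_relevance": 5.0}

Code written against 1.0.0 therefore has to pass `context` explicitly and read the `gpt_relevance` key when run against 1.0.0b1.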
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -3,9 +3,14 @@ name: Relevance
  description: Evaluates relevance score for QA scenario
  model:
    api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
-     max_tokens: 800
+     max_tokens: 1
      top_p: 1.0
      presence_penalty: 0
      frequency_penalty: 0
@@ -17,84 +22,48 @@ inputs:
      type: string
    response:
      type: string
+   context:
+     type: string

  ---
  system:
- # Instruction
- ## Goal
- ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
- - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
- - **Data**: Your input data include QUERY and RESPONSE.
- - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
-
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
  user:
- # Definition
- **Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information.
-
- # Ratings
- ## [Relevance: 1] (Irrelevant Response)
- **Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed.
-
- **Examples:**
- **Query:** What is the team preparing for?
- **Response:** I went grocery shopping yesterday evening.
-
- **Query:** When will the company's new product line launch?
- **Response:** International travel can be very rewarding and educational.
-
- ## [Relevance: 2] (Incorrect Response)
- **Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information.
-
- **Examples:**
- **Query:** When was the merger between the two firms finalized?
- **Response:** The merger was finalized on April 10th.
-
- **Query:** Where and when will the solar eclipse be visible?
- **Response:** The solar eclipse will be visible in Asia on December 14th.
-
- ## [Relevance: 3] (Incomplete Response)
- **Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The restaurant offers Italian food like pasta.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy and climate change.
-
- ## [Relevance: 4] (Complete Response)
- **Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices.
-
- ## [Relevance: 5] (Comprehensive Response with Insights)
- **Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues.
-
-
-
- # Data
- QUERY: {{query}}
- RESPONSE: {{response}}
-
-
- # Tasks
- ## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
- - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
- - **Explanation**: a very short explanation of why you think the input Data should get that Score.
- - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
-
-
- ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
- # Output
+ Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
+ One star: the answer completely lacks relevance
+ Two stars: the answer mostly lacks relevance
+ Three stars: the answer is partially relevant
+ Four stars: the answer is mostly relevant
+ Five stars: the answer has perfect relevance
+
+ This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+ context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
+ question: What field did Marie Curie excel in?
+ answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
+ stars: 1
+
+ context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
+ question: Where were The Beatles formed?
+ answer: The band The Beatles began their journey in London, England, and they changed the history of music.
+ stars: 2
+
+ context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
+ question: What are the main goals of Perseverance Mars rover mission?
+ answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
+ stars: 3
+
+ context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
+ question: What are the main components of the Mediterranean diet?
+ answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
+ stars: 4
+
+ context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
+ question: What are the main attractions of the Queen's Royal Castle?
+ answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
+ stars: 5
+
+ context: {{context}}
+ question: {{query}}
+ answer: {{response}}
+ stars:
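
The prompty change also explains the `max_tokens` drop from 800 to 1: the 1.0.0 prompt asks for a tagged chain of thought, explanation, and score, while the 1.0.0b1 prompt ends with `stars:` and expects a single integer, which `_AsyncRelevanceEvaluator` then extracts with a one-digit regex. A small sketch of that parsing step, with an illustrative completion:

    # Score extraction as implemented in _AsyncRelevanceEvaluator above: with
    # max_tokens: 1 the prompty returns a single token such as "4", and the
    # first digit found becomes the score (np.nan if no digit is present).
    import re

    import numpy as np

    llm_output = "4"  # illustrative single-token completion

    score = np.nan
    if llm_output:
        match = re.search(r"\d", llm_output)
        if match:
            score = float(match.group())

    print({"gpt_relevance": float(score)})  # {'gpt_relevance': 4.0}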
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,12 +3,12 @@
  # ---------------------------------------------------------
  from enum import Enum

- from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from rouge_score import rouge_scorer

- from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+ from promptflow._utils.async_utils import async_run_allowing_running_loop


- class RougeType(Enum):
+ class RougeType(str, Enum):
      """
      Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
      """
@@ -37,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
          self._rouge_type = rouge_type

      async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-         metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+         metrics = scorer.score(ground_truth, response)[self._rouge_type]
          return {
              "rouge_precision": metrics.precision,
              "rouge_recall": metrics.recall,
@@ -48,33 +48,33 @@ class _AsyncRougeScoreEvaluator:

  class RougeScoreEvaluator:
      """
-     Calculates the ROUGE score for a given response and ground truth.
+     Evaluator for computes the ROUGE scores between two strings.

-     The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
-     generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
-     ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
-     the generated text matches the reference text. Rouge type options are "rouge1" (Unigram overlap), "rouge2"
-     (Bigram overlap), "rouge3" (Trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), "rougeL"
-     (L-graph overlap)
+     ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic
+     summarization and machine translation. It measures the overlap between generated text and reference summaries.
+     ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text
+     summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text
+     coherence and relevance are critical.

-     Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
-     other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
-     information from the reference text.
+     **Usage**

-     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+     .. code-block:: python

-     .. admonition:: Example:
+         eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START rouge_score_evaluator]
-             :end-before: [END rouge_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
-     """
+     **Output format**

-     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     .. code-block:: python
+
+         {
+             "rouge_precision": 1.0,
+             "rouge_recall": 1.0,
+             "rouge_f1_score": 1.0
+         }
+     """

      def __init__(self, rouge_type: RougeType):
          self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)
@@ -88,7 +88,7 @@ class RougeScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The ROUGE score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
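
Two things change in the ROUGE evaluator: 1.0.0b1 imports `rouge_scorer` from the external rouge-score package rather than the vendored copy, and `RougeType` becomes a `str` mixin enum. The latter is what lets the evaluator drop the `.value` accessors: a str-mixin member is itself a string, so it can be passed as a rouge type and used as a result key directly. A minimal sketch, assuming the member value follows the rouge-score naming convention ("rouge1"):

    # Why RougeType(str, Enum) removes the need for ".value": a str-mixin enum
    # member is a string, so rouge_scorer accepts it as a rouge type and the
    # returned dict can be indexed with the member itself.
    # The member value "rouge1" is assumed here; it is not shown in this hunk.
    from enum import Enum

    from rouge_score import rouge_scorer


    class RougeType(str, Enum):
        ROUGE_1 = "rouge1"


    scorer = rouge_scorer.RougeScorer(rouge_types=[RougeType.ROUGE_1])
    metrics = scorer.score("The capital of Japan is Tokyo.",
                           "Tokyo is the capital of Japan.")[RougeType.ROUGE_1]
    print(metrics.precision, metrics.recall, metrics.fmeasure)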
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -2,53 +2,54 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- import math
  import os
  import re
+ from typing import Union
+
+ import numpy as np

  from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
  from promptflow.core import AsyncPrompty

- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
- from ..._common.utils import construct_prompty_model_config, validate_model_config
+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ..._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )

  try:
      from ..._user_agent import USER_AGENT
  except ImportError:
-     USER_AGENT = "None"
+     USER_AGENT = None


  class _AsyncSimilarityEvaluator:
      # Constants must be defined within eval's directory to be save/loadable
-     _PROMPTY_FILE = "similarity.prompty"
-     _LLM_CALL_TIMEOUT = 600
-     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+     PROMPTY_FILE = "similarity.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

      def __init__(self, model_config: dict):
-         prompty_model_config = construct_prompty_model_config(
-             validate_model_config(model_config),
-             self._DEFAULT_OPEN_API_VERSION,
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
              USER_AGENT,
          )

          current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
          self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

      async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-         """
-         Evaluate similarity.
-
-         :keyword query: The query to be evaluated.
-         :paramtype query: str
-         :keyword response: The response to be evaluated.
-         :paramtype response: str
-         :keyword ground_truth: The ground truth to be evaluated.
-         :paramtype ground_truth: str
-         :return: The similarity score.
-         :rtype: Dict[str, float]
-         """
          # Validate input parameters
          query = str(query or "")
          response = str(response or "")
@@ -66,57 +67,46 @@ class _AsyncSimilarityEvaluator:

          # Run the evaluation flow
          llm_output = await self._flow(
-             query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
+             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
          )

-         score = math.nan
+         score = np.nan
          if llm_output:
              match = re.search(r"\d", llm_output)
              if match:
                  score = float(match.group())

-         return {"similarity": float(score), "gpt_similarity": float(score)}
+         return {"gpt_similarity": float(score)}


  class SimilarityEvaluator:
      """
-     Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
-
-     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
-     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
-     the ground truth and the model's prediction, which are high-dimensional vector representations capturing
-     the semantic meaning and context of the sentences.
-
-     Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
-     tasks where you have access to ground truth responses. Similarity enables you to assess the generated
-     text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.
-
-     Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.
+     Initialize a similarity evaluator configured for a specific Azure OpenAI model.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START rouge_score_evaluator]
-             :end-before: [END rouge_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+     .. code-block:: python

-     .. note::
+         eval_fn = SimilarityEvaluator(model_config)
+         result = eval_fn(
+             query="What is the capital of Japan?",
+             response="The capital of Japan is Tokyo.",
+             ground_truth="Tokyo is Japan's capital.")

-         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-     """
+     **Output format**

-     id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     .. code-block:: python

-     def __init__(self, model_config):
+         {
+             "gpt_similarity": 3.0
+         }
+     """
+
+     def __init__(self, model_config: dict):
          self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

      def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
@@ -130,7 +120,7 @@ class SimilarityEvaluator:
          :keyword ground_truth: The ground truth to be evaluated.
          :paramtype ground_truth: str
          :return: The similarity score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
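
As with relevance, the output shape differs across the two wheels: 1.0.0 returns both `similarity` and the legacy `gpt_similarity` key, while 1.0.0b1 returns only `gpt_similarity`. A small sketch of reading the score in a way that tolerates either shape:

    # Hedged sketch: read the similarity score regardless of which wheel is installed.
    # 1.0.0 returns {"similarity": ..., "gpt_similarity": ...};
    # 1.0.0b1 returns only {"gpt_similarity": ...}.
    def get_similarity_score(result: dict) -> float:
        return result.get("similarity", result.get("gpt_similarity"))


    print(get_similarity_score({"gpt_similarity": 3.0}))  # 1.0.0b1-style output
    print(get_similarity_score({"similarity": 3.0, "gpt_similarity": 3.0}))  # 1.0.0-style output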
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty

@@ -3,6 +3,11 @@ name: Similarity
  description: Evaluates similarity score for QA scenario
  model:
    api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
      max_tokens: 1
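
Finally, both prompty files gain an inline `configuration` block in 1.0.0b1 whose values resolve from environment variables. The evaluator classes still pass their own configuration when loading the prompty with `AsyncPrompty.load`, so this block presumably matters mainly when a prompty file is used standalone; a sketch of setting the expected variables, with placeholder values:

    # Placeholder values for the environment variables referenced by the new
    # ${env:...} configuration block; relevant mainly when a prompty file is
    # used outside the evaluator classes, which supply their own configuration.
    import os

    os.environ["AZURE_DEPLOYMENT"] = "<deployment-name>"
    os.environ["AZURE_OPENAI_API_KEY"] = "<api-key>"
    os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<your-resource>.openai.azure.com"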