azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -14,9 +14,7 @@ RetrievalGroundTruthDocument = TypedDict(
  "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
  )

- RetrievedDocument = TypedDict(
- "RetrievedDocument", {"document_id": str, "relevance_score": float}
- )
+ RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})


  class DocumentRetrievalEvaluator(EvaluatorBase):
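For orientation, the two TypedDicts above describe this evaluator's inputs: labeled ground-truth documents and retrieved documents with relevance scores. A minimal usage sketch under those definitions (the `retrieval_ground_truth` keyword appears in the validation code later in this diff; the `retrieved_documents` keyword name is an assumption):

# Sketch only: input shapes follow the TypedDicts above; "retrieval_ground_truth" is taken
# from this diff, the "retrieved_documents" keyword name is assumed.
from azure.ai.evaluation import DocumentRetrievalEvaluator

ground_truth = [
    {"document_id": "doc1", "query_relevance_label": 3},  # RetrievalGroundTruthDocument
    {"document_id": "doc2", "query_relevance_label": 1},
]
retrieved = [
    {"document_id": "doc1", "relevance_score": 12.5},  # RetrievedDocument
    {"document_id": "doc9", "relevance_score": 7.1},   # unlabeled, so it counts as a "hole"
]

evaluator = DocumentRetrievalEvaluator()
print(evaluator(retrieval_ground_truth=ground_truth, retrieved_documents=retrieved))

As the later hunks show, the call reports ndcg@3, xdcg@3, fidelity, top1_relevance, top3_max_relevance, holes and holes_ratio, each accompanied by `<metric>_result`, `<metric>_threshold` and `<metric>_higher_is_better` bookkeeping keys.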
@@ -33,15 +31,15 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  :caption: Initialize and call a DocumentRetrievalEvaluator

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START document_retrieval_evaluator]
  :end-before: [END document_retrieval_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
  :start-after: [START threshold_document_retrieval_evaluator]
@@ -62,7 +60,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  top1_relevance_threshold: Optional[float] = 50.0,
  top3_max_relevance_threshold: Optional[float] = 50.0,
  total_retrieved_documents_threshold: Optional[int] = 50,
- total_ground_truth_documents_threshold: Optional[int] = 50
+ total_ground_truth_documents_threshold: Optional[int] = 50,
  ):
  super().__init__()
  self.k = 3
@@ -74,14 +72,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )

  if not isinstance(ground_truth_label_min, int):
- raise EvaluationException(
- "The ground truth label minimum must be an integer value."
- )
+ raise EvaluationException("The ground truth label minimum must be an integer value.")

  if not isinstance(ground_truth_label_max, int):
- raise EvaluationException(
- "The ground truth label maximum must be an integer value."
- )
+ raise EvaluationException("The ground truth label maximum must be an integer value.")

  self.ground_truth_label_min = ground_truth_label_min
  self.ground_truth_label_max = ground_truth_label_max
@@ -122,7 +116,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
  NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
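The docstring above describes NDCG@K conceptually; the hunk itself only trims trailing whitespace. As a reference point, a textbook NDCG@K over graded relevance labels looks like the sketch below (the package's private `_compute_ndcg` may choose a different gain or discount, so treat this as an illustration rather than the shipped formula):

import math

def ndcg_at_k(result_labels, ideal_labels, k=3):
    # Standard exponential-gain NDCG: DCG of the retrieved ranking divided by DCG of the ideal ranking.
    def dcg(labels):
        return sum((2 ** label - 1) / math.log2(rank + 1) for rank, label in enumerate(labels[:k], start=1))

    ideal = dcg(sorted(ideal_labels, reverse=True))
    return dcg(result_labels) / ideal if ideal else math.nan

print(ndcg_at_k([3, 0, 2], [3, 2, 1]))  # ~0.90: how the retrieved top-3 compares to the ideal ordering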
@@ -145,7 +139,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
  """XDCG calculated for the top K documents retrieved from a search query.
  XDCG measures how objectively good are the top K documents, discounted by their position in the list.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :return: The XDCG@K calculation result.
@@ -159,11 +153,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  return math.pow(self.xdcg_discount_factor, rank - 1)

  ranks = list(range(1, self.k + 1))
- xdcg_n = sum(
- starmap(
- calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
- )
- )
+ xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
  xdcg_d = sum(map(calculate_xdcg_denominator, ranks))

  return xdcg_n / float(xdcg_d)
@@ -175,7 +165,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """Fidelity calculated over all documents retrieved from a search query.
  Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
@@ -196,25 +186,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  if label >= s:
  label_counts[str(label)] += 1

- sorted_label_counts = [
- x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
- ]
+ sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]

  # calculate weights
- weights = [
- (math.pow(2, i + 1) - 1)
- for i in range(s, self.ground_truth_label_max + 1)
- ]
+ weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]

  # return weighted sum
  return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))

- weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
- result_docs_groundtruth_labels
- )
- weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
- ideal_docs_groundtruth_labels
- )
+ weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+ weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)

  if weighted_sum_by_rating_index == 0:
  return math.nan
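The `calculate_weighted_sum_by_rating` helper visible above counts ground-truth labels at or above a minimum useful rating `s` and weights each label as 2**(label + 1) - 1, and fidelity is the retrieved set's weighted sum divided by the ideal set's. A compact, equivalent sketch (`min_useful_label` stands in for the evaluator's internal `s`, whose default is not shown in this diff):

import math

def fidelity(result_labels, ideal_labels, min_useful_label=1):
    # Weighted sum of graded labels, mirroring the w(label) = 2**(label + 1) - 1 weights above.
    def weighted_sum(labels):
        return sum(2 ** (label + 1) - 1 for label in labels if label >= min_useful_label)

    denominator = weighted_sum(ideal_labels)
    return math.nan if denominator == 0 else weighted_sum(result_labels) / denominator

print(fidelity([3, 0, 2], [3, 2, 2, 1]))  # weighted gain of what was retrieved vs. everything known to be relevant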
@@ -226,12 +207,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):

  for metric_name, metric_value in metrics.items():
  if metric_name in self._threshold_metrics.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
  result[f"{metric_name}_higher_is_better"] = True

  elif metric_name in self._threshold_holes.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
  result[f"{metric_name}_higher_is_better"] = False

@@ -256,8 +241,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  # if the qrels are empty, no meaningful evaluation is possible
  if not retrieval_ground_truth:
  raise EvaluationException(
- ("'retrieval_ground_truth' parameter must contain at least one item. "
- "Check your data input to be sure that each input record has ground truth defined.")
+ (
+ "'retrieval_ground_truth' parameter must contain at least one item. "
+ "Check your data input to be sure that each input record has ground truth defined."
+ )
  )

  qrels = []
@@ -277,9 +264,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )

  if not isinstance(query_relevance_label, int):
- raise EvaluationException(
- "Query relevance labels must be integer values."
- )
+ raise EvaluationException("Query relevance labels must be integer values.")

  if query_relevance_label < self.ground_truth_label_min:
  raise EvaluationException(
@@ -318,12 +303,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )
  )

- if not isinstance(relevance_score, float) and not isinstance(
- relevance_score, int
- ):
- raise EvaluationException(
- "Retrieved document relevance score must be a numerical value."
- )
+ if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+ raise EvaluationException("Retrieved document relevance score must be a numerical value.")

  results.append(result)

@@ -368,24 +349,17 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  results_lookup = {x["document_id"]: x["relevance_score"] for x in results}

  # sort each input set by label to get the ranking
- qrels_sorted_by_rank = sorted(
- qrels_lookup.items(), key=lambda x: x[1], reverse=True
- )
- results_sorted_by_rank = sorted(
- results_lookup.items(), key=lambda x: x[1], reverse=True
- )
+ qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+ results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)

  # find ground truth labels for the results set and ideal set
  result_docs_groundtruth_labels = [
- qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
- for (doc_id, _) in results_sorted_by_rank
+ qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
  ]
  ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]

  # calculate the proportion of result docs with no ground truth label (holes)
- holes = self._compute_holes(
- [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
- )
+ holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
  holes_ratio = holes / float(len(results))

  # if none of the retrieved docs are labeled, report holes only
@@ -412,12 +386,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  result_docs_groundtruth_labels[: self.k],
  ideal_docs_groundtruth_labels[: self.k],
  ),
- f"xdcg@{self.k}": self._compute_xdcg(
- result_docs_groundtruth_labels[: self.k]
- ),
- "fidelity": self._compute_fidelity(
- result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
- ),
+ f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+ "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
  "top1_relevance": result_docs_groundtruth_labels[0],
  "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
  "holes": holes,
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
  :rtype: Dict[str, str]

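This docstring change reflects a pattern repeated across the RAI-service evaluators in this release: azure_ai_project may now be either the project endpoint URL or the classic scope dictionary. A usage sketch of both forms (resource names are placeholders; ViolenceEvaluator is used as a stand-in since it takes the same two constructor arguments):

# Two ways to supply azure_ai_project after this change (sketch; names are placeholders).
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

credential = DefaultAzureCredential()

# 1) Project endpoint string, in the format shown throughout this diff
endpoint = "https://my-resource.services.ai.azure.com/api/projects/my-project"
evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=endpoint)

# 2) Classic AzureAIProject dictionary
project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=project)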
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -39,15 +39,15 @@ class F1ScoreEvaluator(EvaluatorBase):
  :caption: Initialize and call an F1ScoreEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START f1_score_evaluator]
  :end-before: [END f1_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:

  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -147,7 +147,7 @@ class F1ScoreEvaluator(EvaluatorBase):
  if f1_result <= self._threshold:
  binary_result = True
  return {
- "f1_score": f1_result,
+ "f1_score": f1_result,
  "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
  "f1_threshold": self._threshold,
  }
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -45,7 +45,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a FluencyEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START fluency_evaluator]
  :end-before: [END fluency_evaluator]
@@ -78,7 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )

  @overload
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -34,7 +34,7 @@ class GleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call a GleuScoreEvaluator.
-
+
  .. admonition:: Example with Threshold:

  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -45,13 +45,13 @@ class GleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call a GleuScoreEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START gleu_score_evaluator]
  :end-before: [END gleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
  """

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -12,9 +12,13 @@ from azure.ai.evaluation._model_configurations import Conversation
  from ..._common.utils import construct_prompty_model_config, validate_model_config

  try:
- from ..._user_agent import USER_AGENT
+ from ..._user_agent import UserAgentSingleton
  except ImportError:
- USER_AGENT = "None"
+
+ class UserAgentSingleton:
+ @property
+ def value(self) -> str:
+ return "None"


  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
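The module-level USER_AGENT constant is replaced by a UserAgentSingleton whose .value is read at call time (see the construct_prompty_model_config hunk below), and the ImportError fallback defines a stand-in exposing the same property. One common reason for this kind of change is to let user-agent tokens be adjusted at runtime; a generic illustration of that pattern, not the library's actual API:

# Illustration only: this is not azure.ai.evaluation._user_agent, just the general pattern.
class UserAgentSketch:
    _base = "azure-ai-evaluation/<version>"  # placeholder base string
    _extras = []

    @property
    def value(self) -> str:
        # Recomputed on every read, so tokens added after import are included.
        return " ".join([self._base, *self._extras])

    @classmethod
    def append(cls, token: str) -> None:
        cls._extras.append(token)

UserAgentSketch.append("my-app/0.1")
print(UserAgentSketch().value)  # a constant copied at import time would miss "my-app/0.1"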
@@ -35,7 +39,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  ~azure.ai.evaluation.OpenAIModelConfiguration]
  :param threshold: The threshold for the groundedness evaluator. Default is 3.
  :type threshold: int
-
+
  .. admonition:: Example:

  .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -54,13 +58,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a GroundednessEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START groundedness_evaluator]
  :end-before: [END groundedness_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -89,7 +93,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )
  self._model_config = model_config
  self.threshold = threshold
@@ -165,7 +169,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_model_config = construct_prompty_model_config(
  validate_model_config(self._model_config),
  self._DEFAULT_OPEN_API_VERSION,
- USER_AGENT,
+ UserAgentSingleton().value,
  )
  self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -3,6 +3,7 @@
  # ---------------------------------------------------------
  import os
  import math
+ import logging
  from typing import Dict, Union, List, Optional

  from typing_extensions import overload, override
@@ -10,9 +11,12 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation, Message
- from ..._common.utils import check_score_is_valid
+ from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
  from azure.ai.evaluation._common._experimental import experimental

+ logger = logging.getLogger(__name__)
+
+
  @experimental
  class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """
@@ -34,13 +38,13 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize and call an IntentResolutionEvaluator with a query and response.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START intent_resolution_evaluator]
  :end-before: [END intent_resolution_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -57,23 +61,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *,
- threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
  self,
  *,
- query : Union[str, List[dict]],
- response : Union[str, List[dict]],
- tool_definitions : Optional[Union[dict, List[dict]]] = None,
+ query: Union[str, List[dict]],
+ response: Union[str, List[dict]],
+ tool_definitions: Optional[Union[dict, List[dict]]] = None,
  ) -> Dict[str, Union[str, float]]:
  """Evaluate intent resolution for a given query, response and optional tool definitions.
  The query and response can be either a string or a list of messages.
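A minimal usage sketch for this overload, assuming an Azure OpenAI model configuration (endpoint, deployment, and key values are placeholders; the explicit threshold mirrors the default shown above):

from azure.ai.evaluation import IntentResolutionEvaluator

model_config = {
    "azure_endpoint": "https://<aoai-resource>.openai.azure.com",
    "azure_deployment": "<chat-deployment>",
    "api_key": "<api-key>",
}

intent_resolution = IntentResolutionEvaluator(model_config=model_config, threshold=3)
result = intent_resolution(
    query="What are your opening hours on weekends?",
    response="We are open 10 AM to 6 PM on Saturdays and Sundays.",
)
print(result)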
@@ -135,11 +135,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  category=ErrorCategory.MISSING_FIELD,
  target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
  )
+ # reformat query and response to the format expected by the prompty flow
+ eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+ eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+
  llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
  # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
  if isinstance(llm_output, dict):
- score = llm_output.get("resolution_score", math.nan)
- if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+ score = llm_output.get("score", math.nan)
+ if not check_score_is_valid(
+ score,
+ IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE,
+ IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE,
+ ):
  raise EvaluationException(
  message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
  internal_message="Invalid score value.",
@@ -148,19 +156,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  )
  reason = llm_output.get("explanation", "")
  score = float(score)
- score_result = 'pass' if score >= self.threshold else 'fail'
-
- #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
- if 'explanation' in llm_output: llm_output.pop("explanation")
- if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+ score_result = "pass" if score >= self.threshold else "fail"

  response_dict = {
- f"{self._result_key}" : score,
- f"{self._result_key}_result" : score_result,
- f"{self._result_key}_threshold" : self.threshold,
- f"{self._result_key}_reason" : reason,
- f"additional_details" : llm_output
- }
+ f"{self._result_key}": score,
+ f"{self._result_key}_result": score_result,
+ f"{self._result_key}_threshold": self.threshold,
+ f"{self._result_key}_reason": reason,
+ }
  return response_dict
  # If llm_output is not a dictionary, return NaN for the score. This should never happen
+ if logger:
+ logger.warning("LLM output is not a dictionary, returning NaN for the score.")
  return {self._result_key: math.nan}
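With additional_details dropped, the evaluator now returns exactly four keys derived from its result key. Assuming the result key is "intent_resolution" (not shown in this diff), a returned payload can be read as in the sketch below (values are illustrative):

# Illustrative result shape; the "intent_resolution" key prefix is assumed.
result = {
    "intent_resolution": 4.0,
    "intent_resolution_result": "pass",
    "intent_resolution_threshold": 3,
    "intent_resolution_reason": "The response directly answers the user's question about opening hours.",
}
if result["intent_resolution_result"] == "pass":
    print(f"score {result['intent_resolution']} met threshold {result['intent_resolution_threshold']}")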