azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@
  # ---------------------------------------------------------
  from enum import Enum

- from typing import Dict
+ from typing import Dict, Union
  from typing_extensions import overload, override

  from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
@@ -12,7 +12,7 @@ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  import math


- class RougeType(Enum):
+ class RougeType(str, Enum):
  """
  Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
  """
@@ -71,13 +71,13 @@ class RougeScoreEvaluator(EvaluatorBase):
  :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START rouge_score_evaluator]
  :end-before: [END rouge_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example with threshold:
@@ -95,17 +95,17 @@ class RougeScoreEvaluator(EvaluatorBase):

  @override
  def __init__(
- self,
+ self,
  rouge_type: RougeType,
  *,
  precision_threshold: float = 0.5,
  recall_threshold: float = 0.5,
- f1_score_threshold: float = 0.5
+ f1_score_threshold: float = 0.5,
  ):
  self._rouge_type = rouge_type
  self._higher_is_better = True
  super().__init__()
-
+
  # Type checking for threshold parameters
  for name, value in [
  ("precision_threshold", precision_threshold),
@@ -114,7 +114,7 @@ class RougeScoreEvaluator(EvaluatorBase):
  ]:
  if not isinstance(value, float):
  raise TypeError(f"{name} must be a float, got {type(value)}")
-
+
  self._threshold = {
  "precision": precision_threshold,
  "recall": recall_threshold,
@@ -122,10 +122,10 @@ class RougeScoreEvaluator(EvaluatorBase):
  }

  def _get_binary_result(
- self,
- rouge_precision: float,
- rouge_recall: float,
- rouge_f1_score: float,
+ self,
+ rouge_precision: float,
+ rouge_recall: float,
+ rouge_f1_score: float,
  ) -> Dict[str, bool]:
  """
  Get binary result based on the threshold.
@@ -150,22 +150,22 @@ class RougeScoreEvaluator(EvaluatorBase):
  precision_valid = not math.isnan(rouge_precision)
  recall_valid = not math.isnan(rouge_recall)
  f1_valid = not math.isnan(rouge_f1_score)
-
+
  if self._higher_is_better:
  if precision_valid:
- results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+ results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
  if recall_valid:
- results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+ results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
  if f1_valid:
- results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
+ results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
  else:
  if precision_valid:
- results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+ results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
  if recall_valid:
- results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+ results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
  if f1_valid:
- results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
-
+ results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
  return results

  @override
@@ -179,17 +179,17 @@ class RougeScoreEvaluator(EvaluatorBase):
  """
  ground_truth = eval_input["ground_truth"]
  response = eval_input["response"]
- scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
- metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+ scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+ metrics = scorer.score(ground_truth, response)[self._rouge_type]
  binary_results = {
  "rouge_precision_result": False,
  "rouge_recall_result": False,
  "rouge_f1_score_result": False,
  }
  # Convert metrics to floats, using nan for None or non-convertible values
- rouge_precision = float(metrics.precision) if metrics.precision is not None else float('nan')
- rouge_recall = float(metrics.recall) if metrics.recall is not None else float('nan')
- rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float('nan')
+ rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+ rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+ rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
  binary_results = self._get_binary_result(
  rouge_precision=rouge_precision,
  rouge_recall=rouge_recall,
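
Note on the hunks above: RougeType now subclasses str, so enum members can be passed straight to rouge_scorer.RougeScorer without taking .value, and RougeScoreEvaluator gains keyword-only precision, recall, and F1 thresholds. A minimal usage sketch based on the signature shown in this diff; the member name RougeType.ROUGE_4 and the result key names are assumptions rather than something the diff confirms:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # Thresholds mirror the new keyword-only parameters (each defaults to 0.5).
    evaluator = RougeScoreEvaluator(
        rouge_type=RougeType.ROUGE_4,  # assumed member name for a four-gram ROUGE type
        precision_threshold=0.6,
        recall_threshold=0.4,
        f1_score_threshold=0.5,
    )

    result = evaluator(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Expected keys (assumed): rouge_precision, rouge_recall, rouge_f1_score,
    # plus the pass/fail fields produced by _get_binary_result.
    print(result)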
@@ -24,9 +24,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
  :type threshold: int
  :param kwargs: Additional arguments to pass to the evaluator.
@@ -42,13 +42,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START groundedness_pro_evaluator]
  :end-before: [END groundedness_pro_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example with threshold:
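
Note on the docstring change above: azure_ai_project now accepts either the project endpoint URL or an AzureAIProject mapping. A hedged sketch of both forms; the endpoint is a placeholder and the mapping keys follow the AzureAIProject type as commonly documented:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    credential = DefaultAzureCredential()

    # New form: pass the project endpoint string directly.
    evaluator = GroundednessProEvaluator(
        credential=credential,
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )

    # Existing form: an AzureAIProject mapping with subscription id, resource group, and project name.
    evaluator = GroundednessProEvaluator(
        credential=credential,
        azure_ai_project={
            "subscription_id": "<subscription_id>",
            "resource_group_name": "<resource_group>",
            "project_name": "<project_name>",
        },
    )

The same Union[str, AzureAIProject] signature change appears again in the UngroundedAttributesEvaluator and IndirectAttackEvaluator hunks later in this diff.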
@@ -41,13 +41,13 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START similarity_evaluator]
  :end-before: [END similarity_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example:
@@ -85,7 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )

  # Ignoring a mypy error about having only 1 overload function.
@@ -13,6 +13,7 @@ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_sco
  from azure.ai.evaluation._model_configurations import Message
  from azure.ai.evaluation._common._experimental import experimental

+
  @experimental
  class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
@@ -42,15 +43,15 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :language: python
  :dedent: 8
  :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START task_adherence_evaluator]
  :end-before: [END task_adherence_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -65,14 +66,11 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
@@ -85,7 +83,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluate task adherence for a given query, response, and optional tool defintions.
  The query and response can be either a string or a list of messages.

-
+
  Example with string inputs and no tools:
  evaluator = TaskAdherenceEvaluator(model_config)
  query = "What is the weather today?"
@@ -113,9 +111,9 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):

  @override
  def __call__( # pylint: disable=docstring-missing-param
- self,
- *args,
- **kwargs,
+ self,
+ *args,
+ **kwargs,
  ):
  """
  Invokes the instance using the overloaded __call__ signature.
@@ -149,7 +147,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  if llm_output:
  score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

- score_result = 'pass' if score >= self.threshold else 'fail'
+ score_result = "pass" if score >= self.threshold else "fail"

  return {
  f"{self._result_key}": score,
@@ -159,4 +157,3 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  }

  return {self._result_key: math.nan}
-
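
Note on the hunks above: TaskAdherenceEvaluator keeps its constructor signature (model_config, *, threshold=..., **kwargs), and _do_eval now returns a "pass"/"fail" string alongside the 1-5 score. A short sketch in the spirit of the docstring example; the model configuration values and the task_adherence result-key name are assumptions:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, TaskAdherenceEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )

    evaluator = TaskAdherenceEvaluator(model_config, threshold=3)
    result = evaluator(
        query="What is the weather today?",
        response="The weather is sunny.",
    )
    # Expected keys (assumed): task_adherence (score), task_adherence_result ("pass"/"fail"),
    # task_adherence_threshold, and a reason string parsed from the LLM output.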
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)

  T_EvalValue = TypeVar("T_EvalValue")

+
  @experimental
  class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -46,13 +47,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize and call a ToolCallAccuracyEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START tool_call_accuracy_evaluator]
  :end-before: [END tool_call_accuracy_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -74,15 +75,11 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *,
- threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
@@ -90,8 +87,8 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  *,
  query: Union[str, List[dict]],
  tool_definitions: Union[dict, List[dict]],
- tool_calls: Union[dict, List[dict]] = None,
- response: Union[str, List[dict]] = None
+ tool_calls: Union[dict, List[dict]] = None,
+ response: Union[str, List[dict]] = None,
  ) -> Dict[str, Union[str, float]]:
  """
  Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
@@ -165,8 +162,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  if isinstance(response, list):
  for message in response:
  if message.get("role") == "assistant":
- tool_calls.extend([content for content in message.get("content")
- if content.get("type") == "tool_call"])
+ tool_calls.extend(
+ [content for content in message.get("content") if content.get("type") == "tool_call"]
+ )
  if len(tool_calls) == 0:
  raise EvaluationException(
  message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
@@ -185,7 +183,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
  # we need to provide the output of the previous tool call as part of messages.
  for tool_call in tool_calls:
- if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": # TODO assuming dict here but it can be a class
+ if (
+ isinstance(tool_call, dict) and tool_call.get("type") == "tool_call"
+ ): # TODO assuming dict here but it can be a class
  function_name = tool_call.get("name")
  tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
  if len(tool_definition) > 0:
@@ -228,7 +228,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  return {
  self._result_key: bool(float(score)),
  f"{self._result_key}_reason": reason,
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
  }
  raise EvaluationException(
  message="Tool call accuracy evaluator: Invalid score returned from LLM.",
@@ -248,13 +248,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # Convert inputs into list of evaluable inputs.
  eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
  if len(eval_input_list) == 0:
- return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
- f"{self._AGGREGATE_RESULT_KEY}_reason":
- "No tool calls were made.",
- "per_tool_call_details": []
- }
+ return {
+ self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason": "No tool calls were made.",
+ "per_tool_call_details": [],
+ }

  per_turn_results = []
  # Evaluate all inputs.
@@ -293,7 +293,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  return {
  f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
  f"{self._result_key}_reason": "Tool call not supported for evaluation",
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
  }

  def _aggregate_results(self, per_turn_results):
@@ -318,23 +318,32 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # Go over each turn, and rotate the results into a
  # metric: List[values] format for the evals_per_turn dictionary.

- num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
- if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+ num_evaluated = len(
+ [
+ per_turn_result
+ for per_turn_result in per_turn_results
+ if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
+ ]
+ )
  if num_evaluated == 0:
  # None of the invoked tools were applicable, return not applicable result
  # (If a tool fails evaluation, we'll throw an exception)
- return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
- f"{self._AGGREGATE_RESULT_KEY}_reason":
- "Tool call accuracy evaluation is not yet supported for the invoked tools.",
- "per_tool_call_details": []
- }
+ return {
+ self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+ "per_tool_call_details": [],
+ }
  # ignore not_applicable results, where the _result_key will be "not applicable"
- score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
+ score = (
+ sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
+ )
  aggregated[self._AGGREGATE_RESULT_KEY] = score
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+ aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
+ self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
+ )
+ aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
  aggregated["per_tool_call_details"] = per_turn_results
  return aggregated

@@ -14,11 +14,11 @@ model:

  inputs:
  query:
- type: List
+ type: array
  tool_call:
- type: Dict
+ type: object
  tool_definition:
- type: Dict
+ type: object

  ---
  system:
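
Note on the tool call accuracy hunks above (the evaluator module and its prompty): the evaluator now returns a structured "not applicable" aggregate when no tool calls can be evaluated, and otherwise averages per-call pass results against the threshold. A usage sketch built from the __call__ overload shown in this diff; the tool and tool-call payload shapes are illustrative assumptions:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )
    evaluator = ToolCallAccuracyEvaluator(model_config)

    tool_definitions = [
        {
            "name": "get_weather",
            "description": "Gets the weather for a city.",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        }
    ]
    tool_calls = [
        {
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "get_weather",
            "arguments": {"city": "Tokyo"},
        }
    ]

    result = evaluator(
        query="What's the weather in Tokyo?",
        tool_definitions=tool_definitions,
        tool_calls=tool_calls,
    )
    # The aggregate includes a pass/fail result, the threshold, and
    # "per_tool_call_details" with one entry per evaluated tool call.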
@@ -8,27 +8,28 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

+
  @experimental
  class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  """
- Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
- where query represents the user query and response represents the AI system response given the provided context.
-
- Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
+ Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
+ where query represents the user query and response represents the AI system response given the provided context.
+
+ Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
  emotional state of a person.


  It identifies the following attributes:
-
+
  - emotional_state
  - protected_class
  - groundedness

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param kwargs: Additional arguments to pass to the evaluator.
  :type kwargs: Any

@@ -42,13 +43,13 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START ungrounded_attributes_evaluator]
  :end-before: [END ungrounded_attributes_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -109,5 +110,5 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :return: The ungrounded attributes label.
  :rtype: Dict[str, Union[str, bool]]
  """
-
+
  return super().__call__(*args, **kwargs)
@@ -40,9 +40,9 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
- name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the IndirectAttack evaluator. Default is 0.
  :type threshold: int

@@ -54,15 +54,15 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :language: python
  :dedent: 8
  :caption: Initialize and call an IndirectAttackEvaluator.
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START indirect_attack_evaluator]
  :end-before: [END indirect_attack_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -9,6 +9,15 @@ from typing import Optional
  from azure.core.exceptions import AzureError


+ class ErrorMessage(Enum):
+ """Error messages to be used when raising EvaluationException.
+
+ These messages are used to provide a consistent error message format across the SDK.
+ """
+
+ MALFORMED_CONVERSATION_HISTORY = "Malformed Conversation History: Query parameter representing conversation history should have exactly one more user query than agent responses"
+
+
  class ErrorCategory(Enum):
  """Error category to be specified when using EvaluationException class.

@@ -87,6 +96,7 @@ class ErrorTarget(Enum):
  TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
  RED_TEAM = "RedTeam"
  AOAI_GRADER = "AoaiGrader"
+ CONVERSATION_HISTORY_PARSING = "_get_conversation_history"


  class EvaluationException(AzureError):
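
Note on the hunks above: the new ErrorMessage enum and the ErrorTarget.CONVERSATION_HISTORY_PARSING member standardize the error raised when a conversation history is malformed. A hedged sketch of how they might be raised together; the ErrorBlame and ErrorCategory members and the EvaluationException keywords follow the SDK's existing pattern and are assumptions here:

    from azure.ai.evaluation._exceptions import (
        ErrorBlame,
        ErrorCategory,
        ErrorMessage,
        ErrorTarget,
        EvaluationException,
    )

    def _get_conversation_history(query: list) -> None:
        user_turns = [m for m in query if m.get("role") == "user"]
        agent_turns = [m for m in query if m.get("role") == "assistant"]
        # The new message expects exactly one more user query than agent responses.
        if len(user_turns) != len(agent_turns) + 1:
            raise EvaluationException(
                message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY.value,
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.INVALID_VALUE,
                target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            )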
@@ -7,7 +7,7 @@ from typing import Any, Dict, MutableMapping, Optional, TypedDict, cast

  from typing_extensions import Self, Unpack

- from azure.ai.evaluation._user_agent import USER_AGENT
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.core.configuration import Configuration
  from azure.core.pipeline import AsyncPipeline, Pipeline
  from azure.core.pipeline.policies import (
@@ -454,7 +454,7 @@ def get_http_client(**kwargs: Any) -> HttpPipeline:
  :returns: An HttpPipeline with a set of applied policies:
  :rtype: HttpPipeline
  """
- kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+ kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
  return HttpPipeline(**kwargs)


@@ -464,5 +464,5 @@ def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
  :returns: An AsyncHttpPipeline with a set of applied policies:
  :rtype: AsyncHttpPipeline
  """
- kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+ kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
  return AsyncHttpPipeline(**kwargs)
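
Note on the hunks above: the module-level USER_AGENT constant is replaced by UserAgentSingleton, whose .value is read when the pipeline is constructed rather than at import time. A minimal sketch of the consuming pattern; only the .value property is confirmed by this diff:

    from azure.core.pipeline.policies import UserAgentPolicy
    from azure.ai.evaluation._user_agent import UserAgentSingleton

    # Read the user agent lazily from the singleton when building the policy,
    # instead of baking in a module-level constant at import time.
    policy = UserAgentPolicy(base_user_agent=UserAgentSingleton().value)
    print(UserAgentSingleton().value)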
@@ -319,9 +319,9 @@ class BatchEngine:
  # to maximize the parallelism, we run the synchronous function in a separate thread
  # and await its result
  output = await asyncio.get_event_loop().run_in_executor(
- self._executor,
- partial(self._func, **inputs))
-
+ self._executor, partial(self._func, **inputs)
+ )
+
  # This should in theory never happen but as an extra precaution, let's check if the output
  # is awaitable and await it if it is.
@@ -90,7 +90,9 @@ def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
  except ImportError:
  raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
  except AttributeError:
- logging.warning("The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name)
+ logging.warning(
+ "The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name
+ )


  def inject_openai_api():
@@ -117,6 +119,7 @@ def recover_openai_api():


  class CaptureOpenAITokenUsage:
  """Context manager to capture OpenAI token usage."""
+
  def __init__(self):
  self._tokens = TokenMetrics(0, 0, 0)
@@ -126,4 +129,4 @@ class CaptureOpenAITokenUsage:

  def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
  captured_metrics = _token_metrics.get()
- self._tokens.update(captured_metrics)
+ self._tokens.update(captured_metrics)
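
Note on the hunks above: they touch the OpenAI injection helpers and the CaptureOpenAITokenUsage context manager. A hedged usage sketch; whether __enter__ returns the metrics object is not shown in this diff, so the sketch keeps a reference to the context manager itself, and run_my_evaluation is a placeholder defined inline:

    from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
        CaptureOpenAITokenUsage,
        inject_openai_api,
        recover_openai_api,
    )

    def run_my_evaluation() -> None:
        """Placeholder for code that drives an OpenAI-backed evaluator."""

    capture = CaptureOpenAITokenUsage()
    inject_openai_api()  # patch the OpenAI client methods so token usage is recorded
    try:
        with capture:
            run_my_evaluation()
    finally:
        recover_openai_api()
    # capture now holds the TokenMetrics accumulated while the block was active.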