azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
- azure/ai/evaluation/_aoai/label_grader.py +2 -2
- azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
- azure/ai/evaluation/_common/__init__.py +3 -1
- azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
- azure/ai/evaluation/_common/onedp/operations/_operations.py +4 -2
- azure/ai/evaluation/_common/rai_service.py +7 -6
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +15 -17
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +14 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +264 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +503 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +69 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +237 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -0
- azure/ai/evaluation/red_team/_red_team.py +572 -207
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +570 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +5 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +2 -2
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +3 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +15 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/METADATA +35 -3
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/RECORD +69 -61
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_converters/_models.py

@@ -3,10 +3,18 @@ import json
 
 from pydantic import BaseModel
 
-from azure.ai.projects.models import RunStepFunctionToolCall
-
 from typing import List, Optional, Union
 
+# Models moved in a later version of agents SDK, so try a few different locations
+try:
+    from azure.ai.projects.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+try:
+    from azure.ai.agents.models import RunStepFunctionToolCall
+except ImportError:
+    pass
+
 # Message roles constants.
 _SYSTEM = "system"
 _USER = "user"
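The new import block above tolerates the agents models moving between SDK packages. A minimal standalone sketch of the same fallback pattern; the sentinel and the final guard are illustrative additions, not part of the package:

    RunStepFunctionToolCall = None  # illustrative sentinel for "no agents SDK installed"
    try:
        from azure.ai.projects.models import RunStepFunctionToolCall  # older location
    except ImportError:
        pass
    try:
        from azure.ai.agents.models import RunStepFunctionToolCall  # newer location
    except ImportError:
        pass

    if RunStepFunctionToolCall is None:
        # Neither package is available; callers should degrade gracefully here.
        print("agents models unavailable; tool-call conversion will be limited")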
@@ -21,6 +29,57 @@ _FUNCTION = "function"
 
 # This is returned by AI services in the API to filter against tool invocations.
 _TOOL_CALLS = "tool_calls"
 
+# Constants to only be used internally in this file for the built-in tools.
+_CODE_INTERPRETER = "code_interpreter"
+_BING_GROUNDING = "bing_grounding"
+_FILE_SEARCH = "file_search"
+_AZURE_AI_SEARCH = "azure_ai_search"
+_FABRIC_DATAAGENT = "fabric_dataagent"
+
+# Built-in tool descriptions and parameters are hidden, but we include basic descriptions
+# for evaluation purposes.
+_BUILT_IN_DESCRIPTIONS = {
+    _CODE_INTERPRETER: "Use code interpreter to read and interpret information from datasets, "
+    + "generate code, and create graphs and charts using your data. Supports "
+    + "up to 20 files.",
+    _BING_GROUNDING: "Enhance model output with web data.",
+    _FILE_SEARCH: "Search for data across uploaded files.",
+    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
+}
+
+# Built-in tool parameters are hidden, but we include basic parameters for evaluation purposes.
+_BUILT_IN_PARAMS = {
+    _CODE_INTERPRETER: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Generated code to be executed."}},
+    },
+    _BING_GROUNDING: {
+        "type": "object",
+        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
+    },
+    _FILE_SEARCH: {
+        "type": "object",
+        "properties": {
+            "ranking_options": {
+                "type": "object",
+                "properties": {
+                    "ranker": {"type": "string", "description": "Ranking algorithm to use."},
+                    "score_threshold": {"type": "number", "description": "Threshold for search results."},
+                },
+                "description": "Ranking options for search results.",
+            }
+        },
+    },
+    _AZURE_AI_SEARCH: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+    _FABRIC_DATAAGENT: {
+        "type": "object",
+        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
+    },
+}
 
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
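The two dictionaries above give built-in tools a description and a parameter schema even though the service hides them. A hedged sketch of how such metadata could be folded into a tool-definition dict for evaluation; the helper name and the trimmed one-entry dictionaries are invented for illustration:

    # Illustration only: trimmed stand-ins for the _BUILT_IN_DESCRIPTIONS / _BUILT_IN_PARAMS dicts above.
    DESCRIPTIONS = {"bing_grounding": "Enhance model output with web data."}
    PARAMS = {
        "bing_grounding": {
            "type": "object",
            "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
        }
    }

    def describe_built_in_tool(tool_type: str) -> dict:
        # Hypothetical helper: the shape mirrors the ToolDefinition fields
        # (name, type, description, parameters) used elsewhere in this module.
        return {
            "name": tool_type,
            "type": tool_type,
            "description": DESCRIPTIONS.get(tool_type, ""),
            "parameters": PARAMS.get(tool_type, {}),
        }

    print(describe_built_in_tool("bing_grounding")["description"])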
@@ -98,6 +157,8 @@ class ToolDefinition(BaseModel):
 
     :param name: The name of the tool.
     :type name: str
+    :param type: The type of the tool.
+    :type type: str
     :param description: A description of the tool.
     :type description: str
     :param parameters: The parameters required by the tool.

@@ -105,6 +166,7 @@ class ToolDefinition(BaseModel):
     """
 
     name: str
+    type: str
     description: Optional[str] = None
     parameters: dict
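ToolDefinition is a pydantic model, so the added `type` field becomes a required string on construction. A small self-contained sketch using a stand-in model with the same fields; all values are illustrative:

    from typing import Optional
    from pydantic import BaseModel

    class ToolDefinitionSketch(BaseModel):
        # Mirrors the fields shown in the hunk above: name, type, description, parameters.
        name: str
        type: str
        description: Optional[str] = None
        parameters: dict

    tool = ToolDefinitionSketch(
        name="fetch_weather",  # illustrative values throughout
        type="function",
        description="Fetch the weather for a location.",
        parameters={"type": "object", "properties": {"location": {"type": "string"}}},
    )
    print(tool.type)  # -> function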
@@ -191,6 +253,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
         arguments = {
             "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
         }
+    elif tool_call.details["type"] == "azure_ai_search":
+        arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+    elif tool_call.details["type"] == "fabric_dataagent":
+        arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
     else:
         # unsupported tool type, skip
         return messages
@@ -211,17 +277,17 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))
 
     if hasattr(tool_call.details, _FUNCTION):
-        output = safe_loads(tool_call.details.function
+        output = safe_loads(tool_call.details.function["output"])
     else:
         try:
             # Some built-ins may have output, others may not
             # Try to retrieve it, but if we don't find anything, skip adding the message
             # Just manually converting to dicts for easy serialization for now rather than custom serializers
-            if tool_call.details.type ==
+            if tool_call.details.type == _CODE_INTERPRETER:
                 output = tool_call.details.code_interpreter.outputs
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _BING_GROUNDING:
                 return messages  # not supported yet from bing grounding tool
-            elif tool_call.details.type ==
+            elif tool_call.details.type == _FILE_SEARCH:
                 output = [
                     {
                         "file_id": result.file_id,
@@ -231,6 +297,10 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
                     }
                     for result in tool_call.details.file_search.results
                 ]
+            elif tool_call.details.type == _AZURE_AI_SEARCH:
+                output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _FABRIC_DATAAGENT:
+                output = tool_call.details.fabric_dataagent["output"]
         except:
             return messages
@@ -16,6 +16,7 @@ from azure.ai.evaluation import (
|
|
|
16
16
|
CodeVulnerabilityEvaluator,
|
|
17
17
|
CoherenceEvaluator,
|
|
18
18
|
ContentSafetyEvaluator,
|
|
19
|
+
DocumentRetrievalEvaluator,
|
|
19
20
|
F1ScoreEvaluator,
|
|
20
21
|
FluencyEvaluator,
|
|
21
22
|
GleuScoreEvaluator,
|
|
@@ -45,6 +46,7 @@ EVAL_CLASS_MAP = {
     CodeVulnerabilityEvaluator: "code_vulnerability",
     CoherenceEvaluator: "coherence",
     ContentSafetyEvaluator: "content_safety",
+    DocumentRetrievalEvaluator: "document_retrieval",
     ECIEvaluator: "eci",
     F1ScoreEvaluator: "f1_score",
     FluencyEvaluator: "fluency",
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -141,7 +141,6 @@ def _aggregate_content_safety_metrics(
     module = inspect.getmodule(evaluators[evaluator_name])
     if (
         module
-        and module.__name__.startswith("azure.ai.evaluation.")
         and metric_name.endswith("_score")
         and metric_name.replace("_score", "") in content_safety_metrics
     ):
@@ -739,7 +738,17 @@ def evaluate(
            :end-before: [END evaluate_method]
            :language: python
            :dedent: 8
-           :caption: Run an evaluation on local data with
+           :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
     try:
         return _evaluate(
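The added docstring example documents passing the project endpoint URL in place of an AzureAIProject object. A hedged sketch of what such a call might look like; the dataset path, model configuration, and URL placeholders are illustrative, and the model config is incomplete without credentials:

    from azure.ai.evaluation import evaluate, CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<aoai-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<chat-deployment>",                       # placeholder
        # api_key or Entra ID credentials omitted here
    }

    result = evaluate(
        data="data.jsonl",  # JSONL rows with the columns the evaluators expect
        evaluators={"coherence": CoherenceEvaluator(model_config=model_config)},
        # Either an AzureAIProject dict or, per the new docs, a project URL of the form
        # https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )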
@@ -978,17 +987,6 @@ def _preprocess_data(
     # Split normal evaluators and OAI graders
     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
 
-    input_data_df = _validate_and_load_data(
-        target,
-        data,
-        evaluators_and_graders,
-        output_path,
-        azure_ai_project,
-        evaluation_name
-    )
-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
     target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
     batch_run_client: BatchClient
@@ -1135,8 +1133,8 @@ def _map_names_to_builtins(
 ) -> Dict[str, str]:
     """
     Construct a mapping from user-supplied evaluator names to which known, built-in
-    evaluator or grader they refer to. Custom
-
+    evaluator or grader they refer to. Custom evaluators are excluded from the mapping
+    as we only want to track built-in evaluators and graders.
 
     :param evaluators: The dictionary of evaluators.
     :type evaluators: Dict[str, Callable]
@@ -1158,8 +1156,8 @@ def _map_names_to_builtins(
             found_eval = True
             break
         if not found_eval:
-            #
-
+            # Skip custom evaluators - we only want to track built-in evaluators
+            pass
 
     for name, grader in graders.items():
         name_map[name] = grader.id
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -208,7 +208,7 @@ def _get_single_run_results(
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-            + " failed with status {run_results.status}.",
+            + f" failed with status {run_results.status}.",
             blame=ErrorBlame.UNKNOWN,
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
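The one-character change matters: without the `f` prefix the braces in the second fragment are emitted literally instead of interpolating the run status. A quick standalone illustration:

    from types import SimpleNamespace

    run_results = SimpleNamespace(status="failed")

    # Before the fix: plain string, braces survive verbatim.
    print("Run 1/2" + " failed with status {run_results.status}.")
    # Run 1/2 failed with status {run_results.status}.

    # After the fix: the f-string interpolates the actual status.
    print("Run 1/2" + f" failed with status {run_results.status}.")
    # Run 1/2 failed with status failed.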
@@ -217,6 +217,16 @@ def _get_single_run_results(
         + " completed successfully. Gathering results...")
     # Convert run results into a dictionary of metrics
     run_metrics = {}
+    if run_results.per_testing_criteria_results is None:
+        msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
+            " occur when invalid or conflicting models are selected in the model and grader configs."
+            f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.FAILED_EXECUTION,
+            target=ErrorTarget.AOAI_GRADER,
+        )
     for criteria_result in run_results.per_testing_criteria_results:
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
@@ -240,8 +250,12 @@ def _get_single_run_results(
         eval_id=run_info["eval_group_id"],
         run_id=run_info["eval_run_id"]
     )
-    listed_results = {}
+    listed_results = {"index": []}
+    # raw data has no order guarantees, we need to sort them by their
+    # datasource_item_id
     for row_result in raw_list_results.data:
+        # Add the datasource_item_id for later sorting
+        listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
|
|
|
251
265
|
# create a `_result` column for each grader
|
|
252
266
|
result_column_name = f"outputs.{grader_name}.{grader_name}_result"
|
|
253
267
|
if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
|
|
254
|
-
|
|
268
|
+
if (result_column_name not in listed_results):
|
|
269
|
+
listed_results[result_column_name] = []
|
|
270
|
+
listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
|
|
255
271
|
|
|
256
272
|
formatted_column_name = f"outputs.{grader_name}.{name}"
|
|
257
273
|
if (formatted_column_name not in listed_results):
|
|
258
274
|
listed_results[formatted_column_name] = []
|
|
259
|
-
listed_results[
|
|
275
|
+
listed_results[formatted_column_name].append(value)
|
|
260
276
|
output_df = pd.DataFrame(listed_results)
|
|
261
|
-
|
|
277
|
+
# sort by index
|
|
278
|
+
output_df = output_df.sort_values('index', ascending=[True])
|
|
279
|
+
# remove index column
|
|
280
|
+
output_df.drop(columns=["index"], inplace=True)
|
|
262
281
|
return output_df, run_metrics
|
|
263
282
|
|
|
264
283
|
|
|
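Because the listed AOAI row results come back in no guaranteed order, the new code tags each row with its datasource_item_id, sorts on that helper column, and then drops it. A small self-contained sketch of that pattern with toy data:

    import pandas as pd

    # Toy stand-in for rows returned out of order by the results API.
    listed_results = {"index": [2, 0, 1], "outputs.grader.score": [0.2, 0.9, 0.5]}

    output_df = pd.DataFrame(listed_results)
    output_df = output_df.sort_values("index", ascending=True)  # restore dataset order
    output_df.drop(columns=["index"], inplace=True)             # remove the helper column
    print(output_df.reset_index(drop=True))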
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
            :end-before: [END bleu_score_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call an BleuScoreEvaluator.
+           :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
            :end-before: [END code_vulnerability_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START code_vulnerability_evaluator]
+            :end-before: [END code_vulnerability_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :end-before: [END coherence_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :end-before: [END threshold_coherence_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and
+           :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :end-before: [END content_safety_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
+           :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
     .. admonition:: Example with Threshold:
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :end-before: [END threshold_content_safety_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and call a ContentSafetyEvaluator.
+           :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
     id = "content_safety"
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :end-before: [END hate_unfairness_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
-
-    .. admonition:: Example
+           :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START hate_unfairness_evaluator]
+            :end-before: [END hate_unfairness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
+    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_hate_unfairness_evaluator]
            :end-before: [END threshold_hate_unfairness_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
+           :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
     id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :end-before: [END self_harm_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/
-            :start-after: [START
-            :end-before: [END
+           :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START self_harm_evaluator]
+            :end-before: [END self_harm_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize
+           :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
     id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize and call a SexualEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START sexual_evaluator]
+            :end-before: [END sexual_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize and call a ViolenceEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START violence_evaluator]
+            :end-before: [END violence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -4,7 +4,7 @@
 import math
 import operator
 from itertools import starmap
-from typing import Dict, List, TypedDict, Tuple, Optional
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
 from azure.ai.evaluation._exceptions import EvaluationException
 from typing_extensions import override, overload
@@ -30,8 +30,18 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
            :end-before: [END document_retrieval_evaluator]
            :language: python
            :dedent: 8
-           :caption: Initialize and call a
+           :caption: Initialize and call a DocumentRetrievalEvaluator
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_document_retrieval_evaluator]
@@ -46,7 +56,13 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         *,
         ground_truth_label_min: int = 0,
         ground_truth_label_max: int = 4,
-
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50
     ):
         super().__init__()
         self.k = 3
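The constructor now takes one keyword threshold per metric rather than a single threshold dict. A hedged, construction-only sketch with the new arguments; the threshold values are illustrative, not recommended settings:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_retrieval_eval = DocumentRetrievalEvaluator(
        ground_truth_label_min=0,
        ground_truth_label_max=4,
        ndcg_threshold=0.6,                        # illustrative values throughout
        xdcg_threshold=60.0,
        fidelity_threshold=0.6,
        top1_relevance_threshold=60.0,
        top3_max_relevance_threshold=60.0,
        total_retrieved_documents_threshold=40,
        total_ground_truth_documents_threshold=40,
    )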
@@ -71,27 +87,19 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         self.ground_truth_label_max = ground_truth_label_max
 
         # The default threshold for metrics where higher numbers are better.
-        self._threshold_metrics = {
-            "ndcg@3":
-            "xdcg@3":
-            "fidelity":
-            "top1_relevance":
-            "top3_max_relevance":
-            "total_retrieved_documents":
-            "total_ground_truth_documents":
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
         }
 
         # Ideally, the number of holes should be zero.
         self._threshold_holes = {"holes": 0, "holes_ratio": 0}
 
-        if threshold and not isinstance(threshold, dict):
-            raise EvaluationException(
-                f"Threshold must be a dictionary, got {type(threshold)}"
-            )
-
-        elif isinstance(threshold, dict):
-            self._threshold_metrics.update(threshold)
-
     def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
         """
         The number of documents retrieved from a search query which have no provided ground-truth label.
@@ -214,22 +222,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
 
     def _get_binary_result(self, **metrics) -> Dict[str, float]:
-        result = {}
+        result: Dict[str, Any] = {}
 
         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] =
-
-                )
-                result[f"{metric_name}_threshold"] = self._threshold_metrics[
-                    metric_name
-                ]
+                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True
 
             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] =
-                    metric_value <= self._threshold_holes[metric_name]
-                )
+                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False
 
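With this rewrite each metric yields a string "pass"/"fail" result next to its threshold and direction. A detached, self-contained sketch of the same comparison logic, with illustrative thresholds:

    from typing import Any, Dict

    threshold_metrics = {"ndcg@3": 0.5, "fidelity": 0.5}  # higher is better
    threshold_holes = {"holes": 0, "holes_ratio": 0}       # lower is better

    def get_binary_result(metrics: Dict[str, float]) -> Dict[str, Any]:
        result: Dict[str, Any] = {}
        for name, value in metrics.items():
            if name in threshold_metrics:
                result[f"{name}_result"] = "pass" if value >= threshold_metrics[name] else "fail"
                result[f"{name}_threshold"] = threshold_metrics[name]
                result[f"{name}_higher_is_better"] = True
            elif name in threshold_holes:
                result[f"{name}_result"] = "pass" if value <= threshold_holes[name] else "fail"
                result[f"{name}_threshold"] = threshold_holes[name]
                result[f"{name}_higher_is_better"] = False
        return result

    print(get_binary_result({"ndcg@3": 0.72, "holes": 1}))
    # {'ndcg@3_result': 'pass', ..., 'holes_result': 'fail', ...}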
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -38,6 +38,16 @@ class F1ScoreEvaluator(EvaluatorBase):
            :dedent: 8
            :caption: Initialize and call an F1ScoreEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -44,6 +44,16 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :dedent: 8
            :caption: Initialize with threshold and call a FluencyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -43,6 +43,16 @@ class GleuScoreEvaluator(EvaluatorBase):
            :language: python
            :dedent: 8
            :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"