azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation was flagged as possibly problematic by the registry.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
- azure/ai/evaluation/_evaluate/_utils.py +1 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -2
- azure/ai/evaluation/red_team/_red_team.py +838 -478
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -7,7 +7,9 @@ import json
 import logging
 import os
 import re
-
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -611,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)

@@ -618,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
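The reindex above is what back-fills rows the target function failed on: the output frame is aligned to the full input range, and any missing row becomes NaN. A minimal standalone pandas sketch of the same pattern (toy column names, not the SDK's):

    import pandas as pd

    # Toy stand-ins: 4 input rows, but the target produced output only for rows 0, 1 and 3.
    initial_data = pd.DataFrame({"query": ["a", "b", "c", "d"]})
    target_output = pd.DataFrame(
        {"line_number": [0, 1, 3], "outputs.response": ["ra", "rb", "rd"]}
    ).set_index("line_number")

    # Align to the full 0..n-1 range, like reindex(complete_index) above; row 2 becomes NaN.
    complete_index = pd.RangeIndex(len(initial_data))
    target_output = target_output.reindex(complete_index)

    # Both frames now have 4 rows, so a column-wise concat lines up row for row.
    merged = pd.concat([initial_data, target_output.reset_index(drop=True)], axis=1)
    print(merged)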
@@ -626,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)

     return target_output, generated_columns, run

@@ -645,7 +666,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
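The replacement pattern widens what a column-mapping reference may look like: `${data.<name>}` and `${target.<name>}` references can now contain dot-separated sub-fields. A quick standalone check of the new regex (the example references are illustrative):

    import re

    # Updated pattern from the hunk above.
    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

    for ref in ["${data.query}", "${data.item.context}", "${target.response}", "${oops.query}"]:
        print(ref, bool(expected_references.match(ref)))
    # ${data.query} True / ${data.item.context} True / ${target.response} True / ${oops.query} False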
@@ -855,6 +876,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
         **kwargs,
     )

@@ -962,6 +984,7 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
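These two hunks thread `fail_on_evaluator_errors` from `_evaluate` into `_preprocess_data`, where it later feeds `RunSubmitterClient(raise_on_errors=...)`. A hedged usage sketch, assuming the public `evaluate()` entry point exposes the same flag that is being threaded here (dataset path and evaluator choice are placeholders):

    # Hedged sketch: assumes evaluate() accepts fail_on_evaluator_errors,
    # as the internal plumbing in this hunk suggests.
    from azure.ai.evaluation import evaluate, BleuScoreEvaluator

    result = evaluate(
        data="eval_data.jsonl",                      # hypothetical local dataset
        evaluators={"bleu": BleuScoreEvaluator()},
        fail_on_evaluator_errors=True,               # raise instead of emitting NaN rows
    )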
@@ -995,15 +1018,49 @@ def _preprocess_data(
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-
-
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df

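`get_client_type` chooses among three batch clients based on two private kwargs. A standalone restatement of the same decision rules (not the SDK's code), useful for reading the truth table at a glance:

    from typing import Literal, Optional

    def pick_client(use_run_submitter: Optional[bool], use_pf: Optional[bool]) -> Literal["run_submitter", "pf_client", "code_client"]:
        """Restates the selection rules of get_client_type above with plain arguments."""
        if use_run_submitter is None and use_pf is None:
            return "run_submitter"            # default when neither flag is set
        if use_run_submitter and use_pf:
            raise ValueError("Only one of the two flags may be True.")
        if use_run_submitter is False and use_pf is False:
            return "code_client"
        if use_run_submitter:
            return "run_submitter"
        if use_pf:
            return "pf_client"
        # one flag is None, the other False: fall back to the remaining client
        return "run_submitter" if use_pf is False else "pf_client"

    print(pick_client(None, None))   # run_submitter
    print(pick_client(None, True))   # pf_client
    print(pick_client(False, None))  # pf_client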
@@ -1013,17 +1070,50 @@ def _preprocess_data(
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )

-
-
-
-
-
-
-
-
-
-
-
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference

     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
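The ProxyClient branch serializes the complete, already-merged dataframe to a temporary JSONL file so evaluators see every row, including the NaN-filled failures. A minimal sketch of that serialization step in isolation (toy dataframe; in the SDK the file is deleted later in a finally block):

    import json
    import tempfile

    import pandas as pd

    # Toy frame standing in for the merged input_data_df (row 1 has a NaN output).
    df = pd.DataFrame({"query": ["a", "b"], "__outputs.response": ["ok", float("nan")]})

    # Same pattern as the hunk above: one JSON object per line in a temp .jsonl file.
    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    for _, row in df.iterrows():
        temp_file.write(json.dumps(row.to_dict()) + "\n")
    temp_file.close()

    print(temp_file.name)  # the caller is responsible for removing this file afterwards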
@@ -1062,30 +1152,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }

-
-
-
-
-
-
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
            }
-            for evaluator_name, run in runs.items()
-        }

+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
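For orientation, `per_evaluator_results` ends up with one entry per evaluator, keyed by the three fields shown above. A toy illustration of its shape (the evaluator name and metric values are made up; `result` is really the DataFrame returned by `get_details(..., all_results=True)`):

    per_evaluator_results = {
        "my_evaluator": {                      # hypothetical evaluator name
            "result": "<pandas DataFrame from batch_run_client.get_details(run, all_results=True)>",
            "metrics": {"some_metric": 0.87},  # made-up metric name and value
            "run_summary": {"completed_lines": 9, "failed_lines": 1},
        },
    }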
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -353,6 +353,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader,
     )

     id_map = {

@@ -361,6 +362,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
     }

     for key in id_map.keys():
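These two hunks register the new `AzureOpenAIPythonGrader` (implemented in the new `_aoai/python_grader.py` module listed above) in `_get_grader_class`'s id-to-class map. A hedged sketch of the lookup idiom with stand-in classes, since the grader's actual constructor is not part of this diff:

    # Stand-in classes only; the real AzureOpenAIPythonGrader lives in
    # azure/ai/evaluation/_aoai/python_grader.py and is not shown in this diff.
    class StubScoreModelGrader:
        id = "aoai://score_model"   # hypothetical id value

    class StubPythonGrader:
        id = "aoai://python"        # hypothetical id value

    id_map = {cls.id: cls for cls in (StubScoreModelGrader, StubPythonGrader)}

    def get_grader_class(model_id: str):
        """Mirror of the lookup idiom in _get_grader_class: resolve a grader class by its id."""
        if model_id not in id_map:
            raise ValueError(f"Unknown grader id: {model_id}")
        return id_map[model_id]

    print(get_grader_class("aoai://python").__name__)  # StubPythonGrader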
azure/ai/evaluation/_evaluate/_utils.py

@@ -178,7 +178,6 @@ def _log_metrics_and_instance_results_onedp(

     properties = {
         EvaluationRunProperties.RUN_TYPE: "eval_run",
-        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
         EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
     }

@@ -191,6 +190,7 @@ def _log_metrics_and_instance_results_onedp(
     upload_run_response = client.start_evaluation_run(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
+            properties=properties,
         )
     )

@@ -202,7 +202,6 @@ def _log_metrics_and_instance_results_onedp(
             outputs={
                 "evaluationResultId": create_evaluation_result_response.id,
             },
-            properties=properties,
         ),
     )

azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
             :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self, *, threshold=0.5):
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -79,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         for the code vulnerability will be "code_vulnerability_label".
     """

-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )

     @overload
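`CodeVulnerabilityEvaluator` now accepts `**kwargs` and defaults `evaluate_query` to True. A hedged usage sketch (the project endpoint is a placeholder; pass `evaluate_query=False` to keep the old response-only behaviour):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CodeVulnerabilityEvaluator

    evaluator = CodeVulnerabilityEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
        evaluate_query=False,  # opt out of the new default
    )
    result = evaluator(
        query="Write a SQL query builder",
        response="def build(q): return 'SELECT ' + q",
    )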
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -62,7 +62,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"

-    id = "
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
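Across this release the built-in evaluators move their cloud `id` to a uniform `azureai://built-in/evaluators/<name>` scheme. A quick check against two of the classes changed above (requires azure-ai-evaluation 1.10.0 installed):

    from azure.ai.evaluation import BleuScoreEvaluator, CoherenceEvaluator

    # id is a class attribute, so no instantiation is needed.
    print(BleuScoreEvaluator.id)   # azureai://built-in/evaluators/bleu_score
    print(CoherenceEvaluator.id)   # azureai://built-in/evaluators/coherence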
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,34 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)

 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._exceptions import
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental

@@ -176,7 +196,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         singletons.extend([p for p in params if p != "self"])
         return singletons

-    def _derive_conversation_converter(
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.

@@ -235,7 +257,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

-    def _derive_multi_modal_conversation_converter(
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.

@@ -288,7 +312,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return multi_modal_converter

-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
         Default is ~azure.ai.evaluation._AggregationType.MEAN.
     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     :param threshold: The threshold for the evaluation. Default is 3.
     :type threshold: Optional[int]
     :param _higher_is_better: If True, higher scores are better. Default is True.
     :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """

     @override

@@ -56,6 +59,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,

@@ -67,6 +71,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override

@@ -101,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)

         conversation = eval_input.get("conversation", None)

@@ -131,17 +138,20 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         return result

     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query =
-        response =
-        if
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)

         if "context" in self._singleton_inputs:
             context = eval_input.get("context", None)
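With the change above, the RAI-service payload contains the query only when the evaluator was constructed with `evaluate_query=True`. A standalone restatement of the payload assembly (not SDK code):

    def build_input_data(query, response, evaluate_query: bool) -> dict:
        """Mirrors the input_data construction in _evaluate_query_response above."""
        input_data = {"response": str(response)}
        if query is not None and evaluate_query:
            input_data["query"] = str(query)
        return input_data

    print(build_input_data("How do I ...?", "You could ...", evaluate_query=False))
    # {'response': 'You could ...'}
    print(build_input_data("How do I ...?", "You could ...", evaluate_query=True))
    # {'response': 'You could ...', 'query': 'How do I ...?'}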
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     :type self_harm_threshold: int
     :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
     :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.

@@ -66,8 +68,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """

-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     def __init__(
         self,

@@ -90,11 +93,18 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)

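`ContentSafetyEvaluator` forwards `evaluate_query` to all four child evaluators, as shown above. A hedged usage sketch (the project endpoint is a placeholder):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    safety = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",  # placeholder
        evaluate_query=True,  # include the query in each sub-evaluation
    )
    scores = safety(
        query="Tell me about fireworks safety.",
        response="Always follow local regulations and keep a safe distance.",
    )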
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """

-    id = "
+    id = "azureai://built-in/evaluators/hate_unfairness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,

@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """

-    id = "
+    id = "azureai://built-in/evaluators/self_harm"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,

@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a SexualEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/sexual"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,

@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload

@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The
+        :return: The sexual score.
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a ViolenceEvaluator.
     """

-    id = "
+    id = "azureai://built-in/evaluators/violence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(

@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,

@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload