azure-ai-evaluation 1.3.0-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +27 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +39 -5
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +23 -3
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_red_team.py +1887 -0
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/red_team/_utils/constants.py +65 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
- azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -8,15 +8,21 @@ import inspect
 import logging
 import math
 import os
+from datetime import datetime
 from collections import OrderedDict
 from concurrent.futures import Future
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
+from azure.ai.evaluation._legacy._adapters.entities import Run
+from azure.ai.evaluation._legacy._adapters._configuration import Configuration
+from azure.ai.evaluation._legacy._adapters.client import PFClient
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
 import pandas as pd
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
+
+
+Configuration.get_instance().set_config("trace.destination", "none")
 LOGGER = logging.getLogger(__name__)
 
 
@@ -26,46 +32,56 @@ class ProxyRun:
 
 
 class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
-    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
-        self,
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
+        self,
+        **kwargs: Any,
     ) -> None:
-        self._pf_client =
-        self._thread_pool =
+        self._pf_client = PFClient(**kwargs)
+        self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")
 
     def run(
         self,
-        flow:
-        data: Union[str, os.PathLike],
+        flow: Callable,
+        data: Union[str, os.PathLike, pd.DataFrame],
         column_mapping: Optional[Dict[str, str]] = None,
-
+        evaluator_name: Optional[str] = None,
+        **kwargs: Any,
     ) -> ProxyRun:
-
-
+        if isinstance(data, pd.DataFrame):
+            raise ValueError("Data cannot be a pandas DataFrame")
+
+        flow_to_run: Callable = flow
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and isinstance(flow, HasAsyncCallable):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
+        name: str = kwargs.pop("name", "")
+        if not name:
+            name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
+
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
             self._pf_client.run,
             flow_to_run,
             data=data,
-            column_mapping=column_mapping,
+            column_mapping=column_mapping,  # type: ignore
            batch_use_async=batch_use_async,
-
+            name=name,
+            **kwargs,
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self,
-        run: Run =
+    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = self.get_result(client_run)
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
-    def get_metrics(self,
-        run: Run =
+    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
         return self._pf_client.get_metrics(run)
 
-    def get_run_summary(self,
-        run =
+    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+        run: Run = self.get_result(client_run)
 
         # pylint: disable=protected-access
         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
@@ -81,13 +97,17 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         return OrderedDict(
             [
                 ("status", status),
-                ("duration", str(run._end_time - run._created_on)),
+                ("duration", str((run._end_time or run._created_on) - run._created_on)),
                 ("completed_lines", completed_lines),
                 ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
 
+    @staticmethod
+    def get_result(run: BatchClientRun) -> Run:
+        return cast(ProxyRun, run).run.result()
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
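The reworked ProxyClient above takes an opaque BatchClientRun handle and resolves it back to a promptflow Run via the new get_result helper. Below is a minimal, hedged sketch of how the client could be exercised in isolation; the evaluator callable and the JSONL path are hypothetical, and the import relies on ProxyClient being re-exported from the private _batch_run package as shown in the _evaluate.py hunks further down.

import os
from azure.ai.evaluation._evaluate._batch_run import ProxyClient

# Hypothetical evaluator: any callable taking row columns as keyword arguments.
def exact_match(*, response: str, ground_truth: str) -> dict:
    return {"exact_match": float(response.strip() == ground_truth.strip())}

# By default the client swaps in an async callable for flows implementing
# HasAsyncCallable; setting this to "false" keeps everything on the sync path.
os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"

client = ProxyClient()
proxy_run = client.run(
    flow=exact_match,
    data="eval_data.jsonl",        # hypothetical path; DataFrames are rejected by this client
    evaluator_name="exact_match",  # folded into the generated run name when no name is given
)
details = client.get_details(proxy_run, all_results=True)  # pandas DataFrame of per-line outputs
summary = client.get_run_summary(proxy_run)                # OrderedDict with status, duration, ...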
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -5,7 +5,7 @@ import os
 import types
 from typing import Optional, Type
 
-from
+from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP
 from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
azure/ai/evaluation/_evaluate/_eval_run.py
@@ -13,7 +13,7 @@ import uuid
 from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -404,7 +404,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             LOGGER.warning("The run results file was not found, skipping artifacts upload.")
             return
         # First we will list the files and the appropriate remote paths for them.
-        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.
+        root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_id)
         remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -6,13 +6,11 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -27,7 +25,14 @@ from .._constants import (
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    ProxyRun,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -35,8 +40,8 @@ from ._utils import (
     _write_output,
     DataLoaderFactory,
 )
+from ._batch_run.batch_clients import BatchClient
 
-TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
@@ -71,7 +76,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +127,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -152,26 +157,57 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         EvaluationMetrics.LOGOS_AND_BRANDS,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     ]
     label_cols = []
+    details_cols = []
     for col in df.columns:
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+            details_cols = col
 
     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
+
+    if details_cols:
+        details_df = df[details_cols]
+        detail_defect_rates = {}
+
+        for key, value in details_df.items():
+            _process_rows(value, detail_defect_rates)
+
+        for key, value in detail_defect_rates.items():
+            col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+            try:
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
     return label_cols, defect_rates
 
 
+def _process_rows(row, detail_defect_rates):
+    for key, value in row.items():
+        if key not in detail_defect_rates:
+            detail_defect_rates[key] = []
+        detail_defect_rates[key].append(value)
+    return detail_defect_rates
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
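To make the new "_details" aggregation concrete: each row of a *_details column holds a dict of sub-metric flags, _process_rows fans those dicts out into per-key lists, and each list is averaged into a "<details_column>.<key>_defect_rate" entry. A small self-contained sketch of that pivot-then-average pattern is below; the column and key names are illustrative, and the mean is a stand-in for list_mean_nan_safe.

import math
import pandas as pd

def process_rows(row_dict, detail_defect_rates):
    # Mirror of the helper in the hunk above: collect per-row values into per-key lists.
    for key, value in row_dict.items():
        detail_defect_rates.setdefault(key, []).append(value)
    return detail_defect_rates

# Illustrative frame: one *_details column whose cells are dicts of 0/1 flags.
details_col = "outputs.code_vulnerability.code_vulnerability_details"  # hypothetical column name
df = pd.DataFrame(
    {
        details_col: [
            {"sql_injection": 1, "path_injection": 0},
            {"sql_injection": 0, "path_injection": 0},
        ]
    }
)

detail_defect_rates: dict = {}
for row_dict in df[details_col]:
    process_rows(row_dict, detail_defect_rates)

defect_rates = {}
for key, values in detail_defect_rates.items():
    numeric = pd.to_numeric(values, errors="coerce")
    valid = numeric[~pd.isna(numeric)]
    rate = float(valid.mean()) if len(valid) else math.nan  # nan-safe mean analogue
    defect_rates[f"{details_col}.{key}_defect_rate"] = round(rate, 2)

print(defect_rates)  # sql_injection defect rate 0.5, path_injection defect rate 0.0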
@@ -303,7 +339,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -451,7 +487,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    batch_client:
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -472,22 +508,31 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
+    if not isinstance(batch_client, ProxyClient):
+        raise ValueError("Only ProxyClient supports target runs for now.")
+
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run
-
-
-
-
-
+        run = cast(
+            ProxyRun,
+            batch_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                stream=True,
+                name=_run_name,
+            ),
         )
 
     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
     run_summary = batch_client.get_run_summary(run)
 
     if run_summary["completed_lines"] == 0:
-        msg = (
-
+        msg = (
+            f"Evaluation target failed to produce any results."
+            f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+        )
         raise EvaluationException(
             message=msg,
             target=ErrorTarget.EVALUATE,
@@ -577,7 +622,6 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
@@ -728,20 +772,24 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
 
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})
 
-
+    target_run: Optional[Run] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
+        # Right now, only the ProxyClient that uses Promptflow supports a target function
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        batch_run_data = os.path.abspath(data)
+
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data,
+            target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():
@@ -755,6 +803,17 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+    elif kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
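The if/elif chain in the hunk above is what now decides which batch client executes the evaluators and what form the data takes. A rough sketch of that selection order in isolation, for reference only; the underscore-prefixed flags are private, undocumented switches taken verbatim from the hunk, and the function below is a hypothetical mirror rather than SDK code.

import os
from typing import Any, Dict, Tuple, Union

import pandas as pd

def select_batch_client(
    data: Union[str, os.PathLike],
    input_data_df: pd.DataFrame,
    has_target: bool,
    kwargs: Dict[str, Any],
) -> Tuple[str, Union[str, pd.DataFrame]]:
    """Return (client_kind, batch_run_data) following the order used in _evaluate."""
    if has_target:
        # Targets are only supported through the promptflow-backed ProxyClient.
        return "ProxyClient", os.path.abspath(data)
    if kwargs.pop("_use_run_submitter_client", False):
        # New in this release: the in-process RunSubmitterClient consumes the DataFrame directly.
        return "RunSubmitterClient", input_data_df
    if kwargs.pop("_use_pf_client", True):
        # Default path: ProxyClient, which needs an absolute file path for multiple evaluators.
        return "ProxyClient", os.path.abspath(data)
    # Fallback: CodeClient evaluates the in-memory DataFrame.
    return "CodeClient", input_data_df

# e.g. select_batch_client("data.jsonl", df, has_target=False, kwargs={})
# -> ("ProxyClient", "/abs/path/to/data.jsonl")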
@@ -770,46 +829,32 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
 
-
-
-
-
-
-
-            }
-            for evaluator_name, run in runs.items()
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
             }
-
-
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+            for evaluator_name, run in runs.items()
+        }
 
     # Concatenate all results
-    evaluators_result_df =
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -851,7 +896,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     metrics.update(evaluators_metric)
 
     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
+    target_run: Optional[Run] = None
     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = None
    if trace_destination:
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -9,11 +9,10 @@ import logging
 from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
-from
-from
-from
-from
-from promptflow.core import Prompty as prompty_core
+from azure.ai.evaluation._legacy._adapters._flows import FlexFlow as flex_flow
+from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty as prompty_sdk
+from azure.ai.evaluation._legacy._adapters._flows import Flow as dag_flow
+from azure.ai.evaluation._legacy._adapters.client import PFClient
 from typing_extensions import ParamSpec
 
 from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
@@ -66,7 +65,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
     try:
         # Cover flex flow and prompty based evaluator
-        if isinstance(evaluator, (prompty_sdk,
+        if isinstance(evaluator, (prompty_sdk, flex_flow)):
             name = evaluator.name
             pf_type = evaluator.__class__.__name__
         # Cover dag flow based evaluator
@@ -94,86 +93,3 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         "type": _get_evaluator_type(evaluator),
         "alias": evaluator_name if evaluator_name else "",
     }
-
-
-# cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
-    """Decorator to log evaluate activity
-
-    :param func: The function to be decorated
-    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
-        from promptflow._sdk._telemetry import ActivityType, log_activity
-        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
-
-        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
-        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-
-        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
-        evaluate_target = bool(kwargs.get("target", None))
-        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions: Dict[str, Union[str, bool]] = {
-            "track_in_cloud": track_in_cloud,
-            "evaluate_target": evaluate_target,
-            "evaluator_config": evaluator_config,
-        }
-
-        with log_activity(
-            get_telemetry_logger(),
-            "pf.evals.evaluate",
-            activity_type=ActivityType.PUBLICAPI,
-            user_agent=USER_AGENT,
-            custom_dimensions=custom_dimensions,
-        ):
-            result = func(*args, **kwargs)
-
-            try:
-                evaluators_info = []
-                for evaluator_name, evaluator in evaluators.items():
-                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
-                    try:
-                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
-                            like=f"outputs.{evaluator_name}", axis=1
-                        )
-
-                        failed_rows = (
-                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
-                        )
-                        total_rows = evaluator_df.shape[0]
-
-                        evaluator_info["failed_rows"] = failed_rows
-                        evaluator_info["total_rows"] = total_rows
-                    except Exception as e:  # pylint: disable=broad-exception-caught
-                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
-                    evaluators_info.append(evaluator_info)
-
-                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
-                with log_activity(
-                    get_telemetry_logger(),
-                    "pf.evals.evaluate_usage_info",
-                    activity_type=ActivityType.PUBLICAPI,
-                    user_agent=USER_AGENT,
-                    custom_dimensions=custom_dimensions,
-                ):
-                    pass
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info: %s", e)
-
-            return result
-
-    return wrapper
azure/ai/evaluation/_evaluate/_utils.py
@@ -12,7 +12,7 @@ import uuid
 import base64
 
 import pandas as pd
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (
     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
@@ -46,7 +46,7 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
-    from
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
 
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
@@ -131,7 +131,7 @@ def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
     trace_destination: Optional[str],
-    run: Run,
+    run: Optional[Run],
     evaluation_name: Optional[str],
     **kwargs,
 ) -> Optional[str]:
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -8,6 +8,7 @@ from typing_extensions import overload, override
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
 class BleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,8 @@ class BleuScoreEvaluator(EvaluatorBase):
     indicator of quality.
 
     The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
+    :param threshold: The threshold for the evaluation. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -31,17 +34,27 @@ class BleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize and call an BleuScoreEvaluator.
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_bleu_score_evaluator]
+            :end-before: [END threshold_bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
     id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self):
-
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
-        """Produce a
+        """Produce a bleu score evaluation result.
 
         :param eval_input: The input to the evaluation function.
         :type eval_input: Dict
@@ -56,9 +69,16 @@ class BleuScoreEvaluator(EvaluatorBase):
         # NIST Smoothing
         smoothing_function = SmoothingFunction().method4
         score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+        binary_result = False
+        if self._higher_is_better:
+            binary_result = score >= self._threshold
+        else:
+            binary_result = score <= self._threshold
 
         return {
             "bleu_score": score,
+            "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_threshold": self._threshold,
         }
 
     @overload  # type: ignore
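Based on the constructor and output keys in the hunks above, a hedged usage sketch of the thresholded BLEU evaluator follows. The input strings and the printed score are illustrative; the exact pass/fail label strings come from EVALUATION_PASS_FAIL_MAPPING, and BleuScoreEvaluator is assumed to be exported from the public azure.ai.evaluation namespace as in earlier releases.

from azure.ai.evaluation import BleuScoreEvaluator

# threshold is keyword-only; with the default higher-is-better setting,
# scores >= threshold are reported as a pass.
bleu = BleuScoreEvaluator(threshold=0.3)

result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)

# Alongside the existing "bleu_score", the result now carries the pass/fail label
# and the threshold it was judged against, e.g. (values illustrative):
# {"bleu_score": 0.42, "bleu_result": "pass", "bleu_threshold": 0.3}
print(result)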