azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100)
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_constants.py
@@ -0,0 +1,50 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+
+ class EvaluationMetrics:
+     """Metrics for model evaluation."""
+
+     GPT_GROUNDEDNESS = "gpt_groundedness"
+     GPT_RELEVANCE = "gpt_relevance"
+     GPT_COHERENCE = "gpt_coherence"
+     GPT_FLUENCY = "gpt_fluency"
+     GPT_SIMILARITY = "gpt_similarity"
+     F1_SCORE = "f1_score"
+     GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
+     HATE_FAIRNESS = "hate_fairness"
+     HATE_UNFAIRNESS = "hate_unfairness"
+     VIOLENCE = "violence"
+     SELF_HARM = "self_harm"
+     SEXUAL = "sexual"
+     PROTECTED_MATERIAL = "protected_material"
+     XPIA = "xpia"
+
+
+ class _InternalEvaluationMetrics:
+     """Evaluation metrics that are not publicly supported.
+     These metrics are experimental and subject to potential change or migration to the main
+     enum over time.
+     """
+
+     ECI = "eci"
+
+
+ class Prefixes:
+     """Column prefixes for inputs and outputs."""
+
+     INPUTS = "inputs."
+     OUTPUTS = "outputs."
+     TSG_OUTPUTS = "__outputs."
+
+
+ DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
+
+ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
+
+ PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
+ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+
+ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
+ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
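
Note: PF_BATCH_TIMEOUT_SEC and OTEL_EXPORTER_OTLP_TRACES_TIMEOUT above are environment-variable names, with the matching *_DEFAULT constants as fallbacks. A minimal sketch of how such a pair is typically consumed (read_timeout is a hypothetical helper, not the package's own get_int_env_var):

import os

# Hypothetical helper: the environment variable wins when set to an integer,
# otherwise the default from _constants.py applies.
def read_timeout(env_name: str, default: int) -> int:
    raw = os.environ.get(env_name)
    return int(raw) if raw is not None and raw.isdigit() else default

print(read_timeout("PF_BATCH_TIMEOUT_SEC", 3600))  # 3600 unless the env var is set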
azure/ai/evaluation/_evaluate/__init__.py
@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py
@@ -0,0 +1,8 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from .batch_run_context import BatchRunContext
+ from .code_client import CodeClient
+ from .proxy_client import ProxyClient
+
+ __all__ = ["CodeClient", "ProxyClient", "BatchRunContext"]
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
@@ -0,0 +1,72 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+
+ from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
+ from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+ from azure.ai.evaluation._constants import (
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
+     PF_BATCH_TIMEOUT_SEC,
+     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+ )
+ from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
+ from ..._user_agent import USER_AGENT
+ from .._utils import set_event_loop_policy
+ from .code_client import CodeClient
+ from .proxy_client import ProxyClient
+
+
+ class BatchRunContext:
+     """Context manager for batch run clients.
+
+     :param client: The client to run in the context.
+     :type client: Union[
+         ~azure.ai.evaluation._evaluate._batch_run_client.code_client.CodeClient,
+         ~azure.ai.evaluation._evaluate._batch_run_client.proxy_client.ProxyClient
+     ]
+     """
+
+     def __init__(self, client) -> None:
+         self.client = client
+         self._is_batch_timeout_set_by_system = False
+         self._is_otel_timeout_set_by_system = False
+
+     def __enter__(self):
+         if isinstance(self.client, CodeClient):
+             ClientUserAgentUtil.append_user_agent(USER_AGENT)
+             inject_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
+             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+
+         if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
+             os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
+             self._is_batch_timeout_set_by_system = True
+
+         # For dealing with the timeout issue of OpenTelemetry exporter when multiple evaluators are running
+         if os.environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) is None:
+             os.environ[OTEL_EXPORTER_OTLP_TRACES_TIMEOUT] = str(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT)
+             self._is_otel_timeout_set_by_system = True
+
+         # For addressing the issue of asyncio event loop closed on Windows
+         set_event_loop_policy()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         if isinstance(self.client, CodeClient):
+             recover_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+
+         if self._is_batch_timeout_set_by_system:
+             os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
+             self._is_batch_timeout_set_by_system = False
+
+         if self._is_otel_timeout_set_by_system:
+             os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
+             self._is_otel_timeout_set_by_system = False
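
The added BatchRunContext temporarily sets promptflow and OpenTelemetry environment variables around a batch run and restores them on exit. A hedged usage sketch, not from the package itself: the answer_length evaluator and "data.jsonl" path are placeholders, and a real installation of promptflow is assumed.

from promptflow.client import PFClient
from azure.ai.evaluation._evaluate._batch_run_client import BatchRunContext, ProxyClient

def answer_length(*, answer: str):
    # Placeholder evaluator: returns one metric per input row.
    return {"answer_length": len(answer)}

client = ProxyClient(PFClient())
with BatchRunContext(client):
    # "data.jsonl" stands in for a JSON-lines dataset containing an "answer" column.
    run = client.run(flow=answer_length, data="data.jsonl")
    print(client.get_details(run))
# On exit, any variables set by __enter__ (flow entry, batch/OTLP timeouts) are removed again.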
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
@@ -0,0 +1,150 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import json
+ import logging
+
+ import pandas as pd
+
+ from promptflow.contracts.types import AttrDict
+ from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+ from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class CodeRun:
+     def __init__(self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs):
+         self.run = run
+         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
+         self.input_data = input_data
+         self.aggregated_metrics = aggregated_metrics
+
+     def get_result_df(self, exclude_inputs=False):
+         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+         result_df = self.run.result(timeout=batch_run_timeout)
+         if exclude_inputs:
+             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
+         return result_df
+
+     def get_aggregated_metrics(self):
+         try:
+             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+             aggregated_metrics = (
+                 self.aggregated_metrics.result(timeout=batch_run_timeout)
+                 if self.aggregated_metrics is not None
+                 else None
+             )
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}")
+             aggregated_metrics = None
+
+         if not isinstance(aggregated_metrics, dict):
+             LOGGER.warning(
+                 f"Aggregated metrics for evaluator {self.evaluator_name}"
+                 f" is not a dictionary will not be logged as metrics"
+             )
+
+         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
+
+         return aggregated_metrics
+
+
+ class CodeClient:
+     def __init__(self):
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def _calculate_metric(self, evaluator, input_df, column_mapping, evaluator_name):
+         row_metric_futures = []
+         row_metric_results = []
+         input_df = _apply_column_mapping(input_df, column_mapping)
+         # Ignoring args and kwargs from the signature since they are usually catching extra arguments
+         parameters = {
+             param.name
+             for param in inspect.signature(evaluator).parameters.values()
+             if param.name not in ["args", "kwargs"]
+         }
+         for value in input_df.to_dict("records"):
+             # Filter out only the parameters that are present in the input data
+             # if no parameters then pass data as is
+             filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
+             row_metric_futures.append(self._thread_pool.submit(evaluator, **filtered_values))
+
+         for row_number, row_metric_future in enumerate(row_metric_futures):
+             try:
+                 result = row_metric_future.result()
+                 if not isinstance(result, dict):
+                     result = {"output": result}
+                 row_metric_results.append(result)
+             except Exception as ex:  # pylint: disable=broad-except
+                 msg_1 = f"Error calculating value for row {row_number} for metric {evaluator_name}, "
+                 msg_2 = f"failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}"
+                 LOGGER.info(msg_1 + msg_2)
+                 # If a row fails to calculate, add an empty dict to maintain the row index
+                 # This is to ensure the output dataframe has the same number of rows as the input dataframe
+                 # pd concat will fill NaN for missing values
+                 row_metric_results.append({})
+
+         return pd.concat(
+             [input_df.add_prefix("inputs."), pd.DataFrame(row_metric_results)],
+             axis=1,
+             verify_integrity=True,
+         )
+
+     def _calculate_aggregations(self, evaluator, run):
+         try:
+             if _has_aggregator(evaluator):
+                 aggregate_input = None
+                 evaluator_output = run.get_result_df(exclude_inputs=True)
+                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
+                     aggregate_input = evaluator_output["output"].tolist()
+                 else:
+                     aggregate_input = [AttrDict(item) for item in evaluator_output.to_dict("records")]
+
+                 aggr_func = getattr(evaluator, "__aggregate__")
+                 aggregated_output = aggr_func(aggregate_input)
+                 return aggregated_output
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.warning(
+                 f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}"
+             )
+         return None
+
+     def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs):
+         input_df = data
+         if not isinstance(input_df, pd.DataFrame):
+             try:
+                 json_data = load_jsonl(data)
+             except json.JSONDecodeError as exc:
+                 raise EvaluationException(
+                     message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+                     internal_message="Failed to parse data as JSON",
+                     target=ErrorTarget.CODE_CLIENT,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 ) from exc
+
+             input_df = pd.DataFrame(json_data)
+         eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name)
+         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
+         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
+         run.aggregated_metrics = aggregation_future
+         return run
+
+     def get_details(self, run, all_results=False):
+         result_df = run.get_result_df(exclude_inputs=not all_results)
+         return result_df
+
+     def get_metrics(self, run):
+         try:
+             aggregated_metrics = run.get_aggregated_metrics()
+             print("Aggregated metrics")
+             print(aggregated_metrics)
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}")
+             return None
+         return aggregated_metrics
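
CodeClient runs a plain Python evaluator in-process: each input row is turned into keyword arguments filtered to the evaluator's signature, and an optional __aggregate__ attribute yields run-level metrics. A hedged sketch, not part of the package: ExactMatchEvaluator and the sample rows are illustrative, and the default column_mapping of None is assumed to be tolerated by _apply_column_mapping.

import pandas as pd
from azure.ai.evaluation._evaluate._batch_run_client import CodeClient

class ExactMatchEvaluator:
    def __call__(self, *, response: str, ground_truth: str):
        # Per-row result: one metric per input record.
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results):
        # line_results are the per-row outputs above, wrapped as AttrDict records.
        rate = sum(r.exact_match for r in line_results) / len(line_results)
        return {"exact_match_rate": rate}

data = pd.DataFrame(
    [
        {"response": "Paris", "ground_truth": "Paris"},
        {"response": "Lyon", "ground_truth": "Paris"},
    ]
)
client = CodeClient()
run = client.run(flow=ExactMatchEvaluator(), data=data, evaluator_name="exact_match")
print(client.get_details(run))  # per-row "exact_match" results (inputs excluded by default)
print(client.get_metrics(run))  # expected: {"exact_match_rate": 0.5}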
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
@@ -0,0 +1,61 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import logging
+ import os
+
+ import numpy as np
+
+ from promptflow.client import PFClient
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ProxyRun:
+     def __init__(self, run, **kwargs):
+         self.run = run
+
+
+ class ProxyClient:
+     def __init__(self, pf_client: PFClient):
+         self._pf_client = pf_client
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def run(self, flow, data, column_mapping=None, **kwargs):
+         flow_to_run = flow
+         if hasattr(flow, "_to_async"):
+             flow_to_run = flow._to_async()
+
+         batch_use_async = self._should_batch_use_async(flow_to_run)
+         eval_future = self._thread_pool.submit(
+             self._pf_client.run,
+             flow_to_run,
+             data=data,
+             column_mapping=column_mapping,
+             batch_use_async=batch_use_async,
+             **kwargs
+         )
+         return ProxyRun(run=eval_future)
+
+     def get_details(self, proxy_run, all_results=False):
+         run = proxy_run.run.result()
+         result_df = self._pf_client.get_details(run, all_results=all_results)
+         result_df.replace("(Failed)", np.nan, inplace=True)
+         return result_df
+
+     def get_metrics(self, proxy_run):
+         run = proxy_run.run.result()
+         return self._pf_client.get_metrics(run)
+
+     @staticmethod
+     def _should_batch_use_async(flow):
+         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+                 return True
+             elif inspect.iscoroutinefunction(flow):
+                 return True
+             else:
+                 return False
+         return False
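
ProxyClient._should_batch_use_async above only opts into async batch execution when the flow (or its __call__) is a coroutine function and PF_EVALS_BATCH_USE_ASYNC is not set to "false". A small illustration of that check, not part of the package; the two evaluator functions are placeholders.

import asyncio
import os

from azure.ai.evaluation._evaluate._batch_run_client import ProxyClient

def sync_eval(*, query: str):
    return {"length": len(query)}

async def async_eval(*, query: str):
    await asyncio.sleep(0)
    return {"length": len(query)}

print(ProxyClient._should_batch_use_async(sync_eval))   # False: plain function
print(ProxyClient._should_batch_use_async(async_eval))  # True: coroutine function

# Setting the env var to "false" disables async dispatch even for coroutine functions.
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false"
print(ProxyClient._should_batch_use_async(async_eval))  # False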