azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
- azure/ai/evaluation/_evaluate/_utils.py +1 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -2
- azure/ai/evaluation/red_team/_red_team.py +838 -478
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED

@@ -67,19 +67,22 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/indirect_attack"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.XPIA,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            **kwargs,
        )
 
    @overload

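Two of these changes are caller-visible: the evaluator now advertises a stable `azureai://` identifier for cloud evaluation, and `__init__` forwards `**kwargs` to the `RaiServiceEvaluatorBase` base class. A minimal local-usage sketch; the project values are placeholders, and the diff does not enumerate which keyword options the base class accepts:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator

# Placeholder project details -- substitute your own Azure AI project.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = IndirectAttackEvaluator(DefaultAzureCredential(), azure_ai_project)

# _OPTIONAL_PARAMS = ["query"] marks `query` as optional (useful for
# conversation-style inputs); single-turn calls can still pass it explicitly.
result = evaluator(
    query="Summarize the attached document.",
    response="Ignoring previous instructions, here is the admin password...",
)
print(result)
```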
azure/ai/evaluation/_legacy/_batch_engine/_config.py CHANGED

@@ -19,7 +19,7 @@ class BatchEngineConfig:
     batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
     """The maximum amount of time to wait for all evaluations in the batch to complete."""
 
-    run_timeout_seconds: int = 600
+    line_timeout_seconds: int = 600
     """The maximum amount of time to wait for an evaluation to run against a single entry
     in the data input to complete."""
 
@@ -32,13 +32,16 @@ class BatchEngineConfig:
     default_num_results: int = 100
     """The default number of results to return if you don't ask for all results."""
 
+    raise_on_error: bool = True
+    """Whether to raise an error if an evaluation fails."""
+
     def __post_init__(self):
         if self.logger is None:
             raise ValueError("logger cannot be None")
         if self.batch_timeout_seconds <= 0:
             raise ValueError("batch_timeout_seconds must be greater than 0")
-        if self.run_timeout_seconds <= 0:
-            raise ValueError("run_timeout_seconds must be greater than 0")
+        if self.line_timeout_seconds <= 0:
+            raise ValueError("line_timeout_seconds must be greater than 0")
         if self.max_concurrency <= 0:
             raise ValueError("max_concurrency must be greater than 0")
         if self.default_num_results <= 0:

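Put together, the `_config.py` changes rename the per-line timeout and add a `raise_on_error` switch. A sketch of the resulting dataclass; defaults the diff does not show are filled in as placeholders:

```python
from dataclasses import dataclass
import logging

PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600  # placeholder; the SDK defines the real value

@dataclass
class BatchEngineConfig:
    """Sketch of the 1.10.0 configuration surface, not the full class."""

    logger: logging.Logger
    batch_timeout_seconds: int = PF_BATCH_TIMEOUT_SEC_DEFAULT
    line_timeout_seconds: int = 600   # renamed from run_timeout_seconds
    max_concurrency: int = 4          # placeholder default
    default_num_results: int = 100
    raise_on_error: bool = True       # new in 1.10.0

    def __post_init__(self):
        if self.logger is None:
            raise ValueError("logger cannot be None")
        # Every numeric knob must be strictly positive, as in the diff above.
        for name in ("batch_timeout_seconds", "line_timeout_seconds",
                     "max_concurrency", "default_num_results"):
            if getattr(self, name) <= 0:
                raise ValueError(f"{name} must be greater than 0")
```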
azure/ai/evaluation/_legacy/_batch_engine/_engine.py CHANGED

@@ -20,15 +20,31 @@ from concurrent.futures import Executor
 from functools import partial
 from contextlib import contextmanager
 from datetime import datetime, timezone
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Final,
+    Generator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    cast,
+    Literal,
+)
 from uuid import uuid4
 
+from ._config import BatchEngineConfig
 from ._utils import DEFAULTS_KEY, get_int_env_var, get_value_from_path, is_async_callable
 from ._status import BatchStatus
 from ._result import BatchResult, BatchRunDetails, BatchRunError, TokenMetrics
 from ._run_storage import AbstractRunStorage, NoOpRunStorage
-from .._common._logging import log_progress, NodeLogManager
-from ..._exceptions import ErrorBlame
+from .._common._logging import log_progress, logger, NodeLogManager
+from ..._exceptions import ErrorBlame, EvaluationException
 from ._exceptions import (
     BatchEngineCanceledError,
     BatchEngineError,

@@ -54,30 +70,25 @@ class BatchEngine:
         self,
         func: Callable,
         *,
+        config: BatchEngineConfig,
         storage: Optional[AbstractRunStorage] = None,
-        batch_timeout_sec: Optional[int] = None,
-        line_timeout_sec: Optional[int] = None,
-        max_worker_count: Optional[int] = None,
         executor: Optional[Executor] = None,
     ):
         """Create a new batch engine instance
 
         :param Callable func: The function to run the flow
+        :param BatchEngineConfig config: The configuration for the batch engine
         :param Optional[AbstractRunStorage] storage: The storage to store execution results
-        :param Optional[int] batch_timeout_sec: The timeout of batch run in seconds
-        :param Optional[int] line_timeout_sec: The timeout of each line in seconds
-        :param Optional[int] max_worker_count: The concurrency limit of batch run
         :param Optional[Executor] executor: The executor to run the flow (if needed)
         """
 
         self._func: Callable = func
+        self._config: BatchEngineConfig = config
         self._storage: AbstractRunStorage = storage or NoOpRunStorage()
 
-
-
-        self.
-        self._line_timeout_sec = line_timeout_sec or get_int_env_var("PF_LINE_TIMEOUT_SEC", 600)
-        self._max_worker_count = max_worker_count or get_int_env_var("PF_WORKER_COUNT") or MAX_WORKER_COUNT
+        self._batch_timeout_sec = self._config.batch_timeout_seconds
+        self._line_timeout_sec = self._config.line_timeout_seconds
+        self._max_worker_count = self._config.max_concurrency
 
         self._executor: Optional[Executor] = executor
         self._is_canceled: bool = False

@@ -85,15 +96,13 @@ class BatchEngine:
     async def run(
         self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         *,
         id: Optional[str] = None,
         max_lines: Optional[int] = None,
     ) -> BatchResult:
         if not data:
             raise BatchEngineValidationError("Please provide a non-empty data mapping.")
-        if not column_mapping:
-            raise BatchEngineValidationError("The column mapping is required.")
 
         start_time = datetime.now(timezone.utc)
 
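The net effect of the constructor and `run()` changes: all tuning now flows through a single `BatchEngineConfig`, and `column_mapping` may be omitted entirely. A hedged call-site sketch; these are internal (`_legacy`) module paths taken from the file list above, the config is assumed to need only `logger`, and the evaluator is illustrative:

```python
import asyncio
import logging

from azure.ai.evaluation._legacy._batch_engine._config import BatchEngineConfig
from azure.ai.evaluation._legacy._batch_engine._engine import BatchEngine

def my_evaluator(query: str, response: str) -> dict:
    # Illustrative evaluator; any callable works.
    return {"length": len(response)}

async def main():
    config = BatchEngineConfig(logger=logging.getLogger("batch"))
    engine = BatchEngine(my_evaluator, config=config)  # config is now required
    # column_mapping=None no longer raises; a mapping is derived from
    # my_evaluator's signature ({"query": "${data.query}", ...}).
    result = await engine.run(
        data=[{"query": "q1", "response": "r1"}],
        column_mapping=None,
    )
    print(result.status)

asyncio.run(main())
```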
@@ -105,6 +114,8 @@ class BatchEngine:
             id = id or str(uuid4())
             result: BatchResult = await self._exec_in_task(id, batch_inputs, start_time)
             return result
+        except EvaluationException:
+            raise
         except Exception as ex:
             raise BatchEngineError(
                 "Unexpected error while running the batch run.", blame=ErrorBlame.SYSTEM_ERROR

@@ -114,20 +125,58 @@ class BatchEngine:
         # TODO ralphe: Make sure this works
         self._is_canceled = True
 
-    @staticmethod
     def _apply_column_mapping(
+        self,
         data: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]],
         max_lines: Optional[int],
     ) -> Sequence[Mapping[str, str]]:
+
+        resolved_column_mapping: Mapping[str, str] = self._resolve_column_mapping(column_mapping)
+        resolved_column_mapping.update(self._generate_defaults_for_column_mapping())
+        return self._apply_column_mapping_to_lines(data, resolved_column_mapping, max_lines)
+
+    def _resolve_column_mapping(
+        self,
+        column_mapping: Optional[Mapping[str, str]],
+    ) -> Mapping[str, str]:
+        parameters = inspect.signature(self._func).parameters
+        default_column_mapping: Dict[str, str] = {
+            name: f"${{data.{name}}}"
+            for name, value in parameters.items()
+            if name not in ["self", "cls", "args", "kwargs"]
+        }
+        resolved_mapping: Dict[str, str] = default_column_mapping.copy()
+
+        for name, value in parameters.items():
+            if value and value.default is not inspect.Parameter.empty:
+                resolved_mapping.pop(name)
+
+        resolved_mapping.update(column_mapping or {})
+        return resolved_mapping
+
+    def _generate_defaults_for_column_mapping(self) -> Mapping[Literal["$defaults$"], Any]:
+
+        return {
+            DEFAULTS_KEY: {
+                name: value.default
+                for name, value in inspect.signature(self._func).parameters.items()
+                if value.default is not inspect.Parameter.empty
+            }
+        }
+
+    @staticmethod
+    def _apply_column_mapping_to_lines(
+        data: Sequence[Mapping[str, Any]],
+        column_mapping: Mapping[str, str],
+        max_lines: Optional[int],
+    ) -> Sequence[Mapping[str, Any]]:
         data = data[:max_lines] if max_lines else data
 
         inputs: Sequence[Mapping[str, Any]] = []
-        line: int = 0
         defaults = cast(Mapping[str, Any], column_mapping.get(DEFAULTS_KEY, {}))
 
-        for input in data:
-            line += 1
+        for line_number, input in enumerate(data, start=1):
             mapped: Dict[str, Any] = {}
             missing_inputs: Set[str] = set()
 
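The key behavioral change in `_resolve_column_mapping`: parameters with defaults are no longer force-mapped to `${data.<name>}`, so rows missing an optional column fall back to the function's own default instead of failing the line. A standalone re-implementation of the same logic, for illustration only:

```python
import inspect

DEFAULTS_KEY = "$defaults$"  # same sentinel the engine uses

def resolve_column_mapping(func, column_mapping=None):
    parameters = inspect.signature(func).parameters
    resolved = {
        name: f"${{data.{name}}}"
        for name in parameters
        if name not in ("self", "cls", "args", "kwargs")
    }
    # Drop parameters that carry defaults; their values come from the
    # DEFAULTS_KEY bucket generated alongside this mapping.
    for name, param in parameters.items():
        if param.default is not inspect.Parameter.empty:
            resolved.pop(name, None)
    resolved.update(column_mapping or {})
    return resolved

def evaluator(query, response, threshold=3):
    ...

print(resolve_column_mapping(evaluator))
# {'query': '${data.query}', 'response': '${data.response}'}
# `threshold` is absent: rows without it use threshold=3 via the defaults bucket.
```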
@@ -148,18 +197,18 @@ class BatchEngine:
                 continue
 
             dict_path = match.group(1)
-            found,
+            found, mapped_value = get_value_from_path(dict_path, input)
             if not found:  # try default value
-                found,
+                found, mapped_value = get_value_from_path(dict_path, defaults)
 
             if found:
-                mapped[key] =
+                mapped[key] = mapped_value
             else:
                 missing_inputs.add(dict_path)
 
         if missing_inputs:
             missing = ", ".join(missing_inputs)
-            raise BatchEngineValidationError(f"Missing inputs for line {
+            raise BatchEngineValidationError(f"Missing inputs for line {line_number}: '{missing}'")
 
         inputs.append(mapped)
 
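The `found, mapped_value = get_value_from_path(...)` pair implements a two-stage lookup: first the input row, then the defaults bucket. A simplified stand-in for the SDK helper (not its actual implementation) showing the behavior:

```python
from typing import Any, Mapping, Tuple

def get_value_from_path(path: str, data: Mapping[str, Any]) -> Tuple[bool, Any]:
    """Simplified stand-in for the engine's dotted-path lookup helper."""
    current: Any = data
    for part in path.split("."):
        if not isinstance(current, Mapping) or part not in current:
            return False, None
        current = current[part]
    return True, current

row = {"data": {"query": "hello"}}
defaults = {"data": {"threshold": 3}}

found, value = get_value_from_path("data.threshold", row)
if not found:  # fall back to the defaults bucket, exactly as the new code does
    found, value = get_value_from_path("data.threshold", defaults)

assert (found, value) == (True, 3)
```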
@@ -212,10 +261,12 @@ class BatchEngine:
                     end_time=None,
                     tokens=TokenMetrics(0, 0, 0),
                     error=BatchRunError("The line run is not completed.", None),
+                    index=i,
                 )
             )
             for i in range(len(batch_inputs))
         ]
+        self.handle_line_failures(result_details)
 
         for line_result in result_details:
             # Indicate the worst status of the batch run. This works because

@@ -229,9 +280,15 @@ class BatchEngine:
             metrics.total_tokens += line_result.tokens.total_tokens
 
         if failed_lines and not error:
-
-
+            error_message = f"{floor(failed_lines / len(batch_inputs) * 100)}% of the batch run failed."
+            first_exception: Optional[Exception] = next(
+                (result.error.exception for result in result_details if result.error and result.error.exception),
+                None,
             )
+            if first_exception is not None:
+                error_message += f" {first_exception}"
+
+            error = BatchEngineRunFailedError(error_message)
 
         return BatchResult(
             status=status,
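The new failure summary rounds the failure rate down and appends the first captured exception, so a partially failed run produces a message like the one below:

```python
from math import floor

failed_lines, total_lines = 3, 8
first_exception = ValueError("bad row")  # illustrative captured exception

error_message = f"{floor(failed_lines / total_lines * 100)}% of the batch run failed."
if first_exception is not None:
    error_message += f" {first_exception}"

print(error_message)  # "37% of the batch run failed. bad row"
```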
@@ -283,6 +340,13 @@ class BatchEngine:
             # TODO ralphe: set logger to use here
         )
 
+    def __preprocess_inputs(self, inputs: Mapping[str, Any]) -> Mapping[str, Any]:
+
+        func_params = inspect.signature(self._func).parameters
+
+        filtered_params = {key: value for key, value in inputs.items() if key in func_params}
+        return filtered_params
+
     async def _exec_line_async(
         self,
         run_id: str,
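`__preprocess_inputs` exists because mapped rows can carry columns the target callable never declared; filtering them avoids a `TypeError: unexpected keyword argument` at call time. The same idea in isolation:

```python
import inspect

def preprocess_inputs(func, inputs):
    """Keep only the keys the callable actually declares as parameters."""
    func_params = inspect.signature(func).parameters
    return {key: value for key, value in inputs.items() if key in func_params}

def evaluator(query, response):
    return {"length": len(response)}

row = {"query": "q", "response": "r", "ground_truth": "not a parameter"}
print(preprocess_inputs(evaluator, row))               # {'query': 'q', 'response': 'r'}
print(evaluator(**preprocess_inputs(evaluator, row)))  # {'length': 1}
```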
@@ -298,6 +362,7 @@ class BatchEngine:
             end_time=None,
             tokens=TokenMetrics(0, 0, 0),
             error=None,
+            index=index,
         )
 
         try:
@@ -313,13 +378,15 @@ class BatchEngine:
             # For now we will just run the function in the current process, but in the future we may
             # want to consider running the function in a separate process for isolation reasons.
             output: Any
+
+            processed_inputs = self.__preprocess_inputs(inputs)
             if is_async_callable(self._func):
-                output = await self._func(**inputs)
+                output = await self._func(**processed_inputs)
             else:
                 # to maximize the parallelism, we run the synchronous function in a separate thread
                 # and await its result
                 output = await asyncio.get_event_loop().run_in_executor(
-                    self._executor, partial(self._func, **inputs)
+                    self._executor, partial(self._func, **processed_inputs)
                 )
 
             # This should in theory never happen but as an extra precaution, let's check if the output
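The dispatch around `processed_inputs` awaits async callables directly and pushes synchronous ones onto an executor thread so other lines keep running. A self-contained sketch of the same pattern, using `inspect.iscoroutinefunction` in place of the SDK's `is_async_callable`:

```python
import asyncio
import inspect
from functools import partial

async def call_one(func, /, **inputs):
    if inspect.iscoroutinefunction(func):
        return await func(**inputs)
    # Run blocking callables in the default executor so the event loop
    # can interleave other lines meanwhile.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(func, **inputs))

def slow_eval(query):          # synchronous evaluator
    return {"echo": query}

async def fast_eval(query):    # asynchronous evaluator
    return {"echo": query}

async def main():
    print(await call_one(slow_eval, query="a"))
    print(await call_one(fast_eval, query="b"))

asyncio.run(main())
```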
@@ -340,6 +407,24 @@ class BatchEngine:
 
         return index, details
 
+    @staticmethod
+    def handle_line_failures(run_infos: List[BatchRunDetails], raise_on_line_failure: bool = False):
+        """Handle line failures in batch run"""
+        failed_run_infos: List[BatchRunDetails] = [r for r in run_infos if r.status == BatchStatus.Failed]
+        failed_msg: Optional[str] = None
+        if len(failed_run_infos) > 0:
+            failed_indexes = ",".join([str(r.index) for r in failed_run_infos])
+            first_fail_exception: str = failed_run_infos[0].error.details
+            if raise_on_line_failure:
+                failed_msg = "Flow run failed due to the error: " + first_fail_exception
+                raise Exception(failed_msg)
+
+            failed_msg = (
+                f"{len(failed_run_infos)}/{len(run_infos)} flow run failed, indexes: [{failed_indexes}],"
+                f" exception of index {failed_run_infos[0].index}: {first_fail_exception}"
+            )
+            logger.error(failed_msg)
+
     def _persist_run_info(self, line_results: Sequence[BatchRunDetails]):
         # TODO ralphe: implement?
         pass
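`handle_line_failures` is log-only by default; with `raise_on_line_failure=True` the first failed line aborts the run. A toy illustration of the two modes using a stand-in result type (not the SDK's `BatchRunDetails`):

```python
import logging
from dataclasses import dataclass
from typing import Optional

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("batch")

@dataclass
class LineResult:  # stand-in for BatchRunDetails
    index: int
    failed: bool
    details: Optional[str] = None

def handle_line_failures(run_infos, raise_on_line_failure=False):
    failed = [r for r in run_infos if r.failed]
    if failed:
        first = failed[0].details or "unknown error"
        if raise_on_line_failure:
            raise Exception("Flow run failed due to the error: " + first)
        logger.error(
            f"{len(failed)}/{len(run_infos)} flow run failed, "
            f"indexes: [{','.join(str(r.index) for r in failed)}], "
            f"exception of index {failed[0].index}: {first}"
        )

rows = [LineResult(0, False), LineResult(1, True, "boom"), LineResult(2, True, "bust")]
handle_line_failures(rows)  # logs "2/3 flow run failed, indexes: [1,2], ..."
```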
azure/ai/evaluation/_legacy/_batch_engine/_result.py CHANGED

@@ -55,6 +55,8 @@ class BatchRunDetails:
     """The token metrics of the line run."""
     error: Optional[BatchRunError]
     """The error of the line run. This will only be set if the status is Failed."""
+    index: int
+    """The line run index."""
 
     @property
     def duration(self) -> timedelta:
azure/ai/evaluation/_legacy/_batch_engine/_run.py CHANGED

@@ -58,7 +58,7 @@ class Run:
         dynamic_callable: Callable,
         name_prefix: Optional[str],
         inputs: Sequence[Mapping[str, Any]],
-        column_mapping: Mapping[str, str],
+        column_mapping: Optional[Mapping[str, str]] = None,
         created_on: Optional[datetime] = None,
         run: Optional["Run"] = None,
     ):

@@ -70,7 +70,7 @@ class Run:
         self.dynamic_callable = dynamic_callable
         self.name = self._generate_run_name(name_prefix, self._created_on)
         self.inputs = inputs
-        self.column_mapping = column_mapping
+        self.column_mapping: Optional[Mapping[str, str]] = column_mapping
         self.result: Optional[BatchResult] = None
         self.metrics: Mapping[str, Any] = {}
         self._run = run

azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py CHANGED

@@ -5,6 +5,7 @@
 import dataclasses
 import inspect
 import sys
+import traceback
 
 from concurrent.futures import Executor
 from datetime import datetime, timezone
@@ -46,11 +47,6 @@ class RunSubmitter:
         **kwargs,
     ) -> Run:
 
-        # if the column mappings are not provided, generate them based on the arguments to the
-        # flow function.
-        if column_mapping is None:
-            column_mapping = self._generate_column_mapping(dynamic_callable)
-
         # The old code always spun up two threads here using a ThreadPoolExecutor:
         # 1. One thread essentially did nothing of value (since tracing was disabled, and we
         #    don't care about checking for the latest PromptFlow version number now)
@@ -84,7 +80,7 @@ class RunSubmitter:
         # unnecessary Flow loading code was removed here. Instead do direct calls to _submit_bulk_run
         await self._submit_bulk_run(run=run, local_storage=local_storage, **kwargs)
 
-        self.stream_run(run=run, storage=local_storage, raise_on_error=True)
+        self.stream_run(run=run, storage=local_storage, raise_on_error=self._config.raise_on_error)
         return run
 
     async def _submit_bulk_run(self, run: Run, local_storage: AbstractRunStorage, **kwargs) -> None:
@@ -125,10 +121,8 @@ class RunSubmitter:
         try:
             batch_engine = BatchEngine(
                 run.dynamic_callable,
+                config=self._config,
                 storage=local_storage,
-                batch_timeout_sec=self._config.batch_timeout_seconds,
-                line_timeout_sec=self._config.run_timeout_seconds,
-                max_worker_count=self._config.max_concurrency,
                 executor=self._executor,
             )
 
@@ -160,10 +154,10 @@ class RunSubmitter:
         # system metrics
         system_metrics = {}
         if batch_result:
-            system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
+            # system_metrics.update(dataclasses.asdict(batch_result.tokens))  # token related
             system_metrics.update(
                 {
-                    "duration": batch_result.duration.total_seconds(),
+                    # "duration": batch_result.duration.total_seconds(),
                     # "__pf__.lines.completed": batch_result.total_lines - batch_result.failed_lines,
                     # "__pf__.lines.failed": batch_result.failed_lines,
                 }
@@ -173,31 +167,16 @@ class RunSubmitter:
         run.metrics = system_metrics
         run.result = batch_result
 
-    @staticmethod
-    def _generate_column_mapping(function: Callable) -> Mapping[str, Any]:
-        args = inspect.signature(function).parameters
-        default_values: Dict[str, Any] = {}
-        mapping: Dict[str, Any] = {}
-        for key, value in args.items():
-            if key in ["self", "cls"] or value.kind in [value.VAR_POSITIONAL, value.VAR_KEYWORD]:
-                continue
-
-            mapping[key] = f"${{data.{key}}}"
-            if value.default != inspect.Parameter.empty:
-                default_values[key] = value.default
-
-        return {
-            **mapping,
-            DEFAULTS_KEY: default_values,
-        }
-
     @staticmethod
     def _validate_inputs(run: Run):
         if not run.inputs and not run.previous_run:
             raise BatchEngineValidationError("Either data, or a previous run must be specified for the evaluation run.")
 
     @staticmethod
-    def _validate_column_mapping(column_mapping: Mapping[str, str]):
+    def _validate_column_mapping(column_mapping: Optional[Mapping[str, str]]):
+        if not column_mapping:
+            return
+
         if not isinstance(column_mapping, Mapping):
             raise BatchEngineValidationError(f"Column mapping must be a dict, got {type(column_mapping)}.")
 
@@ -221,6 +200,7 @@ class RunSubmitter:
             return
 
         file_handler = sys.stdout
+        error_message: Optional[str] = None
         try:
             printed = 0
             available_logs = storage.logger.get_logs()
@@ -232,7 +212,24 @@ class RunSubmitter:
 
         if run.status == RunStatus.FAILED or run.status == RunStatus.CANCELED:
             if run.status == RunStatus.FAILED:
-
+                # Get the first error message from the results, or use a default one
+                if run.result and run.result.error:
+                    error_message = "".join(
+                        traceback.format_exception(
+                            type(run.result.error), run.result.error, run.result.error.__traceback__
+                        )
+                    )
+                elif run.result and run.result.details:
+                    err = next((r.error for r in run.result.details if r.error), None)
+                    if err and err.exception:
+                        error_message = "".join(
+                            traceback.format_exception(type(err.exception), err.exception, err.exception.__traceback__)
+                        )
+                    elif err and err.details:
+                        error_message = err.details
+
+                if not error_message:
+                    error_message = "Run fails with unknown error."
             else:
                 error_message = "Run is canceled."
             if raise_on_error:
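The `stream_run` rewrite builds the failure message from whichever error it can find: the run-level error first, then the first per-line error. The three-argument `traceback.format_exception` form it uses works on all supported Python versions:

```python
import traceback

try:
    {}["missing"]  # provoke an exception to capture
except KeyError as exc:
    captured = exc

# Same call shape as the new stream_run code path:
error_message = "".join(
    traceback.format_exception(type(captured), captured, captured.__traceback__)
)
print(error_message)  # full "Traceback (most recent call last): ..." text
```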
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED

@@ -290,6 +290,7 @@ class _SafetyEvaluation:
                 target=callback,
                 text=source_text if source_text else "",
                 concurrent_async_tasks=concurrent_async_tasks,
+                randomization_seed=randomization_seed,
             )
 
         ## Run AdversarialSimulator
@@ -902,6 +903,7 @@ class _SafetyEvaluation:
                 evaluation_name=evaluation_name,
                 output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
                 _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
             )
             evaluation_results[strategy] = evaluate_outputs
         return evaluation_results

azure/ai/evaluation/_version.py CHANGED
azure/ai/evaluation/red_team/__init__.py CHANGED

@@ -8,8 +8,8 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-
-    "
+    raise ImportError(
+        "Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
     )
 
 
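With this change, importing the red-team subpackage without the optional Pyrit dependency fails immediately with an actionable message rather than leaving a half-initialized module. A quick way to observe the behavior, assuming the extra is not installed:

```python
try:
    import azure.ai.evaluation.red_team  # noqa: F401
except ImportError as exc:
    print(exc)
    # Could not import Pyrit. Please install the dependency with
    # `pip install azure-ai-evaluation[redteam]`.
```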