azure-ai-evaluation 1.0.0b1__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -5
- azure/ai/evaluation/_common/rai_service.py +4 -4
- azure/ai/evaluation/_common/utils.py +19 -19
- azure/ai/evaluation/_constants.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +39 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +23 -13
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -18
- azure/ai/evaluation/_evaluate/_evaluate.py +35 -28
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +13 -8
- azure/ai/evaluation/_evaluate/_utils.py +29 -22
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_chat/_chat.py +16 -9
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +4 -10
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +1 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +9 -4
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +1 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +2 -2
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +2 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +5 -10
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +5 -10
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +2 -2
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +2 -2
- azure/ai/evaluation/_evaluators/_qa/_qa.py +3 -14
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +5 -10
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +3 -2
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +5 -10
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +1 -2
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +8 -6
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +16 -16
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_helpers/__init__.py +3 -2
- azure/ai/evaluation/simulator/_helpers/_experimental.py +157 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +11 -29
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +6 -6
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -3
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +18 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +9 -11
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/{simulator.py → _simulator.py} +147 -80
- azure/ai/evaluation/simulator/_tracing.py +21 -24
- azure/ai/evaluation/simulator/_utils.py +4 -1
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/METADATA +86 -14
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/RECORD +58 -56
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b1.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

@@ -25,11 +25,7 @@ from ._evaluators._relevance import RelevanceEvaluator
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import (
-    AzureAIProject,
-    AzureOpenAIModelConfiguration,
-    OpenAIModelConfiguration,
-)
+from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
 __all__ = [
     "evaluate",
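For orientation, a short sketch of what the consolidated import gives the caller at the package root. The keys shown for the model configuration are only those referenced elsewhere in this diff ("azure_endpoint", "azure_deployment", "api_version"); the full set is an assumption.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, evaluate

# The configuration types behave like plain dictionaries at runtime.
model_config: AzureOpenAIModelConfiguration = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_version": "<api-version>",
}
```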
azure/ai/evaluation/_common/rai_service.py
CHANGED

@@ -11,12 +11,12 @@ from urllib.parse import urlparse
 
 import jwt
 import numpy as np
-from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
 
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential
 
 from .constants import (
     CommonConstants,
@@ -348,7 +348,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
     )
 
     if response.status_code != 200:
-        msg =
+        msg = "Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
             internal_message=msg,
azure/ai/evaluation/_common/utils.py
CHANGED

@@ -2,20 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+import threading
+from typing import List, Optional, Union
 
-
-
-except ImportError:
-    import constants
+import nltk
+import numpy as np
 
-from
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
-import
-import numpy as np
-import nltk
+from . import constants
 
 _nltk_data_download_lock = threading.Lock()
 
@@ -46,7 +41,7 @@ def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
     with _nltk_data_download_lock:
         try:
-            from nltk.tokenize.nist import NISTTokenizer
+            from nltk.tokenize.nist import NISTTokenizer  # pylint: disable=unused-import
         except LookupError:
             nltk.download("perluniprops")
             nltk.download("punkt")
@@ -54,12 +49,19 @@ def ensure_nltk_data_downloaded():
 
 
 def nltk_tokenize(text: str) -> List[str]:
-    """Tokenize the input text using the NLTK tokenizer.
+    """Tokenize the input text using the NLTK tokenizer.
+
+    :param text: The text to tokenize
+    :type text: str
+    :return: A list of tokens
+    :rtype: list[str]
+    """
     ensure_nltk_data_downloaded()
 
     if not text.isascii():
         # Use NISTTokenizer for international tokenization
         from nltk.tokenize.nist import NISTTokenizer
+
         tokens = NISTTokenizer().international_tokenize(text)
     else:
         # By default, use NLTK word tokenizer
@@ -68,20 +70,18 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
-def
+def ensure_api_version_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     default_api_version: str,
 ) -> None:
-    if (
-        "azure_endpoint" in model_config or "azure_deployment" in model_config
-    ):
+    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
         model_config["api_version"] = model_config.get("api_version", default_api_version)
 
 
-def
+def ensure_user_agent_in_aoai_model_config(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
     prompty_model_config: dict,
     user_agent: Optional[str] = None,
 ) -> None:
     if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
-        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+        prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
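The two helpers above only act on Azure OpenAI style configurations. A minimal sketch of the intended behavior of `ensure_api_version_in_aoai_model_config` (the module is internal, so this is illustrative only, and the default value is a placeholder):

```python
from azure.ai.evaluation._common.utils import ensure_api_version_in_aoai_model_config

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    # no "api_version" supplied
}

# Mutates the config in place: "azure_endpoint" is present and "api_version" is not,
# so the supplied default is filled in.
ensure_api_version_in_aoai_model_config(model_config, default_api_version="<default-api-version>")
assert model_config["api_version"] == "<default-api-version>"
```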
azure/ai/evaluation/_constants.py
CHANGED

@@ -39,6 +39,15 @@ class Prefixes:
     TSG_OUTPUTS = "__outputs."
 
 
+class DefaultOpenEncoding:
+    """Enum that captures SDK's default values for the encoding param of open(...)"""
+
+    READ = "utf-8-sig"
+    """SDK Default Encoding when reading a file"""
+    WRITE = "utf-8"
+    """SDK Default Encoding when writing a file"""
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
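A short sketch of how the new `DefaultOpenEncoding` constants are meant to be used when files are read and written; the usage pattern is an assumption, the constant values come from the hunk above.

```python
from azure.ai.evaluation._constants import DefaultOpenEncoding

# Write plain UTF-8; read with utf-8-sig so a BOM, if present, is stripped.
with open("evaluation_results.json", "w", encoding=DefaultOpenEncoding.WRITE) as f:
    f.write("{}")

with open("evaluation_results.json", "r", encoding=DefaultOpenEncoding.READ) as f:
    results = f.read()
```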
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
CHANGED

@@ -5,13 +5,14 @@ import os
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
 from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
 )
-from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
 
 from ..._user_agent import USER_AGENT
 from .._utils import set_event_loop_policy
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
CHANGED

@@ -4,13 +4,16 @@
 import inspect
 import json
 import logging
+import os
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union
 
 import pandas as pd
-
 from promptflow.contracts.types import AttrDict
-from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
+
+from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
 
@@ -18,7 +21,9 @@ LOGGER = logging.getLogger(__name__)
 
 
 class CodeRun:
-    def __init__(
+    def __init__(
+        self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
+    ):
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
@@ -40,13 +45,13 @@ class CodeRun:
                 else None
             )
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
             aggregated_metrics = None
 
         if not isinstance(aggregated_metrics, dict):
             LOGGER.warning(
-
-
+                "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
+                self.evaluator_name,
             )
 
         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -54,11 +59,15 @@ class CodeRun:
         return aggregated_metrics
 
 
-class CodeClient:
-    def __init__(
+class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self,
+    ) -> None:
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def _calculate_metric(
+    def _calculate_metric(
+        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
+    ) -> pd.DataFrame:
         row_metric_futures = []
         row_metric_results = []
         input_df = _apply_column_mapping(input_df, column_mapping)
@@ -110,18 +119,25 @@ class CodeClient:
             return aggregated_output
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.warning(
-
+                "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
             )
             return None
 
-    def run(
+    def run(
+        self,  # pylint: disable=unused-argument
+        flow: Callable,
+        data: Union[os.PathLike, Path, pd.DataFrame],
+        evaluator_name: Optional[str] = None,
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs,
+    ) -> CodeRun:
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
             try:
                 json_data = load_jsonl(data)
             except json.JSONDecodeError as exc:
                 raise EvaluationException(
-                    message
+                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                     internal_message="Failed to parse data as JSON",
                     target=ErrorTarget.CODE_CLIENT,
                     category=ErrorCategory.INVALID_VALUE,
@@ -129,22 +145,28 @@ class CodeClient:
                 ) from exc
 
             input_df = pd.DataFrame(json_data)
-        eval_future = self._thread_pool.submit(
+        eval_future = self._thread_pool.submit(
+            self._calculate_metric,
+            evaluator=flow,
+            input_df=input_df,
+            column_mapping=column_mapping,
+            evaluator_name=evaluator_name,
+        )
         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
         run.aggregated_metrics = aggregation_future
         return run
 
-    def get_details(self, run, all_results=False):
+    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run):
+    def get_metrics(self, run: CodeRun) -> Optional[None]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug(
+            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return None
         return aggregated_metrics
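Taken together, `CodeClient.run` now accepts either a `.jsonl` path or an in-memory `pandas.DataFrame` and returns a `CodeRun` whose per-row details and aggregated metrics are fetched separately. A rough usage sketch based only on the signatures shown above (this is an internal client, and the evaluator is hypothetical):

```python
import pandas as pd
from azure.ai.evaluation._evaluate._batch_run_client import CodeClient

def exact_match(*, response, ground_truth):  # hypothetical evaluator callable
    return {"exact_match": float(response == ground_truth)}

client = CodeClient()
run = client.run(
    flow=exact_match,
    data=pd.DataFrame([{"response": "4", "ground_truth": "4"}]),
    evaluator_name="exact_match",
)
per_row = client.get_details(run)   # DataFrame of per-row results
metrics = client.get_metrics(run)   # aggregated metrics, if the evaluator defines an aggregator
```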
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
CHANGED

@@ -4,29 +4,40 @@
 import inspect
 import logging
 import os
+from concurrent.futures import Future
+from typing import Any, Callable, Dict, Optional, Union
 
 import numpy as np
-
+import pandas as pd
 from promptflow.client import PFClient
+from promptflow.entities import Run
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
 LOGGER = logging.getLogger(__name__)
 
 
 class ProxyRun:
-    def __init__(self, run, **kwargs):
+    def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
         self.run = run
 
 
-class ProxyClient:
-    def __init__(
+class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
+    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+        self, pf_client: PFClient
+    ) -> None:
         self._pf_client = pf_client
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
 
-    def run(
+    def run(
+        self,
+        flow: Union[str, os.PathLike, Callable],
+        data: Union[str, os.PathLike],
+        column_mapping: Optional[Dict[str, str]] = None,
+        **kwargs
+    ) -> ProxyRun:
         flow_to_run = flow
         if hasattr(flow, "_to_async"):
-            flow_to_run = flow._to_async()
+            flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
         batch_use_async = self._should_batch_use_async(flow_to_run)
         eval_future = self._thread_pool.submit(
@@ -39,14 +50,14 @@ class ProxyClient:
         )
         return ProxyRun(run=eval_future)
 
-    def get_details(self, proxy_run, all_results=False):
-        run = proxy_run.run.result()
+    def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
+        run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
         result_df.replace("(Failed)", np.nan, inplace=True)
         return result_df
 
-    def get_metrics(self, proxy_run):
-        run = proxy_run.run.result()
+    def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
     @staticmethod
@@ -54,8 +65,7 @@ class ProxyClient:
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
-
+            if inspect.iscoroutinefunction(flow):
                 return True
-
-            return False
+            return False
         return False
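The coroutine check above means a callable whose `__call__` is async, or an async function itself, takes the async batch path unless `PF_EVALS_BATCH_USE_ASYNC` is set to something other than `"true"`. A small sketch of the two shapes the check recognizes (the evaluator bodies are made up):

```python
import os

class AsyncEvaluator:
    async def __call__(self, *, response: str):  # detected via flow.__call__ being a coroutine function
        return {"score": 1.0}

async def async_flow(*, response: str):  # detected via the callable itself being a coroutine function
    return {"score": 1.0}

# Opting out of the async batch path; assumption: any value other than "true" disables it.
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false"
```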
azure/ai/evaluation/_evaluate/_eval_run.py
CHANGED

@@ -8,17 +8,18 @@ import logging
 import os
 import posixpath
 import time
+import types
 import uuid
-from typing import Any, Dict, Optional, Set
+from typing import Any, Dict, Optional, Set, Type
 from urllib.parse import urlparse
 
-from azure.core.pipeline.policies import RetryPolicy
-from azure.core.rest import HttpResponse
-
 from promptflow._sdk.entities import Run
+
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
 from azure.ai.evaluation._version import VERSION
-from azure.
+from azure.core.pipeline.policies import RetryPolicy
+from azure.core.rest import HttpResponse
 
 LOGGER = logging.getLogger(__name__)
 
@@ -165,7 +166,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         self._url_base = urlparse(self._tracking_uri).netloc
         if self._promptflow_run is not None:
             self.info = RunInfo(
-                self._promptflow_run.name,
+                self._promptflow_run.name,
+                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run.name,
             )
         else:
             url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
@@ -181,8 +184,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         if response.status_code != 200:
             self.info = RunInfo.generate(self._run_name)
             LOGGER.warning(
-
-                "The results will be saved locally, but will not be logged to Azure."
+                "The run failed to start: %s: %s."
+                "The results will be saved locally, but will not be logged to Azure.",
+                response.status_code,
+                response.text(),
             )
             self._status = RunStatus.BROKEN
         else:
@@ -216,7 +221,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
                 internal_message="Incorrect terminal status. Valid statuses are 'FINISHED', 'FAILED' and 'KILLED'",
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN,
             )
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/update"
         body = {
@@ -239,8 +244,21 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         self._start_run()
         return self
 
-    def __exit__(
-
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> Optional[bool]:
+        """The context manager exit call.
+
+        :param exc_type: The exception type
+        :type exc_type: Optional[Type[BaseException]]
+        :param exc_value: The exception value
+        :type exc_value: Optional[BaseException]
+        :param exc_tb: The exception traceback
+        :type exc_tb: Optional[types.TracebackType]
+        """
         self._end_run("FINISHED")
 
     def get_run_history_uri(self) -> str:
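The fully annotated `__exit__` above follows the standard context-manager protocol, and the same typing pattern applies to any `contextlib.AbstractContextManager` subclass. A generic sketch of the shape of that signature (not the SDK's class):

```python
import contextlib
import types
from typing import Optional, Type

class TimedBlock(contextlib.AbstractContextManager):
    def __enter__(self) -> "TimedBlock":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> Optional[bool]:
        # Returning None (falsy) lets any exception propagate, as EvalRun does above.
        return None
```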
@@ -280,7 +298,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         # is an optional dependency.
         from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
 
-        return ArmTokenCache().get_token(self._ml_client._credential)
+        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -326,9 +344,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         :type response: HttpResponse
         """
         LOGGER.warning(
-
-
-
+            "Unable to %s, the request failed with status code %s, response.text()=%s.",
+            failed_op,
+            response.status_code,
+            response.text(),
         )
 
     def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool:
@@ -342,7 +361,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
         :type bad_states: Set[RunStatus]
         :param should_raise: Should we raise an error if the bad state has been encountered
         :type should_raise: bool
-        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
+        :raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
+            and invalid state was encountered.
         :return: Whether or not run is in the correct state.
         :rtype: bool
         """
@@ -354,7 +374,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
                 internal_message=msg,
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN,
             )
             LOGGER.warning(msg)
             return False
@@ -446,7 +466,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-instance-attributes
             return credential.account_key
         if hasattr(credential, "sas_token"):
             return credential.sas_token
-        return self._ml_client.datastores._credential
+        return self._ml_client.datastores._credential  # pylint: disable=protected-access
 
     def log_metric(self, key: str, value: float) -> None:
         """
azure/ai/evaluation/_evaluate/_evaluate.py
CHANGED

@@ -8,27 +8,26 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
 import numpy as np
 import pandas as pd
-
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 
-from
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
     Prefixes,
     _InternalEvaluationMetrics,
 )
+from .._model_configurations import AzureAIProject
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
-from ._telemetry import log_evaluate_activity
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 # pylint: disable=line-too-long
@@ -260,12 +259,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-
-
-
-
-
-
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
 
     return initial_data_df
 
@@ -436,10 +435,10 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
+    data: str,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
@@ -448,16 +447,16 @@
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
     :keyword evaluation_name: Display name of the evaluation.
    :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
-    :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None
-    :paramtype data: Optional[str]
-    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-    :paramtype evaluators: Optional[Dict[str, Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
         keys as the column names in the evaluator input and values as the column names in the input data or data
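With `data` and `evaluators` now required keyword arguments, a minimal call looks roughly like the sketch below; the data file and the custom evaluator are placeholders, and any callable whose keyword parameters match column names in the data can serve as an evaluator.

```python
from azure.ai.evaluation import evaluate

def exact_match(*, response: str, ground_truth: str):
    """Hypothetical custom evaluator keyed by column names in the data."""
    return {"exact_match": float(response == ground_truth)}

result = evaluate(
    data="eval_data.jsonl",                   # required: path to a .jsonl file
    evaluators={"exact_match": exact_match},  # required: alias -> evaluator callable
    evaluation_name="sample-run",             # optional display name
)
print(result)
```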
@@ -572,22 +571,21 @@ def _evaluate(  # pylint: disable=too-many-locals
         user_agent=USER_AGENT,
     )
 
-    trace_destination = pf_client._config.get_trace_destination()
-
+    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
     target_run = None
-
     target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    evaluator_config = evaluator_config or {}
+    evaluator_config.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-    # Make sure, the default is always in the configuration.
-    if not evaluator_config:
-        evaluator_config = {}
-    if "default" not in evaluator_config:
-        evaluator_config["default"] = {}
-
     for evaluator_name, mapping in evaluator_config.items():
         mapped_to_values = set(mapping.values())
         for col in target_generated_columns:
@@ -604,6 +602,16 @@ def _evaluate(  # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
 
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
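The default mapping built above uses the `${data.<column>}` reference syntax, and an explicit `evaluator_config` entry overrides it per evaluator. A hedged sketch of what that looks like from the caller's side (evaluator and column names are made up):

```python
from azure.ai.evaluation import evaluate

def length_check(*, response: str):
    """Hypothetical evaluator; receives the column mapped to its "response" keyword."""
    return {"length": len(response)}

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"length_check": length_check},
    evaluator_config={
        "length_check": {
            # evaluator keyword -> reference to a column in the input data
            "response": "${data.answer}",
        },
    },
)
```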
@@ -672,7 +680,6 @@
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
|