azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +9 -5
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +201 -16
- azure/ai/evaluation/_constants.py +12 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
- azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
- azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
- azure/ai/evaluation/_exceptions.py +9 -7
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +37 -9
- azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +127 -117
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -4,35 +4,48 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage

+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )

+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+

 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.

@@ -73,7 +86,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )

@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates


-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.

@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """

@@ -158,6 +171,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
     ]

     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+    if missing_inputs and "conversation" in required_inputs:
+        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+        if len(missing_inputs) == len(non_conversation_inputs) and [
+            input in non_conversation_inputs for input in missing_inputs
+        ]:
+            missing_inputs = []
     if missing_inputs:
         if not is_target_fn:
             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."

@@ -271,9 +290,9 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str,
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
-
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.

@@ -281,11 +300,11 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param
-    :type
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:

@@ -306,7 +325,7 @@ def _validate_columns(
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config =
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)

             # Validate input data for evaluator

@@ -320,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -342,15 +361,15 @@ def _apply_target_to_data(
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
     # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
         flow=target,
         display_name=evaluation_name,
         data=data,
-        properties={
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
         stream=True,
         name=_run_name,
     )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)

@@ -372,28 +391,30 @@ def _apply_target_to_data(
     return target_output, generated_columns, run


-def
-
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}

-    :param
-    :type
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """

-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}

     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

-    if
-    for evaluator, mapping_config in
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}

                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in '
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,

@@ -439,7 +460,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,

@@ -458,10 +479,10 @@ def evaluate(
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a
-        keys as the column names in the evaluator input and values as the column names in the
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str,
+        names as keys and a values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]

@@ -482,7 +503,7 @@ def evaluate(
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }

         coherence_eval = CoherenceEvaluator(model_config=model_config)

@@ -497,15 +518,19 @@ def evaluate(
             },
             evaluator_config={
                 "coherence": {
-                    "
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "
-
-
-
-
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )

     """
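The docstring hunk above shows the 1.0.0b4 shape of `evaluator_config`: each per-evaluator entry nests its mappings under a `column_mapping` key, typed as `EvaluatorConfig`, while the removed annotation later in this diff shows that 1.0.0b2 accepted a flat `Dict[str, Dict[str, str]]`. A minimal migration sketch under that assumption (evaluator names and column names here are illustrative, not taken from the package); the `_evaluate.py` diff continues below.

# 1.0.0b2-style configuration (inferred from the removed
# `Optional[Dict[str, Dict[str, str]]]` annotation): a flat mapping per evaluator.
old_evaluator_config = {
    "coherence": {
        "response": "${data.response}",
        "query": "${data.query}",
    },
}

# Equivalent 1.0.0b4 configuration: the same mapping nested under "column_mapping",
# matching the EvaluatorConfig shape referenced in this diff.
new_evaluator_config = {
    "coherence": {
        "column_mapping": {
            "response": "${data.response}",
            "query": "${data.query}",
        },
    },
}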
@@ -544,41 +569,74 @@ def evaluate(
         raise e


-def
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
+def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    data: str,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-
-
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+    _validate_columns(input_data_df, evaluators, target, column_mapping)

     # Target Run
-
-
-
-
-
-
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )

-
-
-
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination() # pylint: disable=protected-access
+    target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-
-
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
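The hunk above wraps the `PFClient` construction in a `try`/`except MissingAzurePackage` and directs users to the `[remote]` extra when remote-tracking dependencies are absent. A hedged sketch of the call path that exercises this, assuming the public `evaluate` and `F1ScoreEvaluator` entry points and an `AzureAIProject` dict carrying subscription, resource group, and project name fields (all values are placeholders, not from the diff); the `_evaluate.py` diff continues below.

# Assumption: remote tracking requires `pip install azure-ai-evaluation[remote]`,
# per the error message added in the hunk above.
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder values
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

result = evaluate(
    data="data.jsonl",                           # path to a JSONL dataset
    evaluators={"f1_score": F1ScoreEvaluator()},
    azure_ai_project=azure_ai_project,           # enables the trace destination / studio URL
)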
@@ -586,21 +644,21 @@ def _evaluate( # pylint: disable=too-many-locals
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )

-        for evaluator_name, mapping in
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-
+                    column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup

         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None,
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.

@@ -610,44 +668,56 @@ def _evaluate( # pylint: disable=too-many-locals
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in
-
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        batch_run_client = ProxyClient(pf_client)
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-        with BatchRunContext(batch_run_client):
-            for evaluator_name, evaluator in evaluators.items():
-                evaluators_info[evaluator_name] = {}
-                evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-
-            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-            for evaluator_name, evaluator_info in evaluators_info.items():
-                evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-                evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)

     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-        evaluator_result_df =
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]

         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(

@@ -670,7 +740,7 @@ def _evaluate( # pylint: disable=too-many-locals
             else evaluator_result_df
         )

-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns

@@ -688,9 +758,11 @@ def _evaluate( # pylint: disable=too-many-locals
         evaluation_name,
     )

-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}

    if output_path:
         _write_output(output_path, result)

+    _print_summary(per_evaluator_results)
+
     return result
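The `_evaluate.py` changes above type the return value as `EvaluateResult`, a dict with `rows`, `metrics`, and `studio_url` keys, and add `_print_summary` for per-evaluator run summaries. A small caller-side sketch of consuming that shape (`summarize_evaluate_result` is a hypothetical helper, not part of the package); the diff continues below with the telemetry module.

import json
from typing import Any, Dict

import pandas as pd


def summarize_evaluate_result(result: Dict[str, Any]) -> pd.DataFrame:
    # Keys follow the EvaluateResult shape shown in this diff: "rows", "metrics", "studio_url".
    print(json.dumps(result["metrics"], indent=4))   # aggregated metrics, e.g. "<evaluator>.<metric>"
    if result.get("studio_url"):                     # populated when azure_ai_project is provided
        print("Studio URL:", result["studio_url"])
    return pd.DataFrame(result["rows"])              # per-row evaluator outputs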
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, Literal, Optional, Union, cast

 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow

@@ -16,31 +16,30 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

+from azure.ai.evaluation._model_configurations import AzureAIProject
+
 from ..._user_agent import USER_AGENT
-from .._utils import _trace_destination_from_project_scope
+from .._utils import EvaluateResult, _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)

 P = ParamSpec("P")
-R = TypeVar("R")


-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
     """
     Get evaluator type for telemetry.

     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: Literal["content-safety", "built-in", "custom"]
     """
-    built_in = False
-    content_safety = False
-
     module = inspect.getmodule(evaluator)
-
-
-
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

     if content_safety:
         return "content-safety"

@@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluateResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

-        evaluators = kwargs.get("evaluators",
-        azure_ai_project = kwargs.get("azure_ai_project", None)
+        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))

         pf_client = PFClient(
             config=(

@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
         track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
+        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
            "evaluate_target": evaluate_target,
            "evaluator_config": evaluator_config,