azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from time import sleep
 
  from ._batch_run import CodeClient, ProxyClient
 
- #import aoai_mapping
+ # import aoai_mapping
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -30,17 +30,18 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
  eval_run_id: str
  grader_name_map: Dict[str, str]
 
+
  def _split_evaluators_and_grader_configs(
- evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
- ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
+ ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
  """
  Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
  dictionaries that each contain one subset, the first containing the evaluators and the second containing
  the AOAI graders. AOAI graders are defined as anything that is an instance of the AoaiGrader class,
- including child class instances.
+ including child class instances.
 
  :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
- and value as the evaluator function or AOAI grader.
+ and value as the evaluator function or AOAI grader.
  :type evaluators: Dict[str, Union[Callable, ]]
  :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
  :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
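
Note: the split described in this docstring boils down to an isinstance() check against the grader base class. A minimal, self-contained sketch of the pattern, using a stand-in Grader base class rather than the SDK's AzureOpenAIGrader:

    from typing import Callable, Dict, Tuple, Union

    class Grader:  # stand-in for the SDK's AzureOpenAIGrader base class
        pass

    def split_evaluators_and_graders(
        evaluators: Dict[str, Union[Callable, Grader]],
    ) -> Tuple[Dict[str, Callable], Dict[str, Grader]]:
        """Return (plain evaluators, graders); anything subclassing Grader counts as a grader."""
        callables: Dict[str, Callable] = {}
        graders: Dict[str, Grader] = {}
        for alias, value in evaluators.items():
            if isinstance(value, Grader):  # child-class instances land here too
                graders[alias] = value
            else:
                callables[alias] = value
        return callables, graders
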
@@ -54,13 +55,14 @@ def _split_evaluators_and_grader_configs(
  true_evaluators[key] = value
  return true_evaluators, aoai_graders
 
+
  @experimental
  def _begin_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- data: pd.DataFrame,
- run_name: str
- ) -> List[OAIEvalRunCreationInfo]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ data: pd.DataFrame,
+ run_name: str,
+ ) -> List[OAIEvalRunCreationInfo]:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
@@ -84,26 +86,20 @@ def _begin_aoai_evaluation(
  :rtype: List[OAIEvalRunCreationInfo]
  """
 
-
  LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
  all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
  for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
- all_eval_run_info.append(_begin_single_aoai_evaluation(
- selected_graders,
- data,
- selected_column_mapping,
- run_name
- ))
+ all_eval_run_info.append(
+ _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
+ )
 
  return all_eval_run_info
 
+
  def _begin_single_aoai_evaluation(
- graders: Dict[str, AzureOpenAIGrader],
- data: pd.DataFrame,
- column_mapping: Dict[str, str],
- run_name: str
- ) -> OAIEvalRunCreationInfo:
+ graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+ ) -> OAIEvalRunCreationInfo:
  """
  Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
  AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
@@ -121,7 +117,7 @@ def _begin_single_aoai_evaluation(
  """
 
  # Format data for eval group creation
- grader_name_list = []
+ grader_name_list = []
  grader_list = []
  # It's expected that all graders supplied for a single eval run use the same credentials
  # so grab a client from the first grader.
@@ -135,19 +131,17 @@ def _begin_single_aoai_evaluation(
  # Create eval group
  # import pdb; pdb.set_trace()
  eval_group_info = client.evals.create(
- data_source_config=data_source_config,
- testing_criteria=grader_list,
- metadata={"is_foundry_eval": "true"}
+ data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
  )
-
+
  LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
  # Use eval group info to map grader IDs back to user-assigned names.
  grader_name_map = {}
  num_criteria = len(eval_group_info.testing_criteria)
  if num_criteria != len(grader_name_list):
  raise EvaluationException(
- message=f"Number of testing criteria ({num_criteria})" +
- f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
+ message=f"Number of testing criteria ({num_criteria})"
+ + f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.INVALID_VALUE,
  target=ErrorTarget.AOAI_GRADER,
@@ -155,21 +149,24 @@ def _begin_single_aoai_evaluation(
  for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
  grader_name_map[criteria.id] = name
 
- # Create eval run
+ # Create eval run
  eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
- LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
- " Results will be retrieved after normal evaluation is complete...")
+ LOGGER.info(
+ f"AOAI: Eval run created with id {eval_run_id}."
+ + " Results will be retrieved after normal evaluation is complete..."
+ )
+
+ return OAIEvalRunCreationInfo(
+ client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+ )
 
- return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
 
- def _get_evaluation_run_results(
- all_run_info: List[OAIEvalRunCreationInfo]
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume. This method accepts a list of eval run information, and will combine the
  results into a single dataframe and metrics dictionary.
-
+
  :param all_run_info: A list of evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,13 +185,14 @@ def _get_evaluation_run_results(
 
  return output_df, run_metrics
 
+
  def _get_single_run_results(
- run_info: OAIEvalRunCreationInfo,
- ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+ run_info: OAIEvalRunCreationInfo,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
  """
  Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
  pipeline to consume.
-
+
  :param run_info: The evaluation run information that contains the needed values
  to retrieve the results of the evaluation run.
  :type run_info: OAIEvalRunCreationInfo
@@ -205,28 +203,30 @@ def _get_single_run_results(
  """
  # Wait for evaluation run to complete
  run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
  if run_results.status != "completed":
  raise EvaluationException(
  message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + f" failed with status {run_results.status}.",
+ + f" failed with status {run_results.status}.",
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
  )
- LOGGER.info(f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
- + " completed successfully. Gathering results...")
+
  # Convert run results into a dictionary of metrics
  run_metrics = {}
  if run_results.per_testing_criteria_results is None:
- msg = ("AOAI evaluation run returned no results, despite 'completed' status. This might" +
- " occur when invalid or conflicting models are selected in the model and grader configs."
- f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}")
+ msg = (
+ "AOAI evaluation run returned no results, despite 'completed' status. This might"
+ + " occur when invalid or conflicting models are selected in the model and grader configs."
+ f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
+ )
  raise EvaluationException(
  message=msg,
  blame=ErrorBlame.UNKNOWN,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
  for criteria_result in run_results.per_testing_criteria_results:
  grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
  passed = criteria_result.passed
@@ -235,7 +235,6 @@ def _get_single_run_results(
  formatted_column_name = f"{grader_name}.pass_rate"
  run_metrics[formatted_column_name] = ratio
 
-
  # Get full results and convert them into a dataframe.
  # Notes on raw full data output from OAI eval runs:
  # Each row in the full results list in itself a list.
@@ -246,36 +245,72 @@ def _get_single_run_results(
  # The passed and score values are then added to the results dictionary, prepended with the grader's name
  # as entered by the user in the inputted dictionary.
  # Other values, if they exist, are also added to the results dictionary.
- raw_list_results = run_info["client"].evals.runs.output_items.list(
- eval_id=run_info["eval_group_id"],
- run_id=run_info["eval_run_id"]
- )
+
+ # Collect all results with pagination
+ all_results = []
+ next_cursor = None
+ limit = 100 # Max allowed by API
+
+ while True:
+ # Build kwargs for the API call
+ list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+ if next_cursor is not None:
+ list_kwargs["after"] = next_cursor
+
+ raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+ # Add current page results
+ all_results.extend(raw_list_results.data)
+
+ # Check for more pages
+ if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+ if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+ # Get the last item's ID for cursor-based pagination
+ next_cursor = raw_list_results.data[-1].id
+ else:
+ break
+ else:
+ break
+
  listed_results = {"index": []}
  # raw data has no order guarantees, we need to sort them by their
  # datasource_item_id
- for row_result in raw_list_results.data:
+ for row_result in all_results:
  # Add the datasource_item_id for later sorting
  listed_results["index"].append(row_result.datasource_item_id)
  for single_grader_row_result in row_result.results:
  grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
  for name, value in single_grader_row_result.items():
- if name in ["name"]: # Todo decide if we also want to exclude "sample"
+ if name in ["name"]: # Todo decide if we also want to exclude "sample"
  continue
  if name.lower() == "passed":
  # create a `_result` column for each grader
  result_column_name = f"outputs.{grader_name}.{grader_name}_result"
- if len(result_column_name) < 50: #TODO: is this the limit? Should we keep "passed"?
- if (result_column_name not in listed_results):
+ if len(result_column_name) < 50: # TODO: is this the limit? Should we keep "passed"?
+ if result_column_name not in listed_results:
  listed_results[result_column_name] = []
  listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
 
  formatted_column_name = f"outputs.{grader_name}.{name}"
- if (formatted_column_name not in listed_results):
+ if formatted_column_name not in listed_results:
  listed_results[formatted_column_name] = []
  listed_results[formatted_column_name].append(value)
+
+ # Ensure all columns have the same length as the index
+ num_rows = len(listed_results["index"])
+ for col_name in list(listed_results.keys()):
+ if col_name != "index":
+ col_length = len(listed_results[col_name])
+ if col_length < num_rows:
+ # Pad with None values
+ listed_results[col_name].extend([None] * (num_rows - col_length))
+ elif col_length > num_rows:
+ # This shouldn't happen, but truncate if it does
+ listed_results[col_name] = listed_results[col_name][:num_rows]
+
  output_df = pd.DataFrame(listed_results)
  # sort by index
- output_df = output_df.sort_values('index', ascending=[True])
+ output_df = output_df.sort_values("index", ascending=[True])
  # remove index column
  output_df.drop(columns=["index"], inplace=True)
  return output_df, run_metrics
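
Note: the pagination added above distills to the following standalone sketch. It assumes the OpenAI Python client's cursor-style list interface (a page object with `data` and `has_more`, plus an `after` cursor parameter); `client`, `eval_group_id`, and `run_id` are placeholders.

    def list_all_output_items(client, eval_group_id: str, run_id: str, page_size: int = 100):
        """Collect every output item across pages using cursor-based pagination."""
        all_items = []
        cursor = None
        while True:
            kwargs = {"eval_id": eval_group_id, "run_id": run_id, "limit": page_size}
            if cursor is not None:
                kwargs["after"] = cursor  # resume after the last item already fetched
            page = client.evals.runs.output_items.list(**kwargs)
            all_items.extend(page.data)
            if getattr(page, "has_more", False) and page.data:
                cursor = page.data[-1].id  # the last item's ID becomes the next cursor
            else:
                return all_items
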
@@ -303,9 +338,10 @@ def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str,
  target=ErrorTarget.AOAI_GRADER,
  )
 
- grader_class = _get_grader_class(grader_id)
+ grader_class = _get_grader_class(grader_id)
  return grader_class(**init_params)
 
+
  def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  """
  Given a model ID, return the class of the corresponding grader wrapper.
@@ -316,12 +352,17 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
  AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader,
  )
+
  id_map = {
  AzureOpenAIGrader.id: AzureOpenAIGrader,
  AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
  AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
  AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+ AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+ AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
  }
 
  for key in id_map.keys():
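
Note: grader deserialization is a registry lookup keyed on each wrapper class's `id` attribute; 1.10.0 extends the map with AzureOpenAIScoreModelGrader and AzureOpenAIPythonGrader. A hedged sketch of the dispatch pattern with stand-in classes and made-up identifier strings:

    from typing import Any, Dict, Type

    class BaseGrader:
        id = "example://graders/base"  # illustrative id, not the SDK's real value

        def __init__(self, **kwargs: Any) -> None:
            self.init_params = kwargs

    class LabelGrader(BaseGrader):
        id = "example://graders/label"

    REGISTRY: Dict[str, Type[BaseGrader]] = {cls.id: cls for cls in (BaseGrader, LabelGrader)}

    def build_grader(grader_id: str, init_params: Dict[str, Any]) -> BaseGrader:
        """Map a serialized grader id back to its wrapper class and instantiate it."""
        try:
            grader_cls = REGISTRY[grader_id]
        except KeyError as exc:
            raise ValueError(f"Unknown grader id: {grader_id}") from exc
        return grader_cls(**init_params)
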
@@ -336,9 +377,9 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
 
 
  def _get_graders_and_column_mappings(
- graders: Dict[str, AzureOpenAIGrader],
- column_mappings: Optional[Dict[str, Dict[str, str]]],
- ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
+ graders: Dict[str, AzureOpenAIGrader],
+ column_mappings: Optional[Dict[str, Dict[str, str]]],
+ ) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
  """
  Given a dictionary of column mappings and a dictionary of AOAI graders,
  Split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
@@ -366,20 +407,21 @@ def _get_graders_and_column_mappings(
  """
 
  default_mapping = column_mappings.get("default", None)
- return [({name : grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+ return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+
 
  def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """Produce a data source config that maps all columns from the supplied data source into
  the OAI API. The mapping is naive unless a column mapping is provided, in which case
  the column mapping's values overrule the relevant naive mappings
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
  :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
  :type column_mapping: Optional[Dict[str, str]]
  :return: A dictionary that can act as data source config for OAI evaluation group creation.
- :rtype: Dict[str, Any]
+ :rtype: Dict[str, Any]
  """
 
  data_source_config = {
@@ -388,7 +430,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  "type": "object",
  "properties": {},
  "required": [],
- }
+ },
  }
  properties = data_source_config["item_schema"]["properties"]
  required = data_source_config["item_schema"]["required"]
@@ -399,10 +441,11 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
  required.append(key)
  return data_source_config
 
+
  def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
  """Produce a data source config that naively maps all columns from the supplied data source into
  the OAI API.
-
+
  :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
  helper function.
  :type input_data_df: pd.DataFrame
@@ -424,10 +467,11 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st
  "type": "object",
  "properties": properties,
  "required": required,
- }
+ },
  }
  return data_source_config
 
+
  def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
  """
  Given a dataframe of data to be evaluated, and an optional column mapping,
@@ -457,7 +501,7 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  # dictionary that'll work in an OAI data source.
  for row in input_data_df.iterrows():
  row_dict = {}
- for oai_key,dataframe_key in column_to_source_map.items():
+ for oai_key, dataframe_key in column_to_source_map.items():
  row_dict[oai_key] = str(row[1][dataframe_key])
  content.append({"item": row_dict})
 
@@ -466,20 +510,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
  "source": {
  "type": "file_content",
  "content": content,
- }
+ },
  }
 
+
  def _begin_eval_run(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- run_name: str,
- input_data_df: pd.DataFrame,
- column_mapping: Dict[str, str]
- ) -> str:
+ client: Union[OpenAI, AzureOpenAI],
+ eval_group_id: str,
+ run_name: str,
+ input_data_df: pd.DataFrame,
+ column_mapping: Dict[str, str],
+ ) -> str:
  """
- Given an eval group id and a dataset file path, use the AOAI API to
+ Given an eval group id and a dataset file path, use the AOAI API to
  start an evaluation run with the given name and description.
- Returns a poller that can be used to monitor the run.
+ Returns a poller that can be used to monitor the run.
 
  :param client: The AOAI client to use for the evaluation.
  :type client: Union[OpenAI, AzureOpenAI]
@@ -499,18 +544,16 @@ def _begin_eval_run(
  eval_id=eval_group_id,
  data_source=data_source,
  name=run_name,
- metadata={"sample_generation": "off","file_format": "jsonl", "is_foundry_eval": "true"}
+ metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
  # TODO decide if we want to add our own timeout value?
  )
  return eval_run.id
 
+
  # Post built TODO: replace with _red_team.py's retry logic?
  def _wait_for_run_conclusion(
- client: Union[OpenAI, AzureOpenAI],
- eval_group_id: str,
- eval_run_id: str,
- max_wait_seconds = 21600
- ) -> Any:
+ client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
+ ) -> Any:
  """
  Perform exponential backoff polling to get the results of an AOAI evaluation run.
  Raises an EvaluationException if max attempts are reached without receiving a concluding status.
@@ -532,8 +575,8 @@ def _wait_for_run_conclusion(
  iters = 0
  # start with ~51 minutes of exponential backoff
  # max wait time = 2^10 * 3 = 3072 seconds ~= 51 minutes
- wait_interval = 3 # Seconds.
- while(True):
+ wait_interval = 3 # Seconds.
+ while True:
  wait_interval *= 1.5
  total_wait += wait_interval
  # Reduce last wait interval if total wait time exceeds max wait time
@@ -541,13 +584,13 @@ def _wait_for_run_conclusion(
  wait_interval -= total_wait - max_wait_seconds
  sleep(wait_interval)
  response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
- if response.status not in ["queued", "in_progress"]:
+ if response.status not in ["queued", "in_progress"]:
  return response
  if total_wait > max_wait_seconds:
  raise EvaluationException(
  message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
- + f" rounds of polling. Final status was {response.status}",
+ + f" rounds of polling. Final status was {response.status}",
  blame=ErrorBlame.USER_ERROR,
  category=ErrorCategory.FAILED_EXECUTION,
  target=ErrorTarget.AOAI_GRADER,
- )
+ )
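
Note: the polling loop above is a capped exponential backoff: start at 3 seconds, grow by 1.5x each round, and never exceed max_wait_seconds in total. A standalone sketch of the same schedule, with a placeholder `get_status` callable standing in for `client.evals.runs.retrieve`:

    import time
    from typing import Callable

    def wait_for_conclusion(get_status: Callable[[], str], max_wait_seconds: float = 21600) -> str:
        """Poll with exponential backoff until the status leaves 'queued'/'in_progress'."""
        total_wait = 0.0
        wait_interval = 3.0  # seconds; multiplied by 1.5 each round
        while True:
            wait_interval *= 1.5
            total_wait += wait_interval
            if total_wait > max_wait_seconds:
                # Shrink the final sleep so the overall budget is not exceeded.
                wait_interval -= total_wait - max_wait_seconds
            time.sleep(wait_interval)
            status = get_status()
            if status not in ("queued", "in_progress"):
                return status
            if total_wait > max_wait_seconds:
                raise TimeoutError(f"Timed out waiting for the run; last status was {status}")
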
@@ -17,7 +17,6 @@ from typing_extensions import ParamSpec
 
  from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
- from ..._user_agent import USER_AGENT
  from .._utils import _trace_destination_from_project_scope
 
  LOGGER = logging.getLogger(__name__)
@@ -13,6 +13,9 @@ import base64
  import math
 
  import pandas as pd
+ from tqdm import tqdm
+
+ from azure.core.pipeline.policies import UserAgentPolicy
  from azure.ai.evaluation._legacy._adapters.entities import Run
 
  from azure.ai.evaluation._constants import (
@@ -24,6 +27,7 @@ from azure.ai.evaluation._constants import (
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import AzureAIProject
  from azure.ai.evaluation._version import VERSION
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._azure._clients import LiteMLClient
 
  LOGGER = logging.getLogger(__name__)
@@ -127,6 +131,7 @@ def process_message_content(content, images_folder_path):
  f.write(image_data_binary)
  return None
 
+
  def _log_metrics_and_instance_results_onedp(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -146,7 +151,8 @@ def _log_metrics_and_instance_results_onedp(
  )
  client = EvaluationServiceOneDPClient(
  endpoint=project_url,
- credential=credentials
+ credential=credentials,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
  )
 
  # Massaging before artifacts are put on disk
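
Note: the client construction above threads the SDK's user-agent string through azure-core. UserAgentPolicy and its base_user_agent argument are standard azure-core APIs; the version string below is only an example.

    from azure.core.pipeline.policies import UserAgentPolicy

    # azure-core appends its own SDK/platform details to whatever base string is supplied.
    policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.10.0")
    print(policy.user_agent)  # e.g. "azure-ai-evaluation/1.10.0 azsdk-python-core/... Python/..."
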
@@ -172,21 +178,19 @@ def _log_metrics_and_instance_results_onedp(
 
  properties = {
  EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
  EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
 
  create_evaluation_result_response = client.create_evaluation_result(
- name=uuid.uuid4(),
- path=tmpdir,
- metrics=metrics
+ name=uuid.uuid4(), path=tmpdir, metrics=metrics
  )
 
  upload_run_response = client.start_evaluation_run(
  evaluation=EvaluationUpload(
  display_name=evaluation_name,
+ properties=properties,
  )
  )
 
@@ -196,14 +200,14 @@ def _log_metrics_and_instance_results_onedp(
  display_name=evaluation_name,
  status="Completed",
  outputs={
- 'evaluationResultId': create_evaluation_result_response.id,
+ "evaluationResultId": create_evaluation_result_response.id,
  },
- properties=properties,
- )
+ ),
  )
 
  return update_run_response.properties.get("AiStudioEvaluationUri")
 
+
  def _log_metrics_and_instance_results(
  metrics: Dict[str, Any],
  instance_results: pd.DataFrame,
@@ -266,11 +270,11 @@ def _log_metrics_and_instance_results(
  # We are doing that only for the pure evaluation runs.
  if run is None:
  properties = {
- EvaluationRunProperties.RUN_TYPE: "eval_run",
- EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
- EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
- "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
- }
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
+ EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+ EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+ "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+ }
  properties.update(_convert_name_map_into_property_entries(name_map))
  ev_run.write_properties_to_run_history(properties=properties)
  else:
@@ -321,7 +325,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
  json.dump(data_dict, f, ensure_ascii=False)
 
- print(f'Evaluation results saved to "{p.resolve()}".\n')
+ # Use tqdm.write to print message without interfering with any current progress bar
+ tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
  def _apply_column_mapping(
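
Note: tqdm.write is used above so the "results saved" message does not get spliced into an active progress bar the way a bare print would. A small illustration:

    from time import sleep
    from tqdm import tqdm

    for i in tqdm(range(3), desc="evaluating"):
        sleep(0.1)
        # tqdm.write clears the bar, prints the line, then redraws the bar,
        # so messages land on their own lines instead of breaking the bar.
        tqdm.write(f"finished row {i}")
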
@@ -407,9 +412,11 @@ def set_event_loop_policy() -> None:
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+
  # textwrap.wrap tries to do fancy nonsense that we don't want
  def _wrap(s, w):
- return [s[i:i + w] for i in range(0, len(s), w)]
+ return [s[i : i + w] for i in range(0, len(s), w)]
+
 
  def _convert_name_map_into_property_entries(
  name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
@@ -433,7 +440,7 @@ def _convert_name_map_into_property_entries(
  num_segments = math.ceil(len(name_map_string) / segment_length)
  # Property map is somehow still too long to encode within the space
  # we allow, so give up, but make sure the service knows we gave up
- if (num_segments > max_segments):
+ if num_segments > max_segments:
  return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
 
  result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
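
Note: the name map is JSON-encoded and chopped into fixed-width segments (950 characters by default, at most 10 of them) because run-history property values have a length cap; -1 signals that the map was too long to store at all. A compact sketch of that segmentation, using hypothetical property key names rather than the SDK's internal constants:

    import json
    import math
    from typing import Any, Dict

    def to_property_segments(
        name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
    ) -> Dict[str, Any]:
        """Split the JSON-encoded map into numbered segments small enough to store."""
        encoded = json.dumps(name_map)
        num_segments = math.ceil(len(encoded) / segment_length)
        if num_segments > max_segments:
            return {"name_map_length": -1}  # give up, but record that we gave up
        result: Dict[str, Any] = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = encoded[i * segment_length : (i + 1) * segment_length]
        return result
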
@@ -443,6 +450,7 @@ def _convert_name_map_into_property_entries(
  result[segment_key] = segments_list[i]
  return result
 
+
  class JSONLDataFileLoader:
  def __init__(self, filename: Union[os.PathLike, str]):
  self.filename = filename
@@ -34,15 +34,15 @@ class BleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START bleu_score_evaluator]
  :end-before: [END bleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
  .. admonition:: Example with Threshold:
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call an BleuScoreEvaluator.
  """
 
- id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+ id = "azureai://built-in/evaluators/bleu_score"
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
  def __init__(self, *, threshold=0.5):