azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
In the hunks below, lines prefixed with `-` come from azure-ai-evaluation 1.0.0 and lines prefixed with `+` come from 1.0.0b1; lines shown blank or cut short after a `-` were truncated in the registry's diff view.

azure/ai/evaluation/_evaluate/_telemetry/__init__.py

```diff
@@ -6,40 +6,38 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict
 
 import pandas as pd
+
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
 from promptflow._sdk.entities._flows import Prompty as prompty_sdk
 from promptflow._sdk.entities._flows.dag import Flow as dag_flow
 from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
-from typing_extensions import ParamSpec
-
-from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
-P = ParamSpec("P")
-
 
-def _get_evaluator_type(evaluator: Dict[str, Callable])
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: str
     """
-
-
+    built_in = False
+    content_safety = False
 
-
-
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
@@ -84,7 +82,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         name = str(evaluator)
         pf_type = "Unknown"
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LOGGER.debug("Failed to get evaluator properties:
+        LOGGER.debug(f"Failed to get evaluator properties: {e}")
         name = str(evaluator)
         pf_type = "Unknown"
 
@@ -97,22 +95,20 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+def log_evaluate_activity(func) -> None:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
     """
 
     @functools.wraps(func)
-    def wrapper(*args
+    def wrapper(*args, **kwargs) -> Callable:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators =
-        azure_ai_project =
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)
 
         pf_client = PFClient(
             config=(
@@ -123,11 +119,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
             user_agent=USER_AGENT,
         )
 
-
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+        track_in_cloud = bool(pf_client._config.get_trace_destination())
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions
+        custom_dimensions = {
             "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,
@@ -159,7 +154,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
                 evaluator_info["failed_rows"] = failed_rows
                 evaluator_info["total_rows"] = total_rows
             except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate failed row info for
+                LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
             evaluators_info.append(evaluator_info)
 
         custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
@@ -172,7 +167,7 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
                 ):
                     pass
             except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info:
+                LOGGER.debug(f"Failed to collect evaluate usage info: {e}")
 
             return result
 
```
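The evaluator-type heuristic that the beta adds to `_get_evaluator_type` is easy to exercise on its own: anything whose defining module sits under `azure.ai.evaluation._evaluators.` counts as built-in, with a further prefix check for the content-safety family. The sketch below re-implements that check for illustration; `classify_evaluator` and `my_custom_evaluator` are hypothetical names, not part of the SDK.

```python
import inspect


def classify_evaluator(evaluator) -> str:
    """Standalone version of the module-prefix heuristic shown in the hunk above."""
    module = inspect.getmodule(evaluator)
    built_in = module is not None and module.__name__.startswith("azure.ai.evaluation._evaluators.")
    if built_in and module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety"):
        return "content-safety"
    return "built-in" if built_in else "custom"


def my_custom_evaluator(response: str, ground_truth: str) -> dict:
    # Defined outside the azure.ai.evaluation._evaluators namespace, so it is reported as "custom".
    return {"exact_match": float(response == ground_truth)}


print(classify_evaluator(my_custom_evaluator))  # -> custom
```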
azure/ai/evaluation/_evaluate/_utils.py

```diff
@@ -6,23 +6,14 @@ import logging
 import os
 import re
 import tempfile
+from collections import namedtuple
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
-import uuid
-import base64
 
 import pandas as pd
-
-from
-
-from azure.ai.evaluation._constants import (
-    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-    DefaultOpenEncoding,
-    EvaluationRunProperties,
-    Prefixes,
-)
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
 
 LOGGER = logging.getLogger(__name__)
 
@@ -31,45 +22,39 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
-class AzureMLWorkspace(NamedTuple):
-    subscription_id: str
-    resource_group_name: str
-    workspace_name: str
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
 
 
-def is_none(value)
+def is_none(value):
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-    trace_provider: str,
-) -> AzureMLWorkspace:
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
-
-
-
-
-
-
-
-
-
-
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
-    with open(path, "r", encoding=
+    with open(path, "r", encoding="utf-8") as f:
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+def _azure_pf_client_and_triad(trace_destination):
     from promptflow.azure._cli._utils import _get_azure_pf_client
 
     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
```
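Both versions reduce a trace-provider string of the form `azureml://subscriptions/<subscription_id>/resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace_name>` to a workspace triad; 1.0.0 returns a typed `NamedTuple`, the beta a `collections.namedtuple`, and attribute access is the same either way. Below is a minimal sketch of that parsing, using a simplified pattern rather than the SDK's `AZURE_WORKSPACE_REGEX_FORMAT` (only the tail of the real pattern is visible above).

```python
import re
from collections import namedtuple

AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])

# Simplified, illustrative pattern; the SDK's AZURE_WORKSPACE_REGEX_FORMAT is stricter.
_TRACE_PROVIDER_PATTERN = re.compile(
    r"azureml://subscriptions/(?P<sub>[^/]+)/resource[gG]roups/(?P<rg>[^/]+)"
    r"(?:/providers/Microsoft\.MachineLearningServices)?/workspaces/(?P<ws>[^/]+)$"
)


def parse_trace_provider(trace_provider: str) -> AzureMLWorkspaceTriad:
    match = _TRACE_PROVIDER_PATTERN.match(trace_provider)
    if not match:
        raise ValueError(f"Malformed trace provider string: {trace_provider}")
    return AzureMLWorkspaceTriad(match.group("sub"), match.group("rg"), match.group("ws"))


triad = parse_trace_provider(
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-workspace"
)
print(triad.workspace_name)  # -> my-workspace
```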
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -82,45 +67,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     return azure_pf_client, ws_triad
 
 
-def _store_multimodal_content(messages, tmpdir: str):
-    # verify if images folder exists
-    images_folder_path = os.path.join(tmpdir, "images")
-    os.makedirs(images_folder_path, exist_ok=True)
-
-    # traverse all messages and replace base64 image data with new file name.
-    for message in messages:
-        if isinstance(message.get("content", []), list):
-            for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
-
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
-
-
 def _log_metrics_and_instance_results(
-    metrics
-    instance_results
-    trace_destination
-    run
-    evaluation_name
-) ->
-    from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
     if trace_destination is None:
-        LOGGER.
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
         return None
 
     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -138,21 +93,13 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            # storing multi_modal images if exists
-            col_name = "inputs.conversation"
-            if col_name in instance_results.columns:
-                for item in instance_results[col_name].items():
-                    value = item[1]
-                    if "messages" in value:
-                        _store_multimodal_content(value["messages"], tmpdir)
-
-            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
-            with open(tmp_path, "w", encoding=
+            with open(tmp_path, "w", encoding="utf-8") as f:
                 f.write(instance_results.to_json(orient="records", lines=True))
 
             ev_run.log_artifact(tmpdir, artifact_name)
```
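The artifact upload in `_log_metrics_and_instance_results` boils down to serializing the per-row results DataFrame as JSON Lines inside a temporary directory and handing that directory to the run. A standalone sketch of the serialization step (the rows and artifact name below are placeholders, not the SDK's `EvalRun.EVALUATION_ARTIFACT` constant):

```python
import os
import tempfile

import pandas as pd

# Placeholder rows standing in for the evaluation's per-line results.
instance_results = pd.DataFrame(
    [
        {"inputs.query": "What is the capital of Japan?", "outputs.bleu.bleu_score": 0.22},
        {"inputs.query": "Who wrote Hamlet?", "outputs.bleu.bleu_score": 0.95},
    ]
)

artifact_name = "instance_results.jsonl"  # illustrative name only

with tempfile.TemporaryDirectory() as tmpdir:
    tmp_path = os.path.join(tmpdir, artifact_name)
    # Same call as in the diff: one JSON object per result row.
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(instance_results.to_json(orient="records", lines=True))
    with open(tmp_path, encoding="utf-8") as f:
        print(f.read())
```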
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -164,9 +111,9 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-
-                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    "_azureml.evaluation_run": "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                    "isEvaluatorRun": "true",
                 }
             )
 
@@ -190,7 +137,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -203,20 +150,16 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     return trace_destination
 
 
-def _write_output(path
+def _write_output(path, data_dict):
     p = Path(path)
-    if
+    if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
-    with open(p, "w"
+    with open(p, "w") as f:
         json.dump(data_dict, f)
 
-    print(f'Evaluation results saved to "{p.resolve()}".\n')
-
 
-def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
-) -> pd.DataFrame:
+def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
 
```
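`_write_output` resolves a directory path to a default results file name and then dumps the results dictionary as JSON; the stable version additionally prints where the file landed. A minimal re-creation of that behavior, with a placeholder file name standing in for the SDK's `DEFAULT_EVALUATION_RESULTS_FILE_NAME` constant:

```python
import json
import os
from pathlib import Path

DEFAULT_RESULTS_FILE = "evaluation_results.json"  # placeholder for the SDK constant


def write_output(path: str, data_dict: dict) -> Path:
    p = Path(path)
    if os.path.isdir(path):
        p = p / DEFAULT_RESULTS_FILE
    with open(p, "w", encoding="utf-8") as f:
        json.dump(data_dict, f)
    return p


saved_to = write_output(".", {"metrics": {"bleu.bleu_score": 0.22}, "rows": []})
print(f'Evaluation results saved to "{saved_to.resolve()}".')
```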
azure/ai/evaluation/_evaluate/_utils.py (continued)

```diff
@@ -224,11 +167,10 @@ def _apply_column_mapping(
     :param source_df: the data frame to be changed.
     :type source_df: pd.DataFrame
     :param mapping_config: The configuration, containing column mapping.
-    :type mapping_config:
+    :type mapping_config: dict.
     :param inplace: If true, the source_df will be changed inplace.
     :type inplace: bool
     :return: The modified data frame.
-    :rtype: pd.DataFrame
     """
     result_df = source_df
 
@@ -265,34 +207,31 @@ def _apply_column_mapping(
     return result_df
 
 
-def _has_aggregator(evaluator
+def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
-def get_int_env_var(env_var_name
+def get_int_env_var(env_var_name, default_value=None):
     """
-    The function `get_int_env_var` retrieves an integer environment variable value, with
+    The function `get_int_env_var` retrieves an integer environment variable value, with an optional
     default value if the variable is not set or cannot be converted to an integer.
 
     :param env_var_name: The name of the environment variable you want to retrieve the value of
-    :type env_var_name: str
     :param default_value: The default value is the value that will be returned if the environment
-
-    :type default_value: int
+    variable is not found or if it cannot be converted to an integer
     :return: an integer value.
-    :rtype: int
     """
     try:
-        return int(os.environ
-    except
+        return int(os.environ.get(env_var_name, default_value))
+    except Exception:
         return default_value
 
 
-def set_event_loop_policy()
+def set_event_loop_policy():
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
```
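`get_int_env_var` returns the default both when the variable is missing and when its value cannot be converted to an integer, which the broad `except` in the beta makes explicit. A quick check of that fallback behavior (the variable names below are arbitrary, not used by the SDK):

```python
import os


def get_int_env_var(env_var_name, default_value=None):
    # Same fallback behavior as the helper in the hunk above.
    try:
        return int(os.environ.get(env_var_name, default_value))
    except Exception:
        return default_value


os.environ["MY_WORKER_COUNT"] = "not-a-number"
print(get_int_env_var("MY_WORKER_COUNT", 4))    # -> 4: int() raises, so the default is used
print(get_int_env_var("MY_UNSET_VARIABLE", 8))  # -> 8: variable not set
print(get_int_env_var("MY_UNSET_VARIABLE"))     # -> None: int(None) raises and the default is None
```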
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

```diff
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
 
+from promptflow._utils.async_utils import async_run_allowing_running_loop
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
 
@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-
+    Evaluator that computes the BLEU Score between two strings.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.
 
-
-    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-    indicator of quality.
+    **Usage**
 
-
+    .. code-block:: python
 
-
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-        :start-after: [START bleu_score_evaluator]
-        :end-before: [END bleu_score_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call an BleuScoreEvaluator.
-    """
+    **Output format**
 
-
-
+    .. code-block:: python
+
+        {
+            "bleu_score": 0.22
+        }
+    """
 
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The BLEU score.
-        :rtype:
+        :rtype: dict
         """
         return async_run_allowing_running_loop(
             self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
```
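Under the hood, both versions of `BleuScoreEvaluator` hand tokenized strings to NLTK's `sentence_bleu` with a smoothing function (see the imports at the top of `_bleu.py`). The sketch below reproduces that kind of call with naive whitespace tokenization and an arbitrarily chosen smoothing method, so its score will only approximate what the evaluator reports:

```python
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = "The capital of Japan is Tokyo.".split()
hypothesis = "Tokyo is the capital of Japan.".split()

# sentence_bleu expects a list of reference token lists plus one hypothesis token list.
score = sentence_bleu([reference], hypothesis, smoothing_function=SmoothingFunction().method4)
print({"bleu_score": round(score, 2)})
```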
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

```diff
@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from .
+from ._chat import ChatEvaluator
 
 __all__ = [
-    "
+    "ChatEvaluator",
 ]
```