azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +5 -31
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +120 -300
- azure/ai/evaluation/_common/utils.py +23 -381
- azure/ai/evaluation/_constants.py +6 -19
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
- azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
- azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
- azure/ai/evaluation/_evaluate/_utils.py +28 -82
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +132 -203
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -2
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
- azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
- azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_simulator.py +207 -277
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +13 -31
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, TypeVar
 
 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,30 +16,31 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec
 
-from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
-
 from ..._user_agent import USER_AGENT
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
+R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable])
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: str
     """
-
-
+    built_in = False
+    content_safety = False
 
-
-
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
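The b2 implementation above classifies an evaluator by the module that defines it. A self-contained sketch of the same `inspect.getmodule` technique (the `classify` helper is illustrative, not part of the package):

```python
import inspect
import json


def classify(evaluator) -> str:
    """Label a callable "built-in" if it is defined inside the package's evaluator modules."""
    module = inspect.getmodule(evaluator)
    if module and module.__name__.startswith("azure.ai.evaluation._evaluators."):
        return "built-in"
    return "custom"


print(classify(json.loads))  # "custom": json.loads is defined in the json module
```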
@@ -97,22 +98,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
+def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
     """Decorator to log evaluate activity
 
     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, EvaluationResult]
+    :rtype: Callable[P, R]
     """
 
     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators =
-        azure_ai_project =
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)
 
         pf_client = PFClient(
             config=(
@@ -123,11 +124,10 @@ def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
             user_agent=USER_AGENT,
         )
 
-
-        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
+        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
         evaluate_target = bool(kwargs.get("target", None))
         evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions
+        custom_dimensions = {
             "track_in_cloud": track_in_cloud,
             "evaluate_target": evaluate_target,
             "evaluator_config": evaluator_config,
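Taken together, these telemetry hunks replace the decorator's hard-wired `EvaluationResult` return type with a `TypeVar`, making it generic over whatever the wrapped function returns. A minimal sketch of the `ParamSpec`/`TypeVar` pattern outside this package (the `traced` decorator name is illustrative):

```python
import functools
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")


def traced(func: Callable[P, R]) -> Callable[P, R]:
    """Wrap func without erasing its parameter or return types."""

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)

    return wrapper


@traced
def add(a: int, b: int) -> int:
    return a + b


assert add(1, 2) == 3  # type checkers still see (int, int) -> int
```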
azure/ai/evaluation/_evaluate/_utils.py

@@ -6,23 +6,15 @@ import logging
 import os
 import re
 import tempfile
+from collections import namedtuple
 from pathlib import Path
-from typing import
-import uuid
-import base64
+from typing import Dict
 
 import pandas as pd
-
-from
-
-from azure.ai.evaluation._constants import (
-    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
-    DefaultOpenEncoding,
-    EvaluationRunProperties,
-    Prefixes,
-)
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
 
 LOGGER = logging.getLogger(__name__)
 
@@ -31,20 +23,14 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
-class AzureMLWorkspace(NamedTuple):
-    subscription_id: str
-    resource_group_name: str
-    workspace_name: str
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
 
 
-def is_none(value)
+def is_none(value):
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
-    trace_provider: str,
-) -> AzureMLWorkspace:
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
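The hunk above trades the typed `NamedTuple` subclass for the classic `collections.namedtuple` factory. The two forms behave identically at runtime; only the type annotations differ, as this sketch shows:

```python
from collections import namedtuple
from typing import NamedTuple

# 1.0.0b2 form: factory function, no field annotations
AzureMLWorkspaceTriad = namedtuple(
    "AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]
)


# 1.0.0 form: typed class with the same runtime behavior
class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


a = AzureMLWorkspaceTriad("sub", "rg", "ws")
b = AzureMLWorkspace("sub", "rg", "ws")
assert tuple(a) == tuple(b)  # both index, unpack, and compare like plain tuples
```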
@@ -61,7 +47,7 @@ def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
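For context, the trace provider string being parsed here is an `azureml://` workspace path. A self-contained sketch of the extraction with a simplified pattern (the regex and example URI are illustrative stand-ins for `AZURE_WORKSPACE_REGEX_FORMAT`, whose full definition is not shown in this diff):

```python
import re
from collections import namedtuple

AzureMLWorkspaceTriad = namedtuple(
    "AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]
)

# Simplified stand-in; the package's real pattern is stricter and uses five groups.
PATTERN = (
    r"^azureml://subscriptions/([^/]+)/resource[gG]roups/([^/]+)"
    r"(/providers/Microsoft\.MachineLearningServices)?/workspaces/([^/]+)$"
)


def extract_triad(trace_provider: str) -> AzureMLWorkspaceTriad:
    match = re.match(PATTERN, trace_provider)
    if not match:
        raise ValueError(f"Malformed trace provider: {trace_provider}")
    return AzureMLWorkspaceTriad(match.group(1), match.group(2), match.group(4))


triad = extract_triad(
    "azureml://subscriptions/my-sub/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
)
assert triad.workspace_name == "my-ws"
```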
@@ -69,7 +55,7 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
+def _azure_pf_client_and_triad(trace_destination):
     from promptflow.azure._cli._utils import _get_azure_pf_client
 
     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -82,45 +68,15 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     return azure_pf_client, ws_triad
 
 
-def _store_multimodal_content(messages, tmpdir: str):
-    # verify if images folder exists
-    images_folder_path = os.path.join(tmpdir, "images")
-    os.makedirs(images_folder_path, exist_ok=True)
-
-    # traverse all messages and replace base64 image data with new file name.
-    for message in messages:
-        if isinstance(message.get("content", []), list):
-            for content in message.get("content", []):
-                if content.get("type") == "image_url":
-                    image_url = content.get("image_url")
-                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
-                        # Extract the base64 string
-                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")
-
-                        # Generate a unique filename
-                        image_file_name = f"{str(uuid.uuid4())}.jpg"
-                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
-
-                        # Decode the base64 string to binary image data
-                        image_data_binary = base64.b64decode(base64image)
-
-                        # Write the binary image data to the file
-                        image_file_path = os.path.join(images_folder_path, image_file_name)
-                        with open(image_file_path, "wb") as f:
-                            f.write(image_data_binary)
-
-
 def _log_metrics_and_instance_results(
-    metrics
-    instance_results
-    trace_destination
-    run
-    evaluation_name
-) ->
-    from azure.ai.evaluation._evaluate._eval_run import EvalRun
-
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
     if trace_destination is None:
-        LOGGER.
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
         return None
 
     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
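A hedged usage sketch of the removed `_store_multimodal_content` helper shown above, assuming the function and its module-level imports (`os`, `uuid`, `base64`) are in scope; the payload is arbitrary bytes standing in for real JPEG data:

```python
import base64
import os
import tempfile

fake_jpg = base64.b64encode(b"\xff\xd8\xff\xe0 not a real jpeg").decode()
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": f"data:image/jpg;base64,{fake_jpg}"}}
        ],
    }
]

with tempfile.TemporaryDirectory() as tmpdir:
    _store_multimodal_content(messages, tmpdir)
    # The inline data URL has been rewritten to a relative file path...
    assert messages[0]["content"][0]["image_url"]["url"].startswith("images/")
    # ...and the decoded bytes were written under <tmpdir>/images/.
    assert len(os.listdir(os.path.join(tmpdir, "images"))) == 1
```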
@@ -138,18 +94,10 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
-            # storing multi_modal images if exists
-            col_name = "inputs.conversation"
-            if col_name in instance_results.columns:
-                for item in instance_results[col_name].items():
-                    value = item[1]
-                    if "messages" in value:
-                        _store_multimodal_content(value["messages"], tmpdir)
-
-            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -164,9 +112,9 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-
-                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    "_azureml.evaluation_run": "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                    "isEvaluatorRun": "true",
                 }
             )
@@ -190,7 +138,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
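Given the three fields read above and the `azureml://` pattern parsed earlier in this file, the builder plausibly assembles a workspace URI like the one below; the exact format is an assumption, and `build_trace_destination` is an illustrative name, not the package's function:

```python
def build_trace_destination(project_scope: dict) -> str:
    # Assumed shape; mirrors the path that extract_workspace_triad_from_trace_provider parses.
    return (
        "azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}"
        "/providers/Microsoft.MachineLearningServices/workspaces/{project_name}"
    ).format(**project_scope)


print(build_trace_destination({
    "subscription_id": "my-sub",
    "resource_group_name": "my-rg",
    "project_name": "my-project",
}))
```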
@@ -203,19 +151,17 @@ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     return trace_destination
 
 
-def _write_output(path
+def _write_output(path, data_dict):
     p = Path(path)
-    if
+    if os.path.isdir(path):
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
-    print(f'Evaluation results saved to "{p.resolve()}".\n')
-
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config:
+    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
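The new `mapping_config: Dict[str, str]` signature takes promptflow-style `${data.<column>}` references. A simplified sketch of what applying such a mapping could look like; the helper and its exact semantics are assumptions, not the package's implementation:

```python
import re

import pandas as pd


def apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict) -> pd.DataFrame:
    """Rename columns referenced as ${data.<col>} to their mapped names."""
    renames = {}
    for target, reference in mapping_config.items():
        match = re.match(r"^\$\{data\.(.+)\}$", reference)
        if match and match.group(1) in source_df.columns:
            renames[match.group(1)] = target
    return source_df.rename(columns=renames)


df = pd.DataFrame({"question": ["What is BLEU?"], "answer": ["An n-gram overlap metric."]})
mapped = apply_column_mapping(df, {"query": "${data.question}", "response": "${data.answer}"})
assert list(mapped.columns) == ["query", "response"]
```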
@@ -265,7 +211,7 @@ def _apply_column_mapping(
     return result_df
 
 
-def _has_aggregator(evaluator
+def _has_aggregator(evaluator):
     return hasattr(evaluator, "__aggregate__")
 
 
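`_has_aggregator` is a duck-typed protocol check: any evaluator exposing an `__aggregate__` attribute opts into run-level aggregation. An illustrative (hypothetical) evaluator that would pass it:

```python
class ExactMatchEvaluator:
    """Hypothetical evaluator with a custom aggregation step."""

    def __call__(self, *, response: str, ground_truth: str) -> dict:
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results: list) -> dict:
        # Roll per-line scores up into a single run-level metric.
        scores = [r["exact_match"] for r in line_results]
        return {"exact_match_rate": sum(scores) / len(scores) if scores else 0.0}


def _has_aggregator(evaluator) -> bool:
    return hasattr(evaluator, "__aggregate__")


assert _has_aggregator(ExactMatchEvaluator())
```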
@@ -288,11 +234,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
     return default_value
 
 
-def set_event_loop_policy()
+def set_event_loop_policy():
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
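The Windows branch works around the "Event loop is closed" `RuntimeError` seen with the default proactor loop at interpreter shutdown; the selector policy avoids it. Calling the function once before any asyncio work is enough, as in this sketch mirroring the code above:

```python
import asyncio
import platform


def set_event_loop_policy() -> None:
    if platform.system().lower() == "windows":
        # Safe on other platforms: the attribute is only touched inside the guard.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())


async def main() -> None:
    await asyncio.sleep(0)


set_event_loop_policy()
asyncio.run(main())
```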
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -26,29 +26,30 @@ class _AsyncBleuScoreEvaluator:
 
 class BleuScoreEvaluator:
     """
-
+    Evaluator that computes the BLEU Score between two strings.
 
     BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
-    translation. It is widely used in text summarization and text generation use cases.
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.
 
-
-    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
-    indicator of quality.
+    **Usage**
 
-
+    .. code-block:: python
 
-
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
 
-
-
-
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call an BleuScoreEvaluator.
-    """
+    **Output format**
+
+    .. code-block:: python
 
-
-
+        {
+            "bleu_score": 0.22
+        }
+    """
 
     def __init__(self):
         self._async_evaluator = _AsyncBleuScoreEvaluator()
@@ -62,7 +63,7 @@ class BleuScoreEvaluator:
     :keyword ground_truth: The ground truth to be compared against.
     :paramtype ground_truth: str
     :return: The BLEU score.
-    :rtype:
+    :rtype: dict
     """
     return async_run_allowing_running_loop(
         self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
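The diff does not show how `_AsyncBleuScoreEvaluator` computes the score, but an equivalent 0-to-1 BLEU can be reproduced with NLTK; the tokenization and smoothing below are assumptions, not the evaluator's internals:

```python
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

response = "Tokyo is the capital of Japan."
ground_truth = "The capital of Japan is Tokyo."

score = sentence_bleu(
    [ground_truth.split()],  # one reference, whitespace-tokenized
    response.split(),
    smoothing_function=SmoothingFunction().method4,
)
print({"bleu_score": round(score, 2)})
```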
azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py

@@ -2,8 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._retrieval import RetrievalEvaluator
+from ._chat import ChatEvaluator
 
 __all__ = [
-    "RetrievalEvaluator",
+    "ChatEvaluator",
 ]
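For completeness, b2-era usage of the re-exported ChatEvaluator looked roughly like the sketch below; the constructor arguments and conversation shape are assumptions based on the beta-period API, not confirmed by this diff:

```python
from azure.ai.evaluation import ChatEvaluator

# Hypothetical Azure OpenAI model configuration; field names are illustrative.
model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment>",
}

chat_eval = ChatEvaluator(model_config=model_config)
result = chat_eval(
    conversation=[
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
)
```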