azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py

@@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
+import math
 
 import pandas as pd
 from azure.ai.evaluation._legacy._adapters.entities import Run
@@ -126,6 +127,82 @@ def process_message_content(content, images_folder_path):
             f.write(image_data_binary)
     return None
 
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
+    )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
+
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(),
+            path=tmpdir,
+            metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+            )
+        )
+
+        update_run_response = client.update_evaluation_run(
+            name=upload_run_response.id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    'evaluationResultId': create_evaluation_result_response.id,
+                },
+                properties=properties,
+            )
+        )
+
+        return update_run_response.properties.get("AiStudioEvaluationUri")
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
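The new OneDP logging path serializes the instance-results DataFrame to JSONL (one record per line) and registers it through EvaluationServiceOneDPClient. A minimal sketch of just the serialization step, with an illustrative column name (real frames carry inputs.* and outputs.* columns):

    import pandas as pd

    instance_results = pd.DataFrame({"outputs.coherence.coherence": [4, 2]})
    # Mirrors the helper above: line_number lets the UI link to individual rows.
    instance_results["line_number"] = instance_results.index.values
    print(instance_results.to_json(orient="records", lines=True))
    # {"outputs.coherence.coherence":4,"line_number":0}
    # {"outputs.coherence.coherence":2,"line_number":1}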
@@ -133,6 +210,7 @@ def _log_metrics_and_instance_results(
     trace_destination: Optional[str],
     run: Optional[Run],
     evaluation_name: Optional[str],
+    name_map: Dict[str, str],
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -187,14 +265,14 @@ def _log_metrics_and_instance_results(
         # adding these properties to avoid showing traces if a dummy run is created.
         # We are doing that only for the pure evaluation runs.
         if run is None:
-            ev_run.write_properties_to_run_history(
-                properties={
+            properties = {
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
                 EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
             }
-            )
+            properties.update(_convert_name_map_into_property_entries(name_map))
+            ev_run.write_properties_to_run_history(properties=properties)
         else:
             ev_run.write_properties_to_run_history(
                 properties={
@@ -241,7 +319,7 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
 
     print(f'Evaluation results saved to "{p.resolve()}".\n')
 
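The ensure_ascii=False change keeps non-ASCII characters in the saved results file readable instead of \uXXXX-escaped. A quick stdlib illustration:

    import json

    data = {"response": "café"}
    print(json.dumps(data))                      # {"response": "caf\u00e9"}
    print(json.dumps(data, ensure_ascii=False))  # {"response": "café"}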
@@ -329,6 +407,41 @@ def set_event_loop_policy() -> None:
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
 
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i:i + w] for i in range(0, len(s), w)]
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if (num_segments > max_segments):
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
 
 class JSONLDataFileLoader:
     def __init__(self, filename: Union[os.PathLike, str]):
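The segmentation exists because run-history property values are length-capped, so the evaluator name map is JSON-encoded and chunked into numbered properties, with a count property recording how many segments were written (or -1 when the map was too large to store at all). A sketch of how a consumer could reassemble it; the two key arguments stand in for the EvaluationRunProperties.NAME_MAP and NAME_MAP_LENGTH constants, whose string values are defined in _constants.py and not shown in this diff:

    import json

    def reassemble_name_map(properties: dict, name_map_key: str, length_key: str) -> dict:
        """Rejoin the segmented name-map properties written by the helper above."""
        num_segments = properties[length_key]
        if num_segments == -1:
            return {}  # the map was too long to encode and was dropped
        joined = "".join(properties[f"{name_map_key}_{i}"] for i in range(num_segments))
        return json.loads(joined)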
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -33,7 +33,17 @@ class BleuScoreEvaluator(EvaluatorBase):
             :end-before: [END bleu_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call an BleuScoreEvaluator.
+            :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START bleu_score_evaluator]
+            :end-before: [END bleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
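For reference, a minimal call matching the shape of the examples the captions point at; BLEU is computed locally and needs no Azure project, and the exact result keys are an assumption since the sample files are not part of this diff:

    from azure.ai.evaluation import BleuScoreEvaluator

    bleu = BleuScoreEvaluator()
    result = bleu(response="Tokyo is the capital of Japan.",
                  ground_truth="The capital of Japan is Tokyo.")
    print(result)  # e.g. {"bleu_score": ...}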
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -62,7 +62,15 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :end-before: [END code_vulnerability_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START code_vulnerability_evaluator]
+            :end-before: [END code_vulnerability_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -31,7 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. admonition:: Example with Threshold:
 
@@ -40,7 +50,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_coherence_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and
+            :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
 
     .. note::
 
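A hedged sketch of the threshold usage the new caption describes; the model_config shape assumes an Azure OpenAI judge deployment:

    import os
    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
    }
    coherence = CoherenceEvaluator(model_config=model_config, threshold=2)
    result = coherence(query="What is the capital of Japan?",
                       response="The capital of Japan is Tokyo.")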
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -86,6 +86,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
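These class constants standardize the strings evaluators emit for binary results. A hedged sketch of the kind of score-to-result mapping they support; the base class's actual aggregation logic is not shown in this diff, and the import path is the private module from the file list:

    from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase

    def score_to_result(score: float, threshold: float, higher_is_better: bool) -> str:
        # Uses the constants added above on EvaluatorBase.
        passed = score >= threshold if higher_is_better else score <= threshold
        return EvaluatorBase._PASS_RESULT if passed else EvaluatorBase._FAIL_RESULT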
@@ -495,7 +499,8 @@ class AsyncEvaluatorBase:
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
         self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
-
+        tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
+        retrieved_documents=None,**kwargs
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
@@ -509,11 +514,15 @@ class AsyncEvaluatorBase:
             kwargs["context"] = context
         if ground_truth is not None:
             kwargs["ground_truth"] = ground_truth
-        if
-        kwargs["
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
         if tool_definitions is not None:
             kwargs["tool_definitions"] = tool_definitions
         if messages is not None:
             kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
 
         return await self._real_call(**kwargs)
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -4,9 +4,13 @@
 
 import math
 import re
+import os
 from typing import Dict, TypeVar, Union
 
-from promptflow.core._flow import AsyncPrompty
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
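This import toggle makes the in-package legacy prompty engine the default, with promptflow's AsyncPrompty available behind an opt-in environment variable. Because the branch runs at import time, the variable must be set before azure.ai.evaluation evaluator modules are imported:

    import os

    # Opt back into the promptflow implementation (the default is now the
    # legacy in-package engine).
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"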
@@ -39,13 +43,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """
 
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +67,8 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(source=
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
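The is_reasoning_model flag introduced above is read from kwargs and forwarded to AsyncPrompty.load, which per the docstring adjusts the prompty config parameters for reasoning models. A hedged sketch of opting in from a concrete prompty-based evaluator, assuming the subclass forwards **kwargs to PromptyEvaluatorBase:

    from azure.ai.evaluation import CoherenceEvaluator

    # is_reasoning_model is described as in preview; model_config is assumed
    # to point at a reasoning-capable deployment (same shape as the earlier sketch).
    coherence = CoherenceEvaluator(model_config=model_config, is_reasoning_model=True)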
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -12,7 +12,7 @@ from azure.ai.evaluation._common.constants import (
     _InternalAnnotationTasks,
 )
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._constants import _AggregationType
@@ -50,7 +50,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     def __init__(
         self,
         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        azure_ai_project: dict,
+        azure_ai_project: Union[dict, str],
         credential: TokenCredential,
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
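Widening azure_ai_project to Union[dict, str] is what lets every RaiServiceEvaluatorBase subclass accept either the classic scoped-project dict or the new project URL that the updated docstrings advertise. A hedged sketch of both forms, using ViolenceEvaluator as a representative subclass:

    from azure.ai.evaluation import ViolenceEvaluator
    from azure.identity import DefaultAzureCredential

    credential = DefaultAzureCredential()

    # Classic AzureAIProject dict
    violence = ViolenceEvaluator(credential=credential, azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    })

    # New project URL, in the format the docstrings in this diff describe
    violence = ViolenceEvaluator(
        credential=credential,
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )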
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -44,7 +44,17 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call ContentSafetyEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ContentSafetyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
     .. admonition:: Example with Threshold:
 
@@ -53,7 +63,7 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :end-before: [END threshold_content_safety_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a ContentSafetyEvaluator.
+            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
     """
 
     id = "content_safety"
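A hedged end-to-end sketch matching the new docstring example; the project URL is a placeholder in the format shown above:

    from azure.ai.evaluation import ContentSafetyEvaluator
    from azure.identity import DefaultAzureCredential

    safety = ContentSafetyEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )
    result = safety(query="What is the capital of France?", response="Paris.")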
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -58,16 +58,26 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
-
-    .. admonition:: Example
+            :caption: Initialize and call HateUnfairnessEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START hate_unfairness_evaluator]
+            :end-before: [END hate_unfairness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call HateUnfairnessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
 
+    .. admonition:: Example with Threshold:
+
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_hate_unfairness_evaluator]
             :end-before: [END threshold_hate_unfairness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize with threshold and call a HateUnfairnessEvaluator.
+            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
     """
 
     id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -52,16 +52,17 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
-
-    .. admonition:: Example
-
-        .. literalinclude:: ../samples/
-            :start-after: [START
-            :end-before: [END
+            :caption: Initialize and call SelfHarmEvaluator using azure.ai.evaluation.AzureAIProject.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START self_harm_evaluator]
+            :end-before: [END self_harm_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize
+            :caption: Initialize and call SelfHarmEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
     """
 
     id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -56,6 +56,16 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a SexualEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START sexual_evaluator]
+            :end-before: [END sexual_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SexualEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -56,6 +56,16 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a ViolenceEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START violence_evaluator]
+            :end-before: [END violence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ViolenceEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py (new file)

@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
+
+__all__ = [
+    "DocumentRetrievalEvaluator",
+    "RetrievalGroundTruthDocument",
+    "RetrievedDocument"
+]
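A hedged usage sketch for the newly exported evaluator, assuming it is re-exported from the package root (the azure/ai/evaluation/__init__.py change in the file list suggests so). The input names retrieval_ground_truth and retrieved_documents come from the AsyncEvaluatorBase.__call__ changes earlier in this diff; the per-document field names are assumptions, since _document_retrieval.py itself is not reproduced here:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    doc_retrieval = DocumentRetrievalEvaluator()
    result = doc_retrieval(
        retrieval_ground_truth=[{"document_id": "doc-1", "query_relevance_label": 3}],
        retrieved_documents=[{"document_id": "doc-1", "relevance_score": 0.9}],
    )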