azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +9 -16
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +159 -29
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +80 -2
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +120 -7
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py
@@ -10,9 +10,10 @@ from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Union, cast
 import uuid
 import base64
+import math
 
 import pandas as pd
-from
+from azure.ai.evaluation._legacy._adapters.entities import Run
 
 from azure.ai.evaluation._constants import (
     DEFAULT_EVALUATION_RESULTS_FILE_NAME,
@@ -46,7 +47,7 @@ def is_none(value) -> bool:
 def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long
     trace_provider: str,
 ) -> AzureMLWorkspace:
-    from
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
 
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
@@ -126,13 +127,90 @@ def process_message_content(content, images_folder_path):
         f.write(image_data_binary)
     return None
 
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
+    )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
+
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(),
+            path=tmpdir,
+            metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+            )
+        )
+
+        update_run_response = client.update_evaluation_run(
+            name=upload_run_response.id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    'evaluationResultId': create_evaluation_result_response.id,
+                },
+                properties=properties,
+            )
+        )
+
+    return update_run_response.properties.get("AiStudioEvaluationUri")
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
     trace_destination: Optional[str],
-    run: Run,
+    run: Optional[Run],
     evaluation_name: Optional[str],
+    name_map: Dict[str, str],
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -187,14 +265,14 @@ def _log_metrics_and_instance_results(
         # adding these properties to avoid showing traces if a dummy run is created.
         # We are doing that only for the pure evaluation runs.
         if run is None:
-
-            properties={
+            properties = {
                 EvaluationRunProperties.RUN_TYPE: "eval_run",
                 EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                 EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                 "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
             }
-            )
+            properties.update(_convert_name_map_into_property_entries(name_map))
+            ev_run.write_properties_to_run_history(properties=properties)
         else:
             ev_run.write_properties_to_run_history(
                 properties={
@@ -241,7 +319,7 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
 
     print(f'Evaluation results saved to "{p.resolve()}".\n')
 
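The `ensure_ascii=False` change means non-ASCII characters in saved evaluation results are written as UTF-8 text rather than as `\uXXXX` escapes. A quick standard-library illustration of the difference:

    import json

    json.dumps({"response": "café"})                      # '{"response": "caf\\u00e9"}'
    json.dumps({"response": "café"}, ensure_ascii=False)  # '{"response": "café"}'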
@@ -329,6 +407,41 @@ def set_event_loop_policy() -> None:
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
 
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i:i + w] for i in range(0, len(s), w)]
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if (num_segments > max_segments):
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
 
 class JSONLDataFileLoader:
     def __init__(self, filename: Union[os.PathLike, str]):
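The new `_convert_name_map_into_property_entries` helper exists because run-history property values are length-limited: the JSON-serialized name map is sliced into fixed-width segments, each stored under its own property key, with a companion length entry recording the segment count (or -1 when even segmentation will not fit). A minimal standalone sketch of the same chunking; the literal property-key strings here are placeholders, not the package's actual `EvaluationRunProperties` constants:

    import json
    import math

    def segment_name_map(name_map: dict, segment_length: int = 950, max_segments: int = 10) -> dict:
        serialized = json.dumps(name_map)
        num_segments = math.ceil(len(serialized) / segment_length)
        if num_segments > max_segments:
            # Too long even when segmented; record -1 so the consumer knows we gave up.
            return {"name_map_length": -1}
        result = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = serialized[i * segment_length:(i + 1) * segment_length]
        return result

    # A ~2000-character serialized map yields three segments at the default width:
    # {"name_map_length": 3, "name_map_0": ..., "name_map_1": ..., "name_map_2": ...}

A consumer can rebuild the map by concatenating the segments in index order and calling `json.loads` on the result.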
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -6,7 +6,7 @@ import inspect
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
-from
+from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -495,7 +495,8 @@ class AsyncEvaluatorBase:
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
     async def __call__(
         self, *, query=None, response=None, context=None, conversation=None, ground_truth=None,
-
+        tool_calls=None, tool_definitions=None, messages=None, retrieval_ground_truth=None,
+        retrieved_documents=None,**kwargs
     ):
         if conversation is not None:
             kwargs["conversation"] = conversation
@@ -509,11 +510,15 @@ class AsyncEvaluatorBase:
             kwargs["context"] = context
         if ground_truth is not None:
             kwargs["ground_truth"] = ground_truth
-        if
-        kwargs["
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
         if tool_definitions is not None:
             kwargs["tool_definitions"] = tool_definitions
         if messages is not None:
             kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
 
         return await self._real_call(**kwargs)
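The widened keyword list lets the async wrapper forward agent-specific inputs (tool calls, tool definitions, retrieval inputs) to evaluators that accept them instead of dropping them. A hedged sketch of how one such evaluator from this package is typically called; the payload shapes follow OpenAI-style tool-call records and are illustrative rather than a documented contract:

    from azure.ai.evaluation import ToolCallAccuracyEvaluator

    # model_config is an AzureOpenAIModelConfiguration-style dict; values are placeholders.
    evaluator = ToolCallAccuracyEvaluator(model_config=model_config)
    result = evaluator(
        query="What is the weather in Seattle?",
        tool_calls={
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "fetch_weather",
            "arguments": {"location": "Seattle"},
        },
        tool_definitions={
            "name": "fetch_weather",
            "description": "Fetches weather information for a location.",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        },
    )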
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py
@@ -4,7 +4,7 @@
 from concurrent.futures import as_completed
 from typing import TypeVar, Dict, List
 
-from
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,9 +4,13 @@
 
 import math
 import re
+import os
 from typing import Dict, TypeVar, Union
 
-
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
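The prompty backend is now selected at import time: by default the evaluators use the in-package port under `azure.ai.evaluation._legacy.prompty`, and setting `AI_EVALS_USE_PF_PROMPTY=true` restores promptflow's `AsyncPrompty`. Because the check runs when this module is first imported, the variable must be set before `azure.ai.evaluation` is imported:

    import os

    # Opt back into promptflow's AsyncPrompty; must happen before the
    # evaluation package (and thus _base_prompty_eval) is imported.
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

    from azure.ai.evaluation import FluencyEvaluator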
@@ -39,13 +43,17 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """
 
     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False,
+                 threshold: int = 3, _higher_is_better: bool = False, **kwargs) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
         self._threshold = threshold
         self._higher_is_better = _higher_is_better
@@ -59,7 +67,8 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
             user_agent,
         )
 
-        self._flow = AsyncPrompty.load(source=
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config,
+                                       is_reasoning_model=self._is_reasoning_model)
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
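With this change, `is_reasoning_model` flows from the evaluator constructor through `**kwargs` into `AsyncPrompty.load`, letting the prompty config be adjusted for reasoning-model deployments. A hedged usage sketch, assuming the built-in prompty-based evaluators forward `**kwargs` to this base class (the flag is marked preview in the docstring above):

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder endpoint
        "azure_deployment": "<reasoning-model-deployment>",            # e.g. an o-series deployment
        "api_key": "<api-key>",
    }
    evaluator = GroundednessEvaluator(model_config=model_config, is_reasoning_model=True)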
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -12,7 +12,7 @@ from azure.ai.evaluation._common.constants import (
     _InternalAnnotationTasks,
 )
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._constants import _AggregationType
@@ -50,7 +50,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     def __init__(
         self,
         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        azure_ai_project: dict,
+        azure_ai_project: Union[dict, str],
         credential: TokenCredential,
         eval_last_turn: bool = False,
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
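Widening `azure_ai_project` to `Union[dict, str]` is what lets the RAI-service evaluators target either a hub-based project (the classic workspace triad) or a Foundry (OneDP) project endpoint; the `is_onedp_project` helper imported above distinguishes the two. An illustration of the two accepted shapes (the URL is an example format, not a guaranteed contract):

    # Hub-based project: the classic triad dictionary.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # OneDP / Foundry project: a single endpoint URL string.
    azure_ai_project = "https://<resource>.services.ai.azure.com/api/projects/<project>"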
azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py (new file)
@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._document_retrieval import DocumentRetrievalEvaluator, RetrievalGroundTruthDocument, RetrievedDocument
+
+__all__ = [
+    "DocumentRetrievalEvaluator",
+    "RetrievalGroundTruthDocument",
+    "RetrievedDocument"
+]
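The new sub-package exposes `DocumentRetrievalEvaluator` along with typed inputs for ground-truth and retrieved documents, which pair with the `retrieval_ground_truth` and `retrieved_documents` keywords added to `AsyncEvaluatorBase.__call__` above. A hedged sketch of its use; the constructor parameters and field names shown are assumptions drawn from the typed-dict names and should be checked against the shipped docs:

    from azure.ai.evaluation import DocumentRetrievalEvaluator

    # Assumed constructor parameters bounding the ground-truth relevance labels.
    evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=4)
    result = evaluator(
        retrieval_ground_truth=[
            {"document_id": "doc-1", "query_relevance_label": 4},  # field names assumed
            {"document_id": "doc-2", "query_relevance_label": 1},
        ],
        retrieved_documents=[
            {"document_id": "doc-1", "relevance_score": 1.8},      # field names assumed
            {"document_id": "doc-2", "relevance_score": 0.3},
        ],
    )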