azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,21 @@
- # coding=utf-8
- # --------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License. See License.txt in the project root for license information.
- # --------------------------------------------------------------------------
- """Customize generated code here.
-
- Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
- """
- from typing import List
-
- __all__: List[str] = []  # Add all objects you want publicly available to users at this package level
-
-
- def patch_sdk():
-     """Do not remove from this file.
-
-     `patch_sdk` is a last resort escape hatch that allows you to do customizations
-     you can't accomplish using the techniques described in
-     https://aka.ms/azsdk/python/dpcodegen/python/customize
-     """
+ # coding=utf-8
+ # --------------------------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # Licensed under the MIT License. See License.txt in the project root for license information.
+ # --------------------------------------------------------------------------
+ """Customize generated code here.
+
+ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
+ """
+ from typing import List
+
+ __all__: List[str] = []  # Add all objects you want publicly available to users at this package level
+
+
+ def patch_sdk():
+     """Do not remove from this file.
+
+     `patch_sdk` is a last resort escape hatch that allows you to do customizations
+     you can't accomplish using the techniques described in
+     https://aka.ms/azsdk/python/dpcodegen/python/customize
+     """
@@ -21,10 +21,11 @@ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
  from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._common.utils import is_onedp_project
  from azure.core.credentials import TokenCredential
  from azure.core.exceptions import HttpResponseError
- from azure.core.pipeline.policies import AsyncRetryPolicy
+ from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy

  from .constants import (
      CommonConstants,
@@ -35,20 +36,16 @@ from .constants import (
  )
  from .utils import get_harm_severity_level, retrieve_content_type

- try:
-     version = importlib.metadata.version("azure-ai-evaluation")
- except importlib.metadata.PackageNotFoundError:
-     version = "unknown"
- USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

  USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
      "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
  }
- ML_WORKSPACE = "https://management.azure.com/.default"
+ ML_WORKSPACE = "https://management.azure.com/.default"
  COG_SRV_WORKSPACE = "https://ai.azure.com/.default"

  INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"

+
  def get_formatted_template(data: dict, annotation_task: str) -> str:
      """Given the task and input data, produce a formatted string that will serve as the main
      payload for the RAI service. Requires specific per-task logic.
@@ -71,16 +68,13 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
          }
          return json.dumps(as_dict)
      if annotation_task == Tasks.CODE_VULNERABILITY:
-         as_dict = {
-             "context": data.get("query", ""),
-             "completion": data.get("response", "")
-         }
+         as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
          return json.dumps(as_dict)
      if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
          as_dict = {
              "query": data.get("query", ""),
              "response": data.get("response", ""),
-             "context": data.get("context", "")
+             "context": data.get("context", ""),
          }
          return json.dumps(as_dict)
      as_dict = {
@@ -101,7 +95,11 @@ def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict
      :return: The common headers.
      :rtype: Dict
      """
-     user_agent = f"{USER_AGENT} (type=evaluator; subtype={evaluator_name})" if evaluator_name else USER_AGENT
+     user_agent = (
+         f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+         if evaluator_name
+         else UserAgentSingleton().value
+     )
      return {
          "Authorization": f"Bearer {token}",
          "User-Agent": user_agent,
@@ -113,7 +111,10 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
          retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
      )

- async def ensure_service_availability_onedp(client: AIProjectClient, token: str, capability: Optional[str] = None) -> None:
+
+ async def ensure_service_availability_onedp(
+     client: AIProjectClient, token: str, capability: Optional[str] = None
+ ) -> None:
      """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

      :param client: The AI project client.
@@ -126,7 +127,7 @@ async def ensure_service_availability_onedp(client: AIProjectClient, token: str,
      """
      headers = get_common_headers(token)
      capabilities = client.evaluations.check_annotation(headers=headers)
-
+
      if capability and capability not in capabilities:
          msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
          raise EvaluationException(
@@ -137,7 +138,8 @@ async def ensure_service_availability_onedp(client: AIProjectClient, token: str,
              blame=ErrorBlame.USER_ERROR,
              tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
          )
-
+
+
  async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
      """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

@@ -257,12 +259,13 @@


  async def submit_request_onedp(
-     client: AIProjectClient,
-     data: dict,
-     metric: str,
-     token: str,
-     annotation_task: str,
-     evaluator_name: str
+     client: AIProjectClient,
+     data: dict,
+     metric: str,
+     token: str,
+     annotation_task: str,
+     evaluator_name: str,
+     scan_session_id: Optional[str] = None,
  ) -> str:
      """Submit request to Responsible AI service for evaluation and return operation ID

@@ -278,12 +281,16 @@
      :type annotation_task: str
      :param evaluator_name: The evaluator name.
      :type evaluator_name: str
+     :param scan_session_id: The scan session ID to use for the evaluation.
+     :type scan_session_id: Optional[str]
      :return: The operation ID.
      :rtype: str
      """
      normalized_user_text = get_formatted_template(data, annotation_task)
      payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
      headers = get_common_headers(token, evaluator_name)
+     if scan_session_id:
+         headers["client_request_id"] = scan_session_id
      response = client.evaluations.submit_annotation(payload, headers=headers)
      result = json.loads(response)
      operation_id = result["location"].split("/")[-1]
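
Note: when the new scan_session_id argument is supplied, it travels to the service as a client_request_id header, so annotation calls can be correlated with a red-team scan run. A hedged sketch of the header assembly (common_headers below is a simplified stand-in for the real get_common_headers):

    import uuid
    from typing import Dict, Optional

    def common_headers(token: str) -> Dict[str, str]:
        # Stand-in for get_common_headers(token, evaluator_name).
        return {"Authorization": f"Bearer {token}"}

    def submit_headers(token: str, scan_session_id: Optional[str] = None) -> Dict[str, str]:
        headers = common_headers(token)
        if scan_session_id:
            # Mirrors the hunk above: only set when a scan session is active.
            headers["client_request_id"] = scan_session_id
        return headers

    print(submit_headers("token", str(uuid.uuid4())))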
@@ -326,6 +333,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
          sleep_time = RAIService.SLEEP_TIME**request_count
          await asyncio.sleep(sleep_time)

+
  async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
      """Fetch the annotation result from Responsible AI service

@@ -349,11 +357,14 @@
          request_count += 1
          time_elapsed = time.time() - start
          if time_elapsed > RAIService.TIMEOUT:
-             raise TimeoutError(f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds")
+             raise TimeoutError(
+                 f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+             )

          sleep_time = RAIService.SLEEP_TIME**request_count
          await asyncio.sleep(sleep_time)

+
  def parse_response(  # pylint: disable=too-many-branches,too-many-statements
      batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
  ) -> Dict[str, Union[str, float]]:
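
Note: both fetch_result variants poll with exponential backoff, sleeping RAIService.SLEEP_TIME ** request_count seconds between attempts and aborting once elapsed time exceeds RAIService.TIMEOUT. A toy version of that loop; the values 2 and 300 are illustrative assumptions, not the library's actual constants:

    import time

    SLEEP_TIME = 2   # assumed stand-in for RAIService.SLEEP_TIME
    TIMEOUT = 300    # assumed stand-in for RAIService.TIMEOUT

    def poll(fetch_once):
        """Call fetch_once() until it returns a result or the deadline passes."""
        start = time.time()
        request_count = 0
        while True:
            result = fetch_once()
            if result is not None:
                return result
            request_count += 1
            time_elapsed = time.time() - start
            if time_elapsed > TIMEOUT:
                raise TimeoutError(
                    f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
                )
            # Backoff grows geometrically: 2, 4, 8, ... seconds.
            time.sleep(SLEEP_TIME**request_count)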
@@ -382,10 +393,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
      result = {}
      if not batch_response or len(batch_response[0]) == 0:
          return {}
-     if metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]:
-         batch_response[0] = {
-             EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
-         }
+     if (
+         metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+         and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+     ):
+         batch_response[0] = {
+             EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+         }
      if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
          pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
          for pm_metric_name in pm_metric_names:
@@ -421,7 +435,10 @@
          result[metric_display_name + "_information_gathering"] = (
              parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
          )
-     if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+     if (
+         metric_name == EvaluationMetrics.CODE_VULNERABILITY
+         or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+     ):
          # Add all attributes under the details.
          details = {}
          for key, value in parsed_response.items():
@@ -587,7 +604,9 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
      return rai_url


- async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE) -> str:
+ async def fetch_or_reuse_token(
+     credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+ ) -> str:
      """Get token. Fetch a new token if the current token is near expiry

      :param credential: The Azure authentication credential.
@@ -622,6 +641,7 @@ async def evaluate_with_rai_service(
      annotation_task: str = Tasks.CONTENT_HARM,
      metric_display_name=None,
      evaluator_name=None,
+     scan_session_id: Optional[str] = None,
  ) -> Dict[str, Union[str, float]]:
      """Evaluate the content safety of the response using Responsible AI service

@@ -629,8 +649,8 @@
      :type data: dict
      :param metric_name: The evaluation metric to use.
      :type metric_name: str
-     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
-         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
      :type project_scope: Union[str, AzureAIProject]
      :param credential: The Azure authentication credential.
      :type credential: ~azure.core.credentials.TokenCredential
@@ -640,15 +660,23 @@
      :type metric_display_name: str
      :param evaluator_name: The evaluator name to use.
      :type evaluator_name: str
+     :param scan_session_id: The scan session ID to use for the evaluation.
+     :type scan_session_id: Optional[str]
      :return: The parsed annotation result.
      :rtype: Dict[str, Union[str, float]]
      """

      if is_onedp_project(project_scope):
-         client = AIProjectClient(endpoint=project_scope, credential=credential)
+         client = AIProjectClient(
+             endpoint=project_scope,
+             credential=credential,
+             user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+         )
          token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
          await ensure_service_availability_onedp(client, token, annotation_task)
-         operation_id = await submit_request_onedp(client, data, metric_name, token, annotation_task, evaluator_name)
+         operation_id = await submit_request_onedp(
+             client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
+         )
          annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
          result = parse_response(annotation_response, metric_name, metric_display_name)
          return result
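
Note: on the one-DP (project endpoint) path the client now carries an explicit UserAgentPolicy, so every pipeline request reports the evaluation SDK's user agent. A sketch of that construction with a placeholder endpoint and an assumed base user-agent string (AIProjectClient here is the vendored client from azure.ai.evaluation._common.onedp, so its construction is left commented):

    from azure.core.pipeline.policies import UserAgentPolicy
    from azure.identity import DefaultAzureCredential

    endpoint = "https://<resource>.services.ai.azure.com/api/projects/<project>"  # placeholder
    policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.9.0")  # assumed value
    # client = AIProjectClient(endpoint=endpoint,
    #                          credential=DefaultAzureCredential(),
    #                          user_agent_policy=policy)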
@@ -665,6 +693,7 @@

      return result

+
  def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
      """Generate the payload for the annotation request
      :param content_type: The type of the content representing multimodal or images.
@@ -696,6 +725,7 @@ def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dic
          "AnnotationTask": task,
      }

+
  async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
      """Submit request to Responsible AI service for evaluation and return operation ID
      :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
@@ -741,8 +771,9 @@
      operation_id = result["location"].split("/")[-1]
      return operation_id

+
  async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
-
+
      # handle inference sdk strongly type messages
      if len(messages) > 0 and not isinstance(messages[0], dict):
          try:
@@ -758,37 +789,42 @@
      ## fetch system and assistant messages from the list of messages
      filtered_messages = [message for message in messages if message["role"] != "system"]
      assistant_messages = [message for message in messages if message["role"] == "assistant"]
-
+
      ## prepare for request
      content_type = retrieve_content_type(assistant_messages, metric)
      payload = generate_payload_multimodal(content_type, filtered_messages, metric)
      headers = get_common_headers(token)
-
+
      response = client.evaluations.submit_annotation(payload, headers=headers)
-
+
      result = json.loads(response)
      operation_id = result["location"].split("/")[-1]
      return operation_id

+
  async def evaluate_with_rai_service_multimodal(
      messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
  ):
      """ "Evaluate the content safety of the response using Responsible AI service
-     :param messages: The normalized list of messages.
-     :type messages: str
-     :param metric_name: The evaluation metric to use.
-     :type metric_name: str
-     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
-         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
-     :type project_scope: Union[str, AzureAIProject]
-     :param credential: The Azure authentication credential.
-     :type credential: ~azure.core.credentials.TokenCredential
-     :return: The parsed annotation result.
-     :rtype: List[List[Dict]]
+     :param messages: The normalized list of messages.
+     :type messages: str
+     :param metric_name: The evaluation metric to use.
+     :type metric_name: str
+     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :type project_scope: Union[str, AzureAIProject]
+     :param credential: The Azure authentication credential.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: The parsed annotation result.
+     :rtype: List[List[Dict]]
      """

      if is_onedp_project(project_scope):
-         client = AIProjectClient(endpoint=project_scope, credential=credential)
+         client = AIProjectClient(
+             endpoint=project_scope,
+             credential=credential,
+             user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+         )
          token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
          await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
          operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
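
Note: the multimodal path strips system messages from the payload but keys the content type off the assistant messages. A small sketch of that filtering on a toy conversation:

    # Mirrors the role filtering in submit_multimodal_request_onedp above.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [{"type": "text", "text": "Describe this image."}]},
        {"role": "assistant", "content": [{"type": "text", "text": "It shows a cat."}]},
    ]

    filtered_messages = [m for m in messages if m["role"] != "system"]
    assistant_messages = [m for m in messages if m["role"] == "assistant"]

    print(len(filtered_messages), len(assistant_messages))  # 2 1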
@@ -803,4 +839,4 @@ async def evaluate_with_rai_service_multimodal(
      operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
      annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
      result = parse_response(annotation_response, metric_name)
-     return result
+     return result
@@ -24,7 +24,7 @@ except ImportError:
      _patch_all = []
  from ._patch import patch_sdk as _patch_sdk

- # Export GeneratedRAIClient as alias of MachineLearningServicesClient for backward compatibility
+ # Export GeneratedRAIClient as alias of MachineLearningServicesClient for backward compatibility

  __all__ = [
      "MachineLearningServicesClient",
@@ -112,7 +112,12 @@ def build_rai_svc_get_jail_break_dataset_with_type_request( # pylint: disable=n


  def build_rai_svc_get_attack_objectives_request(  # pylint: disable=name-too-long
-     *, risk_types: Optional[List[str]] = None, lang: Optional[str] = None, strategy: Optional[str] = None, **kwargs: Any
+     *,
+     risk_types: Optional[List[str]] = None,
+     risk_categories: Optional[List[str]] = None,
+     lang: Optional[str] = None,
+     strategy: Optional[str] = None,
+     **kwargs: Any
  ) -> HttpRequest:
      _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
      _params = case_insensitive_dict(kwargs.pop("params", {}) or {})
@@ -127,6 +132,10 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
      _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str")
      if risk_types is not None:
          _params["riskTypes"] = [_SERIALIZER.query("risk_types", q, "str") if q is not None else "" for q in risk_types]
+     if risk_categories is not None:
+         _params["riskCategory"] = [
+             _SERIALIZER.query("risk_categories", q, "str") if q is not None else "" for q in risk_categories
+         ]
      if lang is not None:
          _params["lang"] = _SERIALIZER.query("lang", lang, "str")
      if strategy is not None:
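
Note: each entry in risk_categories serializes to a repeated riskCategory query parameter next to the existing riskTypes, lang, and strategy parameters. Illustratively (the api-version value below is an assumption, not the service's actual version):

    from urllib.parse import urlencode

    params = [
        ("api-version", "2023-11-01-preview"),  # assumed version
        ("riskCategory", "violence"),
        ("strategy", "baseline"),
    ]
    print(urlencode(params))
    # api-version=2023-11-01-preview&riskCategory=violence&strategy=baseline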
@@ -573,6 +582,7 @@ class RAISvcOperations:
      def get_attack_objectives(
          self,
          *,
+         risk_category: str,
          risk_types: Optional[List[str]] = None,
          lang: Optional[str] = None,
          strategy: Optional[str] = None,
@@ -580,6 +590,8 @@
      ) -> List[_models.AttackObjective]:
          """Get the attack objectives.

+         :keyword risk_category: Risk category for the attack objectives. Required.
+         :paramtype risk_category: str
          :keyword risk_types: Risk types for the attack objectives dataset. Default value is None.
          :paramtype risk_types: list[str]
          :keyword lang: The language for the attack objectives dataset, defaults to 'en'. Default value
@@ -605,6 +617,7 @@
          cls: ClsType[List[_models.AttackObjective]] = kwargs.pop("cls", None)

          _request = build_rai_svc_get_attack_objectives_request(
+             risk_categories=[risk_category],
              risk_types=risk_types,
              lang=lang,
              strategy=strategy,
@@ -13,7 +13,7 @@ from azure.storage.blob import ContainerClient
  from typing_extensions import NotRequired, Required, TypeGuard
  from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
  from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import (
      AzureAIProject,
      AzureOpenAIModelConfiguration,
@@ -126,6 +126,7 @@ def construct_prompty_model_config(

      return prompty_model_config

+
  def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
      """Check if the Azure AI project is an OneDP project.

@@ -138,6 +139,7 @@ def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
          return True
      return False

+
  def validate_azure_ai_project(o: object) -> AzureAIProject:
      fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}

@@ -291,7 +293,8 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:

      return cast(T_TypedDict, o)

- def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+
+ def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
      """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].

      :param score: The score to check.
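
Note: per the signature and docstring above, check_score_is_valid accepts a string or number and tests that it parses to a value within [min_score, max_score] (1-5 by default). A usage sketch; the module path is the private one shown in this diff, and the exact edge-case behavior is the library's to define:

    from azure.ai.evaluation._common.utils import check_score_is_valid

    print(check_score_is_valid("4"))       # True: parses to 4.0, inside [1, 5]
    print(check_score_is_valid(7))         # False: outside the default range
    print(check_score_is_valid("n/a"))     # False: not convertible to a number
    print(check_score_is_valid(8, 0, 10))  # True: custom range [0, 10]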
@@ -310,6 +313,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5)

      return min_score <= numeric_score <= max_score

+
  def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
      """Parse the output of prompt-based quality evaluators that return a score and reason.

@@ -481,6 +485,123 @@ def validate_conversation(conversation):
          ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
      )

+
+ def _extract_text_from_content(content):
+     text = []
+     for msg in content:
+         if "text" in msg:
+             text.append(msg["text"])
+     return text
+
+
+ def _get_conversation_history(query):
+     all_user_queries = []
+     cur_user_query = []
+     all_agent_responses = []
+     cur_agent_response = []
+     for msg in query:
+         if not "role" in msg:
+             continue
+         if msg["role"] == "user" and "content" in msg:
+             if cur_agent_response != []:
+                 all_agent_responses.append(cur_agent_response)
+                 cur_agent_response = []
+             text_in_msg = _extract_text_from_content(msg["content"])
+             if text_in_msg:
+                 cur_user_query.append(text_in_msg)
+
+         if msg["role"] == "assistant" and "content" in msg:
+             if cur_user_query != []:
+                 all_user_queries.append(cur_user_query)
+                 cur_user_query = []
+             text_in_msg = _extract_text_from_content(msg["content"])
+             if text_in_msg:
+                 cur_agent_response.append(text_in_msg)
+     if cur_user_query != []:
+         all_user_queries.append(cur_user_query)
+     if cur_agent_response != []:
+         all_agent_responses.append(cur_agent_response)
+
+     if len(all_user_queries) != len(all_agent_responses) + 1:
+         raise EvaluationException(
+             message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+             internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+             target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.USER_ERROR,
+         )
+
+     return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+
+
+ def _pretty_format_conversation_history(conversation_history):
+     """Formats the conversation history for better readability."""
+     formatted_history = ""
+     for i, (user_query, agent_response) in enumerate(
+         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
+     ):
+         formatted_history += f"User turn {i+1}:\n"
+         for msg in user_query:
+             formatted_history += " " + "\n ".join(msg)
+         formatted_history += "\n\n"
+         if agent_response:
+             formatted_history += f"Agent turn {i+1}:\n"
+             for msg in agent_response:
+                 formatted_history += " " + "\n ".join(msg)
+             formatted_history += "\n\n"
+     return formatted_history
+
+
+ def reformat_conversation_history(query, logger=None):
+     """Reformats the conversation history to a more compact representation."""
+     try:
+         conversation_history = _get_conversation_history(query)
+         return _pretty_format_conversation_history(conversation_history)
+     except:
+         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
+         # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
+         # From our tests the negative impact on IntentResolution is:
+         # Higher intra model variance (0.142 vs 0.046)
+         # Higher inter model variance (0.345 vs 0.607)
+         # Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+         # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+         if logger:
+             logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+         return query
+
+
+ def _get_agent_response(agent_response_msgs):
+     """Extracts the text from the agent response content."""
+     agent_response_text = []
+     for msg in agent_response_msgs:
+         if "role" in msg and msg["role"] == "assistant" and "content" in msg:
+             text = _extract_text_from_content(msg["content"])
+             if text:
+                 agent_response_text.extend(text)
+     return agent_response_text
+
+
+ def reformat_agent_response(response, logger=None):
+     try:
+         if response is None or response == []:
+             return ""
+         agent_response = _get_agent_response(response)
+         if agent_response == []:
+             # If no message could be extracted, likely the format changed, fallback to the original response in that case
+             if logger:
+                 logger.warning(
+                     f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
+                 )
+             return response
+         return "\n".join(agent_response)
+     except:
+         # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
+         # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+         if logger:
+             logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+         return response
+
+
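
Note: the helpers added above collapse an agent message list into alternating "User turn N" / "Agent turn N" text blocks, and fall back to the raw input whenever parsing fails (with the measured accuracy cost documented in the comments). A toy input/output sketch of the expected behavior:

    query = [
        {"role": "user", "content": [{"type": "text", "text": "What is 2 + 2?"}]},
        {"role": "assistant", "content": [{"type": "text", "text": "4"}]},
        {"role": "user", "content": [{"type": "text", "text": "And times 3?"}]},
    ]
    # _get_conversation_history(query) returns
    #   {"user_queries": [[["What is 2 + 2?"]], [["And times 3?"]]],
    #    "agent_responses": [[["4"]]]}
    # (two user turns, one agent turn, satisfying the N == M + 1 check), and
    # _pretty_format_conversation_history renders roughly:
    #   User turn 1:
    #    What is 2 + 2?
    #   Agent turn 1:
    #    4
    #   User turn 2:
    #    And times 3?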
  def upload(path: str, container_client: ContainerClient, logger=None):
      """Upload files or directories to Azure Blob Storage using a container client.

@@ -509,7 +630,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
      local_paths = []

      if os.path.isdir(path):
-         for (root, _, filenames) in os.walk(path):
+         for root, _, filenames in os.walk(path):
              upload_path = ""
              if root != path:
                  rel_path = os.path.relpath(root, path)
@@ -81,6 +81,7 @@ class _AggregationType(enum.Enum):
      SUM = "sum"
      CUSTOM = "custom"

+
  class TokenScope(str, enum.Enum):
      """Defines the scope of the token used to access Azure resources."""

@@ -114,4 +115,4 @@

  AOAI_COLUMN_NAME = "aoai"
  DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
- DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
+ DEFAULT_AOAI_API_VERSION = "2025-04-01-preview"  # Unfortunately relying on preview version for now.
@@ -1,3 +1,3 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
+ # ---------------------------------------------------------