azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +85 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +147 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +87 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
- azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
- azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# ------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation.
|
|
3
|
+
# Licensed under the MIT License.
|
|
4
|
+
# ------------------------------------
|
|
5
|
+
"""Customize generated code here.
|
|
6
|
+
|
|
7
|
+
Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
|
|
8
|
+
"""
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
__all__: List[str] = [] # Add all objects you want publicly available to users at this package level
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def patch_sdk():
|
|
15
|
+
"""Do not remove from this file.
|
|
16
|
+
|
|
17
|
+
`patch_sdk` is a last resort escape hatch that allows you to do customizations
|
|
18
|
+
you can't accomplish using the techniques described in
|
|
19
|
+
https://aka.ms/azsdk/python/dpcodegen/python/customize
|
|
20
|
+
"""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Marker file for PEP 561.
|
|
@@ -1,17 +1,19 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
|
|
4
|
+
import os
|
|
5
|
+
import posixpath
|
|
5
6
|
import re
|
|
6
7
|
import math
|
|
7
8
|
import threading
|
|
8
|
-
from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
|
|
9
|
+
from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
|
|
9
10
|
|
|
10
11
|
import nltk
|
|
11
|
-
from
|
|
12
|
-
from
|
|
12
|
+
from azure.storage.blob import ContainerClient
|
|
13
|
+
from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
|
|
14
|
+
from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
|
|
13
15
|
from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
|
|
14
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
16
|
+
from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
15
17
|
from azure.ai.evaluation._model_configurations import (
|
|
16
18
|
AzureAIProject,
|
|
17
19
|
AzureOpenAIModelConfiguration,
|
|
@@ -19,34 +21,122 @@ from azure.ai.evaluation._model_configurations import (
|
|
|
19
21
|
)
|
|
20
22
|
|
|
21
23
|
from . import constants
|
|
24
|
+
from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
|
|
22
25
|
|
|
23
26
|
_nltk_data_download_lock = threading.Lock()
|
|
24
27
|
|
|
25
28
|
T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
|
|
26
29
|
|
|
27
30
|
|
|
28
|
-
def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
|
|
31
|
+
def get_harm_severity_level(harm_score: Union[float, int], evaluator: Optional[str] = None) -> Union[str, float]:
|
|
29
32
|
"""Generate harm severity level based on harm score.
|
|
30
33
|
|
|
31
34
|
:param harm_score: The harm score to be evaluated.
|
|
32
|
-
:type harm_score: int
|
|
35
|
+
:type harm_score: Union[float, int]
|
|
36
|
+
:param evaluator: Optional evaluator name to determine scoring pattern. If not provided, uses default 0-7 scale.
|
|
37
|
+
:type evaluator: Optional[str]
|
|
33
38
|
:return: The harm severity level. If harm score is None or numpy.nan, returns numpy.nan.
|
|
34
|
-
:rtype: str
|
|
39
|
+
:rtype: Union[str, float]
|
|
35
40
|
"""
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
if harm_score is None or (isinstance(harm_score, float) and math.isnan(harm_score)):
|
|
42
|
+
return math.nan
|
|
43
|
+
|
|
44
|
+
# Get the scoring pattern for this evaluator
|
|
45
|
+
pattern = get_evaluator_scoring_pattern(evaluator)
|
|
46
|
+
config = SCORING_PATTERN_CONFIG.get(pattern)
|
|
47
|
+
|
|
48
|
+
if not config:
|
|
49
|
+
# Fallback to default 0-7 mapping
|
|
50
|
+
HARM_SEVERITY_LEVEL_MAPPING = {
|
|
51
|
+
constants.HarmSeverityLevel.VeryLow: [0, 1],
|
|
52
|
+
constants.HarmSeverityLevel.Low: [2, 3],
|
|
53
|
+
constants.HarmSeverityLevel.Medium: [4, 5],
|
|
54
|
+
constants.HarmSeverityLevel.High: [6, 7],
|
|
55
|
+
}
|
|
56
|
+
for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
|
|
57
|
+
if harm_score_range[0] <= harm_score <= harm_score_range[1]:
|
|
58
|
+
return harm_level.value
|
|
43
59
|
return math.nan
|
|
44
|
-
|
|
45
|
-
|
|
60
|
+
|
|
61
|
+
# Use pattern-specific severity mapping
|
|
62
|
+
severity_mapping = config.get("severity_mapping", {})
|
|
63
|
+
for harm_level, score_range in severity_mapping.items():
|
|
64
|
+
if score_range[0] <= harm_score <= score_range[-1]:
|
|
46
65
|
return harm_level.value
|
|
66
|
+
|
|
47
67
|
return math.nan
|
|
48
68
|
|
|
49
69
|
|
|
70
|
+
def get_evaluator_scoring_pattern(evaluator: Optional[str]) -> EvaluatorScoringPattern:
|
|
71
|
+
"""Get the scoring pattern for a given evaluator.
|
|
72
|
+
|
|
73
|
+
:param evaluator: The name of the evaluator.
|
|
74
|
+
:type evaluator: Optional[str]
|
|
75
|
+
:return: The scoring pattern for the evaluator, defaults to SCALE_0_7 if not found.
|
|
76
|
+
:rtype: EvaluatorScoringPattern
|
|
77
|
+
"""
|
|
78
|
+
if evaluator is None:
|
|
79
|
+
return EvaluatorScoringPattern.SCALE_0_7
|
|
80
|
+
|
|
81
|
+
# Normalize evaluator name (remove underscores, lowercase)
|
|
82
|
+
normalized_name = evaluator.lower().replace("_", "")
|
|
83
|
+
|
|
84
|
+
# Try exact match first
|
|
85
|
+
if evaluator in EVALUATOR_SCORING_PATTERNS:
|
|
86
|
+
return EVALUATOR_SCORING_PATTERNS[evaluator]
|
|
87
|
+
|
|
88
|
+
# Try normalized match
|
|
89
|
+
for key, pattern in EVALUATOR_SCORING_PATTERNS.items():
|
|
90
|
+
if key.replace("_", "").lower() == normalized_name:
|
|
91
|
+
return pattern
|
|
92
|
+
|
|
93
|
+
# Default to 0-7 scale for unknown evaluators
|
|
94
|
+
return EvaluatorScoringPattern.SCALE_0_7
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_default_threshold_for_evaluator(evaluator: str) -> int:
|
|
98
|
+
"""Get the default attack success threshold for a given evaluator.
|
|
99
|
+
|
|
100
|
+
:param evaluator: The name of the evaluator.
|
|
101
|
+
:type evaluator: str
|
|
102
|
+
:return: The default threshold value.
|
|
103
|
+
:rtype: int
|
|
104
|
+
"""
|
|
105
|
+
pattern = get_evaluator_scoring_pattern(evaluator)
|
|
106
|
+
config = SCORING_PATTERN_CONFIG.get(pattern, {})
|
|
107
|
+
return config.get("default_threshold", 3)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def convert_binary_to_numeric(value: Union[str, bool, int]) -> int:
|
|
111
|
+
"""Convert binary evaluator outputs to numeric scores.
|
|
112
|
+
|
|
113
|
+
:param value: The binary value ("safe"/"unsafe", "true"/"false", True/False, 0/1).
|
|
114
|
+
:type value: Union[str, bool, int]
|
|
115
|
+
:return: 0 for safe/true, 1 for unsafe/false.
|
|
116
|
+
:rtype: int
|
|
117
|
+
"""
|
|
118
|
+
if isinstance(value, bool):
|
|
119
|
+
return 0 if value else 1
|
|
120
|
+
|
|
121
|
+
if isinstance(value, int):
|
|
122
|
+
return value
|
|
123
|
+
|
|
124
|
+
if isinstance(value, str):
|
|
125
|
+
value_lower = value.lower().strip()
|
|
126
|
+
# For "safe"/"unsafe" pattern
|
|
127
|
+
if value_lower == "safe":
|
|
128
|
+
return 0
|
|
129
|
+
if value_lower == "unsafe":
|
|
130
|
+
return 1
|
|
131
|
+
# For "true"/"false" pattern
|
|
132
|
+
if value_lower == "true":
|
|
133
|
+
return 0
|
|
134
|
+
if value_lower == "false":
|
|
135
|
+
return 1
|
|
136
|
+
|
|
137
|
+
raise ValueError(f"Unable to convert value '{value}' to numeric score")
|
|
138
|
+
|
|
139
|
+
|
|
50
140
|
def ensure_nltk_data_downloaded():
|
|
51
141
|
"""Download NLTK data packages if not already downloaded."""
|
|
52
142
|
nltk_data = [
|
|
@@ -125,9 +215,24 @@ def construct_prompty_model_config(
|
|
|
125
215
|
return prompty_model_config
|
|
126
216
|
|
|
127
217
|
|
|
218
|
+
def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
|
|
219
|
+
"""Check if the Azure AI project is an OneDP project.
|
|
220
|
+
|
|
221
|
+
:param azure_ai_project: The scope of the Azure AI project.
|
|
222
|
+
:type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
|
|
223
|
+
:return: True if the Azure AI project is an OneDP project, False otherwise.
|
|
224
|
+
:rtype: bool
|
|
225
|
+
"""
|
|
226
|
+
return isinstance(azure_ai_project, str)
|
|
227
|
+
|
|
228
|
+
|
|
128
229
|
def validate_azure_ai_project(o: object) -> AzureAIProject:
|
|
129
230
|
fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
|
|
130
231
|
|
|
232
|
+
# TODO : Add regex check for malformed project uri
|
|
233
|
+
if is_onedp_project(o):
|
|
234
|
+
return o
|
|
235
|
+
|
|
131
236
|
if not isinstance(o, dict):
|
|
132
237
|
msg = "The 'azure_ai_project' parameter must be a dictionary."
|
|
133
238
|
raise EvaluationException(
|
|
@@ -275,7 +380,27 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
|
|
|
275
380
|
return cast(T_TypedDict, o)
|
|
276
381
|
|
|
277
382
|
|
|
278
|
-
def
|
|
383
|
+
def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
|
|
384
|
+
"""Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
|
|
385
|
+
|
|
386
|
+
:param score: The score to check.
|
|
387
|
+
:type score: Union[str, float]
|
|
388
|
+
:param min_score: The minimum score. Default is 1.
|
|
389
|
+
:type min_score: int
|
|
390
|
+
:param max_score: The maximum score. Default is 5.
|
|
391
|
+
:type max_score: int
|
|
392
|
+
:return: True if the score is valid, False otherwise.
|
|
393
|
+
:rtype: bool
|
|
394
|
+
"""
|
|
395
|
+
try:
|
|
396
|
+
numeric_score = float(score)
|
|
397
|
+
except (ValueError, TypeError):
|
|
398
|
+
return False
|
|
399
|
+
|
|
400
|
+
return min_score <= numeric_score <= max_score
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
|
|
279
404
|
"""Parse the output of prompt-based quality evaluators that return a score and reason.
|
|
280
405
|
|
|
281
406
|
Current supported evaluators:
|
|
@@ -284,6 +409,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
|
|
|
284
409
|
- Retrieval
|
|
285
410
|
- Groundedness
|
|
286
411
|
- Coherence
|
|
412
|
+
- ResponseCompleteness
|
|
413
|
+
- TaskAdherence
|
|
287
414
|
|
|
288
415
|
:param llm_output: The output of the prompt-based quality evaluator.
|
|
289
416
|
:type llm_output: str
|
|
@@ -294,7 +421,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
|
|
|
294
421
|
reason = ""
|
|
295
422
|
if llm_output:
|
|
296
423
|
try:
|
|
297
|
-
score_pattern =
|
|
424
|
+
score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
|
|
298
425
|
reason_pattern = r"<S1>(.*?)</S1>"
|
|
299
426
|
score_match = re.findall(score_pattern, llm_output, re.DOTALL)
|
|
300
427
|
reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
|
|
@@ -366,7 +493,7 @@ def validate_conversation(conversation):
|
|
|
366
493
|
if not isinstance(messages, list):
|
|
367
494
|
raise_exception(
|
|
368
495
|
"'messages' parameter must be a JSON-compatible list of chat messages",
|
|
369
|
-
ErrorTarget.
|
|
496
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
370
497
|
)
|
|
371
498
|
expected_roles = {"user", "assistant", "system"}
|
|
372
499
|
image_found = False
|
|
@@ -393,7 +520,7 @@ def validate_conversation(conversation):
|
|
|
393
520
|
):
|
|
394
521
|
raise_exception(
|
|
395
522
|
f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
|
|
396
|
-
ErrorTarget.
|
|
523
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
397
524
|
)
|
|
398
525
|
if isinstance(message, AssistantMessage):
|
|
399
526
|
assistant_message_count += 1
|
|
@@ -407,7 +534,7 @@ def validate_conversation(conversation):
|
|
|
407
534
|
if message.get("role") not in expected_roles:
|
|
408
535
|
raise_exception(
|
|
409
536
|
f"Invalid role provided: {message.get('role')}. Message number: {num}",
|
|
410
|
-
ErrorTarget.
|
|
537
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
411
538
|
)
|
|
412
539
|
if message.get("role") == "assistant":
|
|
413
540
|
assistant_message_count += 1
|
|
@@ -417,7 +544,7 @@ def validate_conversation(conversation):
|
|
|
417
544
|
if not isinstance(content, (str, list)):
|
|
418
545
|
raise_exception(
|
|
419
546
|
f"Content in each turn must be a string or array. Message number: {num}",
|
|
420
|
-
ErrorTarget.
|
|
547
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
421
548
|
)
|
|
422
549
|
if isinstance(content, list):
|
|
423
550
|
if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
|
|
@@ -425,21 +552,372 @@ def validate_conversation(conversation):
|
|
|
425
552
|
if not image_found:
|
|
426
553
|
raise_exception(
|
|
427
554
|
"Message needs to have multi-modal input like images.",
|
|
428
|
-
ErrorTarget.
|
|
555
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
429
556
|
)
|
|
430
557
|
if assistant_message_count == 0:
|
|
431
558
|
raise_exception(
|
|
432
559
|
"Assistant role required in one of the messages.",
|
|
433
|
-
ErrorTarget.
|
|
560
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
434
561
|
)
|
|
435
562
|
if user_message_count == 0:
|
|
436
563
|
raise_exception(
|
|
437
564
|
"User role required in one of the messages.",
|
|
438
|
-
ErrorTarget.
|
|
565
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
439
566
|
)
|
|
440
567
|
if assistant_message_count > 1:
|
|
441
568
|
raise_exception(
|
|
442
569
|
"Evaluators for multimodal conversations only support single turn. "
|
|
443
570
|
"User and assistant role expected as the only role in each message.",
|
|
444
|
-
ErrorTarget.
|
|
571
|
+
ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _extract_text_from_content(content):
|
|
576
|
+
text = []
|
|
577
|
+
for msg in content:
|
|
578
|
+
if "text" in msg:
|
|
579
|
+
text.append(msg["text"])
|
|
580
|
+
return text
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
    """Filters the tool definitions to only include those that were actually used in the messages lists."""
    try:
        names_seen = set()
        saw_tool_call = False

        # Scan every assistant message across all message lists for tool_call
        # content items, collecting the tool names they reference.
        for message_list in msgs_lists:
            for message in message_list:
                if message.get("role") != "assistant" or "content" not in message:
                    continue
                for part in message.get("content", []):
                    if part.get("type") != "tool_call":
                        continue
                    saw_tool_call = True
                    # Two shapes are supported: a nested tool_call object with a
                    # "function" field, or a flat item with a "name" field.
                    if "tool_call" in part and "function" in part["tool_call"]:
                        names_seen.add(part["tool_call"]["function"])
                    elif "name" in part:
                        names_seen.add(part["name"])

        kept = [tool for tool in tool_definitions if tool.get("name") in names_seen]
        if saw_tool_call and not kept:
            # Tools were invoked but none of their names matched a definition —
            # better to keep everything than to hide all definitions.
            if logger:
                logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
            kept = tool_definitions

        return kept
    except Exception as e:
        # Best-effort: on any parsing surprise, fall back to the full list.
        if logger:
            logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
        return tool_definitions
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
    """Split a chat ``query`` (list of role-tagged message dicts) into alternating turns.

    Returns a dict with ``"user_queries"`` (list of user turns, each a list of
    extracted text blocks) and ``"agent_responses"`` (list of formatted agent
    turns), plus ``"system_message"`` when ``include_system_messages`` is set
    and a system message was found. The conversation is expected to end on a
    user turn (exactly one more user turn than agent turns); otherwise an
    ``EvaluationException`` is raised.
    """
    all_user_queries, all_agent_responses = [], []
    cur_user_query, cur_agent_response = [], []
    system_message = None

    for msg in query:
        role = msg.get("role")
        if not role:
            # A message without a role cannot be attributed to a turn; skip it.
            continue
        if include_system_messages and role == "system":
            # NOTE(review): if several system messages appear, only the last one is kept.
            system_message = msg.get("content", "")

        elif role == "user" and "content" in msg:
            # A user message closes any in-progress agent turn.
            if cur_agent_response:
                formatted_agent_response = _get_agent_response(
                    cur_agent_response, include_tool_messages=include_tool_messages
                )
                all_agent_responses.append([formatted_agent_response])
                cur_agent_response = []
            text_in_msg = _extract_text_from_content(msg["content"])
            if text_in_msg:
                cur_user_query.append(text_in_msg)

        elif role in ("assistant", "tool"):
            # Assistant/tool messages close any in-progress user turn;
            # consecutive assistant/tool messages are grouped into one agent turn.
            if cur_user_query:
                all_user_queries.append(cur_user_query)
                cur_user_query = []
            cur_agent_response.append(msg)

    # Flush whichever turn was still open when the messages ran out.
    if cur_user_query:
        all_user_queries.append(cur_user_query)
    if cur_agent_response:
        formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
        all_agent_responses.append([formatted_agent_response])

    # A well-formed query ends with an unanswered user turn, so there must be
    # exactly one more user turn than agent turns.
    if len(all_user_queries) != len(all_agent_responses) + 1:
        raise EvaluationException(
            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
    if include_system_messages and system_message:
        result["system_message"] = system_message
    return result
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _pretty_format_conversation_history(conversation_history):
|
|
663
|
+
"""Formats the conversation history for better readability."""
|
|
664
|
+
formatted_history = ""
|
|
665
|
+
if conversation_history.get("system_message"):
|
|
666
|
+
formatted_history += "SYSTEM_PROMPT:\n"
|
|
667
|
+
formatted_history += " " + conversation_history["system_message"] + "\n\n"
|
|
668
|
+
for i, (user_query, agent_response) in enumerate(
|
|
669
|
+
zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
|
|
670
|
+
):
|
|
671
|
+
formatted_history += f"User turn {i+1}:\n"
|
|
672
|
+
for msg in user_query:
|
|
673
|
+
if isinstance(msg, list):
|
|
674
|
+
for submsg in msg:
|
|
675
|
+
formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
|
|
676
|
+
else:
|
|
677
|
+
formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
|
|
678
|
+
formatted_history += "\n"
|
|
679
|
+
if agent_response:
|
|
680
|
+
formatted_history += f"Agent turn {i+1}:\n"
|
|
681
|
+
for msg in agent_response:
|
|
682
|
+
if isinstance(msg, list):
|
|
683
|
+
for submsg in msg:
|
|
684
|
+
formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
|
|
685
|
+
else:
|
|
686
|
+
formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
|
|
687
|
+
formatted_history += "\n"
|
|
688
|
+
return formatted_history
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
    """Reformats the conversation history to a more compact representation."""
    try:
        parsed_history = _get_conversation_history(
            query,
            include_system_messages=include_system_messages,
            include_tool_messages=include_tool_messages,
        )
        return _pretty_format_conversation_history(parsed_history)
    except Exception:
        # If the conversation history cannot be parsed for whatever reason (e.g. the
        # converter format changed), the original query is returned as a fallback so
        # the evaluation can still proceed, though accuracy is affected.
        # From our tests the negative impact on IntentResolution is:
        #   Higher intra model variance (0.142 vs 0.046)
        #   Higher inter model variance (0.345 vs 0.607)
        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
        if logger:
            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
        return query
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def _get_agent_response(agent_response_msgs, include_tool_messages=False):
|
|
714
|
+
"""Extracts formatted agent response including text, and optionally tool calls/results."""
|
|
715
|
+
agent_response_text = []
|
|
716
|
+
tool_results = {}
|
|
717
|
+
|
|
718
|
+
# First pass: collect tool results
|
|
719
|
+
if include_tool_messages:
|
|
720
|
+
for msg in agent_response_msgs:
|
|
721
|
+
if msg.get("role") == "tool" and "tool_call_id" in msg:
|
|
722
|
+
for content in msg.get("content", []):
|
|
723
|
+
if content.get("type") == "tool_result":
|
|
724
|
+
result = content.get("tool_result")
|
|
725
|
+
tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
|
|
726
|
+
|
|
727
|
+
# Second pass: parse assistant messages and tool calls
|
|
728
|
+
for msg in agent_response_msgs:
|
|
729
|
+
if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
|
|
730
|
+
text = _extract_text_from_content(msg["content"])
|
|
731
|
+
if text:
|
|
732
|
+
agent_response_text.extend(text)
|
|
733
|
+
if include_tool_messages:
|
|
734
|
+
for content in msg.get("content", []):
|
|
735
|
+
# Todo: Verify if this is the correct way to handle tool calls
|
|
736
|
+
if content.get("type") == "tool_call":
|
|
737
|
+
if "tool_call" in content and "function" in content.get("tool_call", {}):
|
|
738
|
+
tc = content.get("tool_call", {})
|
|
739
|
+
func_name = tc.get("function", {}).get("name", "")
|
|
740
|
+
args = tc.get("function", {}).get("arguments", {})
|
|
741
|
+
tool_call_id = tc.get("id")
|
|
742
|
+
else:
|
|
743
|
+
tool_call_id = content.get("tool_call_id")
|
|
744
|
+
func_name = content.get("name", "")
|
|
745
|
+
args = content.get("arguments", {})
|
|
746
|
+
args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
|
|
747
|
+
call_line = f"[TOOL_CALL] {func_name}({args_str})"
|
|
748
|
+
agent_response_text.append(call_line)
|
|
749
|
+
if tool_call_id in tool_results:
|
|
750
|
+
agent_response_text.append(tool_results[tool_call_id])
|
|
751
|
+
|
|
752
|
+
return agent_response_text
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
def reformat_agent_response(response, logger=None, include_tool_messages=False):
    """Reformat an agent ``response`` (list of message dicts) into a compact string.

    Returns ``""`` for a ``None`` or empty response. If nothing can be extracted,
    or parsing fails entirely, the original ``response`` is returned unchanged as
    a best-effort fallback.

    :param response: Agent response messages (or None / []).
    :param logger: Optional logger for diagnostics, defaults to None.
    :param include_tool_messages: If True, include tool calls/results in the output.
    """
    try:
        if response is None or response == []:
            return ""
        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
        if agent_response == []:
            # If no message could be extracted, likely the format changed, fallback to the original response in that case
            if logger:
                logger.warning(
                    f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
                )
            return response
        return "\n".join(agent_response)
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catching Exception keeps the intended best-effort
        # fallback without masking interpreter-exit signals.
        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
        return response
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def reformat_tool_definitions(tool_definitions, logger=None):
    """Render tool definitions as a compact, human-readable bullet list."""
    try:
        rendered = ["TOOL_DEFINITIONS:"]
        for tool in tool_definitions:
            tool_name = tool.get("name", "unnamed_tool")
            description = tool.get("description", "").strip()
            properties = tool.get("parameters", {}).get("properties", {})
            inputs = ", ".join(properties.keys()) if properties else "no parameters"
            rendered.append(f"- {tool_name}: {description} (inputs: {inputs})")
        return "\n".join(rendered)
    except Exception:
        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(
                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
            )
        return tool_definitions
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
def simplify_messages(messages, drop_system=True, drop_tool_calls=False, logger=None):
    """
    Simplify a list of conversation messages by keeping only role and content.
    Optionally filter out system messages and/or tool calls.

    :param messages: List of message dicts (e.g., from query or response)
    :param drop_system: If True, remove system role messages
    :param drop_tool_calls: If True, remove tool_call items from assistant content
    :return: New simplified list of messages
    """
    if isinstance(messages, str):
        return messages
    try:
        # Anything that is not a list is passed through untouched.
        if not isinstance(messages, list):
            return messages

        def text_only(items):
            # Keep each content item's "text" field, mirroring the module's
            # text extraction helper.
            return [item["text"] for item in items if "text" in item]

        kept = []
        for message in messages:
            # Pass non-dict entries through untouched.
            if not isinstance(message, dict):
                kept.append(message)
                continue

            role = message.get("role")
            body = message.get("content", [])

            # Optionally drop system messages entirely.
            if drop_system and role == "system":
                continue

            # User messages are reduced to their text content.
            if role == "user":
                kept.append({"role": role, "content": text_only(body)})
                continue

            # Optionally drop tool-result messages.
            if drop_tool_calls and role == "tool":
                continue

            # Assistant messages with text are reduced to that text; those
            # without text fall through to the tool-call check below.
            if role == "assistant":
                trimmed = text_only(body)
                if trimmed:
                    kept.append({"role": role, "content": trimmed})
                    continue

            # Optionally drop messages that carry tool_call content items.
            if drop_tool_calls and any(
                part.get("type") == "tool_call" for part in body if isinstance(part, dict)
            ):
                continue

            # Reaching here means the message is kept verbatim.
            kept.append(message)

        return kept

    except Exception as ex:
        if logger:
            logger.debug(f"Error simplifying messages: {str(ex)}. Returning original messages.")
        return messages
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def upload(path: str, container_client: ContainerClient, logger=None):
    """Upload files or directories to Azure Blob Storage using a container client.

    This function uploads a file or all files in a directory (recursively) to Azure Blob Storage.
    When uploading a directory, the relative path structure is preserved in the blob container.

    :param path: The local path to a file or directory to upload
    :type path: str
    :param container_client: The Azure Blob Container client to use for uploading
    :type container_client: azure.storage.blob.ContainerClient
    :param logger: Optional logger for debug output, defaults to None
    :type logger: logging.Logger, optional
    :raises EvaluationException: If the path doesn't exist or errors occur during upload
    """

    if not os.path.isdir(path) and not os.path.isfile(path):
        raise EvaluationException(
            message=f"Path '{path}' is not a directory or a file",
            internal_message=f"Path '{path}' is not a directory or a file",
            target=ErrorTarget.RAI_CLIENT,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.SYSTEM_ERROR,
        )

    remote_paths = []
    local_paths = []

    if os.path.isdir(path):
        for root, _, filenames in os.walk(path):
            upload_path = ""
            if root != path:
                rel_path = os.path.relpath(root, path)
                # Fix: blob names must use forward slashes. The previous
                # single-argument posixpath.join(rel_path) was a no-op and left
                # OS-specific separators (backslashes on Windows) in blob names.
                upload_path = rel_path.replace(os.sep, posixpath.sep)
            for f in filenames:
                remote_file_path = posixpath.join(upload_path, f)
                remote_paths.append(remote_file_path)
                local_file_path = os.path.join(root, f)
                local_paths.append(local_file_path)

    if os.path.isfile(path):
        # A single file is uploaded under its bare filename at container root.
        remote_paths = [os.path.basename(path)]
        local_paths = [path]

    try:
        # Open each file in binary read mode and stream it to the container.
        for local, remote in zip(local_paths, remote_paths):
            with open(local, "rb") as data:
                container_client.upload_blob(data=data, name=remote)
            if logger:
                logger.debug(f"File '{local}' uploaded successfully")

    except Exception as e:
        # Chain the original exception so upload failures keep their root cause.
        raise EvaluationException(
            message=f"Error uploading file: {e}",
            internal_message=f"Error uploading file: {e}",
            target=ErrorTarget.RAI_CLIENT,
            category=ErrorCategory.UPLOAD_ERROR,
            blame=ErrorBlame.SYSTEM_ERROR,
        ) from e
|