azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,43 @@
 
 import inspect
 from abc import ABC, abstractmethod
-
-
-from
+import json
+import copy
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    Tuple,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
+
+from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation.
-
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
+
+import copy
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -24,6 +53,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str
 
 
 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -68,8 +98,24 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
@@ -77,13 +123,23 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     def __init__(
         self,
         *,
+        threshold: float = 3.0,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
+        _higher_is_better: Optional[bool] = True,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        self._higher_is_better = _higher_is_better
+        self._threshold = threshold
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -120,15 +176,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -136,17 +192,70 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-            return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
 
-
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
+
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -154,9 +263,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
        """
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -197,21 +308,78 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                     eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs
 
         return converter
 
-    def
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -229,7 +397,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -241,11 +412,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -255,6 +434,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             target=ErrorTarget.CONVERSATION,
         )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
@@ -285,11 +478,109 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+
+        # Work on a deep copy to avoid modifying the original object
+        response_copy = copy.deepcopy(response)
+
+        if isinstance(response_copy, list):
+            for message in response_copy:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(copy.deepcopy(content_item))
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
@@ -299,11 +590,48 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
-
+        try:
+            eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        except Exception as e:
+            print(f"Error converting kwargs to eval_input_list: {e}")
+            raise e
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-
+            result = await self._do_eval(eval_input)
+            # logic to determine threshold pass/fail
+            try:
+                for key in list(result.keys()):
+                    if key.endswith("_score") and "rouge" not in key:
+                        score_value = result[key]
+                        base_key = key[:-6]  # Remove "_score" suffix
+                        result_key = f"{base_key}_result"
+                        threshold_key = f"{base_key}_threshold"
+                        threshold_value = (
+                            self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                        )
+                        if not isinstance(threshold_value, (int, float)):
+                            raise EvaluationException(
+                                "Threshold value must be a number.",
+                                internal_message=str(threshold_value),
+                                target=ErrorTarget.EVALUATE,
+                                category=ErrorCategory.INVALID_VALUE,
+                            )
+
+                        result[threshold_key] = threshold_value
+                        if self._higher_is_better:
+                            if float(score_value) >= threshold_value:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                        else:
+                            if float(score_value) <= threshold_value:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+            except Exception as e:
+                print(f"Error calculating binary result: {e}")
+            per_turn_results.append(result)
         # Return results as-is if only one result was produced.
 
         if len(per_turn_results) == 1:
@@ -313,10 +641,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -332,13 +701,42 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(
+    async def __call__(
+        self,
+        *,
+        query=None,
+        response=None,
+        context=None,
+        conversation=None,
+        ground_truth=None,
+        tool_calls=None,
+        tool_definitions=None,
+        messages=None,
+        retrieval_ground_truth=None,
+        retrieved_documents=None,
+        **kwargs,
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
             kwargs["query"] = query
         if response is not None:
             kwargs["response"] = response
+        if tool_definitions is not None:
+            kwargs["tool_definitions"] = tool_definitions
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
+        if tool_definitions is not None:
+            kwargs["tool_definitions"] = tool_definitions
+        if messages is not None:
+            kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
+
         return await self._real_call(**kwargs)
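The most significant behavioral change in _base_eval.py above is the threshold handling added to _real_call: every metric whose key ends in "_score" (ROUGE excepted) now gains companion "<metric>_threshold" and "<metric>_result" keys. A minimal standalone sketch of that mapping, using a hypothetical relevance_score result rather than a real evaluator, and assuming the package's EVALUATION_PASS_FAIL_MAPPING resolves True/False to the "pass"/"fail" strings:

# Standalone illustration of the pass/fail mapping added in _real_call.
# "relevance" and the sample score are hypothetical; the real code imports
# EVALUATION_PASS_FAIL_MAPPING from azure.ai.evaluation._constants.
EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}  # assumed values

def apply_threshold(result: dict, threshold: float = 3.0, higher_is_better: bool = True) -> dict:
    for key in list(result.keys()):
        if key.endswith("_score") and "rouge" not in key:
            base_key = key[:-6]  # strip the "_score" suffix
            passed = result[key] >= threshold if higher_is_better else result[key] <= threshold
            result[f"{base_key}_threshold"] = threshold
            result[f"{base_key}_result"] = EVALUATION_PASS_FAIL_MAPPING[passed]
    return result

print(apply_threshold({"relevance_score": 4.0}))
# {'relevance_score': 4.0, 'relevance_threshold': 3.0, 'relevance_result': 'pass'}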
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py

@@ -0,0 +1,63 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from concurrent.futures import as_completed
+from typing import TypeVar, Dict, List
+
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class MultiEvaluatorBase(EvaluatorBase[T]):
+    """
+    Base class for evaluators that contain and run multiple other evaluators to produce a
+    suite of metrics.
+
+    Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+    :param evaluators: The list of evaluators to run when this evaluator is called.
+    :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: An evaluator that runs multiple other evaluators and combines their results.
+    """
+
+    def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+        self._threshold = kwargs.pop("threshold", 3)
+        self._higher_is_better = kwargs.pop("_higher_is_better", False)
+        super().__init__(threshold=self._threshold, _higher_is_better=self._higher_is_better)
+        self._parallel = kwargs.pop("_parallel", True)
+        self._evaluators = evaluators
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Run each evaluator, possibly in parallel, and combine the results into
+        a single large dictionary containing each evaluation. Inputs are passed
+        directly to each evaluator without additional processing.
+
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        results: Dict[str, T] = {}
+        if self._parallel:
+            with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
+                futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                for future in as_completed(futures):
+                    results.update(future.result())
+        else:
+            for evaluator in self._evaluators:
+                result = evaluator(**eval_input)
+                # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                # that's going on to shove evaluators around like this.
+                results.update(result)  # type: ignore[arg-type]
+
+        return results
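The new MultiEvaluatorBase above fans a single eval_input out to every child evaluator and merges the returned metric dictionaries into one result. A self-contained sketch of that fan-out pattern, using the stdlib ThreadPoolExecutor in place of the package's context-propagating executor, with two hypothetical callables standing in for real sub-evaluators:

# Sketch of the MultiEvaluatorBase fan-out, under stated assumptions:
# the lambdas and their metric names are hypothetical stand-ins.
from concurrent.futures import ThreadPoolExecutor, as_completed

evaluators = [
    lambda **kw: {"violence_score": 1.0},
    lambda **kw: {"sexual_score": 0.0},
]
eval_input = {"query": "hi", "response": "hello"}

results = {}
with ThreadPoolExecutor() as executor:
    # Submit the same input to every evaluator and merge whatever each returns.
    futures = [executor.submit(evaluator, **eval_input) for evaluator in evaluators]
    for future in as_completed(futures):
        results.update(future.result())

print(results)  # e.g. {'violence_score': 1.0, 'sexual_score': 0.0}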