azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release. This version of azure-ai-evaluation might be problematic; see the package's registry page for details.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py (new file)

@@ -0,0 +1,63 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from concurrent.futures import as_completed
+from typing import TypeVar, Dict, List
+
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class MultiEvaluatorBase(EvaluatorBase[T]):
+    """
+    Base class for evaluators that contain and run multiple other evaluators to produce a
+    suite of metrics.
+
+    Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+    :param evaluators: The list of evaluators to run when this evaluator is called.
+    :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: An evaluator that runs multiple other evaluators and combines their results.
+    """
+
+    def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+        self._threshold = kwargs.pop("threshold", 3)
+        self._higher_is_better = kwargs.pop("_higher_is_better", False)
+        super().__init__(threshold=self._threshold, _higher_is_better=self._higher_is_better)
+        self._parallel = kwargs.pop("_parallel", True)
+        self._evaluators = evaluators
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Run each evaluator, possibly in parallel, and combine the results into
+        a single large dictionary containing each evaluation. Inputs are passed
+        directly to each evaluator without additional processing.
+
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        results: Dict[str, T] = {}
+        if self._parallel:
+            with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
+                futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                for future in as_completed(futures):
+                    results.update(future.result())
+        else:
+            for evaluator in self._evaluators:
+                result = evaluator(**eval_input)
+                # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                # that's going on to shove evaluators around like this.
+                results.update(result)  # type: ignore[arg-type]
+
+        return results
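The hunk above introduces the multi-evaluator base: one call fans out to several child evaluators and their result dictionaries are merged into a single mapping. A minimal stand-alone sketch of that merge behavior follows; the evaluator class, metric names, and scores below are hypothetical and are not part of the package.

# Hypothetical sketch (not from the package): the merge performed by the
# sequential branch of MultiEvaluatorBase._do_eval. Each child evaluator is
# called with the same inputs and its result dict is folded into one mapping.
from typing import Dict, List


class FakeScoreEvaluator:
    # Stand-in child; real children are evaluators such as ViolenceEvaluator.
    def __init__(self, name: str, score: float):
        self._name, self._score = name, score

    def __call__(self, *, query: str, response: str) -> Dict[str, float]:
        return {self._name: self._score}


def combine(evaluators: List[FakeScoreEvaluator], **eval_input) -> Dict[str, float]:
    results: Dict[str, float] = {}
    for evaluator in evaluators:
        results.update(evaluator(**eval_input))
    return results


print(combine(
    [FakeScoreEvaluator("fluency", 4.0), FakeScoreEvaluator("coherence", 5.0)],
    query="What is the capital of France?",
    response="Paris.",
))  # {'fluency': 4.0, 'coherence': 5.0}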

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py (new file)

@@ -0,0 +1,345 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+import re
+import os
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List
+
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
+from typing_extensions import override
+
+from azure.core.credentials import TokenCredential
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from . import EvaluatorBase
+
+try:
+    from ..._user_agent import UserAgentSingleton
+except ImportError:
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
+
+
+T = TypeVar("T")
+
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
+    """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
+    make use of a prompty file, and return their results as a dictionary, with a single key-value pair
+    linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
+    per-turn results are stored in a list under the key "evaluation_per_turn").
+
+    :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
+        a dictionary in the format {result_key: float}.
+    :type result_key: str
+    :param prompty_file: The path to the prompty file to use for evaluation.
+    :type prompty_file: str
+    :param model_config: The model configuration to use for evaluation.
+    :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+    :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
+        Useful since some evaluators of this format are response-only.
+    :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
+    """
+
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+    def __init__(
+        self,
+        *,
+        result_key: str,
+        prompty_file: str,
+        model_config: dict,
+        eval_last_turn: bool = False,
+        threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
+        _higher_is_better: bool = False,
+        **kwargs,
+    ) -> None:
+        self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
+        self._prompty_file = prompty_file
+        self._threshold = threshold
+        self._higher_is_better = _higher_is_better
+        super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
+
+        subclass_name = self.__class__.__name__
+        user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            user_agent,
+        )
+
+        self._flow = AsyncPrompty.load(
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
+        )
+
+    # __call__ not overridden here because child classes have such varied signatures that there's no point
+    # defining a default here.
+    def _get_binary_result(self, score: float) -> str:
+        """Get the binary result based on the score.
+
+        :param score: The score to evaluate.
+        :type score: float
+        :return: The binary result.
+        :rtype: str
+        """
+        if math.isnan(score):
+            return "unknown"
+        if self._higher_is_better:
+            if score >= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]
+        else:
+            if score <= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+        whatever inputs are needed for the _flow method, including context
+        and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
+            # Parse out score and reason from evaluators known to possess them.
+            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                score, reason = parse_quality_evaluator_reason_score(llm_output)
+                binary_result = self._get_binary_result(score)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_result": binary_result,
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
+                }
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": input_token_count,
+                f"{self._result_key}_completion_tokens": output_token_count,
+                f"{self._result_key}_total_tokens": total_token_count,
+                f"{self._result_key}_finish_reason": finish_reason,
+                f"{self._result_key}_model": model_id,
+                f"{self._result_key}_sample_input": sample_input,
+                f"{self._result_key}_sample_output": sample_output,
+            }
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
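One detail worth calling out in the hunk above is the pass/fail decision: _get_binary_result compares the parsed score against the threshold, with the direction controlled by _higher_is_better, and maps NaN to "unknown". A stand-alone sketch of that decision follows; the "pass"/"fail" strings are assumed stand-ins for the package's EVALUATION_PASS_FAIL_MAPPING rather than quoted values.

# Sketch only; assumed mapping values. The real constant lives in
# azure.ai.evaluation._constants.
import math

EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}


def binary_result(score: float, threshold: float = 3, higher_is_better: bool = False) -> str:
    # NaN means no score could be parsed from the model output.
    if math.isnan(score):
        return "unknown"
    if higher_is_better:
        return EVALUATION_PASS_FAIL_MAPPING[score >= threshold]
    return EVALUATION_PASS_FAIL_MAPPING[score <= threshold]


print(binary_result(4.0, threshold=3, higher_is_better=True))  # pass
print(binary_result(2.0, threshold=3, higher_is_better=True))  # fail
print(binary_result(float("nan")))                             # unknown

For reason-producing evaluators the returned dictionary also carries {result_key}_reason plus token, finish-reason, model, and sample fields, as shown in the hunk.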

azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py (new file)

@@ -0,0 +1,198 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Dict, TypeVar, Union, Optional
+
+from typing_extensions import override
+
+from azure.ai.evaluation._common.constants import (
+    EvaluationMetrics,
+    _InternalEvaluationMetrics,
+    Tasks,
+    _InternalAnnotationTasks,
+)
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import _AggregationType
+from azure.core.credentials import TokenCredential
+
+from . import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class RaiServiceEvaluatorBase(EvaluatorBase[T]):
+    """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
+    This includes content safety evaluators, protected material evaluators, and others. These evaluators
+    are all assumed to be of the "query and response or conversation" input variety.
+
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+        aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
+        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
+        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
+    :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
+    """
+
+    @override
+    def __init__(
+        self,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
+        azure_ai_project: Union[dict, str],
+        credential: TokenCredential,
+        eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        threshold: int = 3,
+        _higher_is_better: Optional[bool] = False,
+        **kwargs,
+    ):
+        super().__init__(
+            eval_last_turn=eval_last_turn,
+            conversation_aggregation_type=conversation_aggregation_type,
+            threshold=threshold,
+            _higher_is_better=_higher_is_better,
+        )
+        self._eval_metric = eval_metric
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
+        self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
+        self._higher_is_better = _higher_is_better
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
+        or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
+            raise EvaluationException(
+                message="Not implemented",
+                internal_message=(
+                    "Reached query/response evaluation without supplying response."
+                    + " This should have failed earlier."
+                ),
+            )
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)
+
+        if "context" in self._get_all_singleton_inputs():
+            context = eval_input.get("context", None)
+            if context is None:
+                raise EvaluationException(
+                    message="Not implemented",
+                    internal_message=(
+                        "Attempted context-based evaluation without supplying context."
+                        + " This should have failed earlier."
+                    ),
+                )
+            input_data["context"] = context
+
+        return await evaluate_with_rai_service(  # type: ignore
+            metric_name=self._eval_metric,
+            data=input_data,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+            annotation_task=self._get_task(),
+            evaluator_name=self.__class__.__name__,
+        )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine a the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
+            return Tasks.CODE_VULNERABILITY
+        if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+            return Tasks.UNGROUNDED_ATTRIBUTES
+        return Tasks.CONTENT_HARM
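The routing in _do_eval above is simple: a flattened input containing a "response" key goes down the query/response path (the query is forwarded only when evaluate_query is set), otherwise the input is treated as a conversation and its messages are sent to the multimodal endpoint. A minimal sketch of that routing follows, using local stand-ins instead of the real RAI service calls.

# Sketch only; build_rai_payload is a hypothetical helper that mirrors the
# branching in _do_eval / _evaluate_query_response without calling any service.
from typing import Dict


def build_rai_payload(eval_input: Dict, evaluate_query: bool = False) -> Dict:
    if "response" in eval_input:
        data = {"response": str(eval_input["response"])}
        if evaluate_query and eval_input.get("query") is not None:
            data["query"] = str(eval_input["query"])
        return {"mode": "query_response", "data": data}
    # No response key: treat the input as a conversation.
    return {"mode": "conversation", "messages": eval_input.get("conversation", {}).get("messages", [])}


print(build_rai_payload({"query": "hi", "response": "hello"}, evaluate_query=True))
print(build_rai_payload({"conversation": {"messages": [{"role": "user", "content": "hi"}]}}))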
azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py (new file)

@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import _AggregationType
+
+
+def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == _AggregationType.SUM:
+        return sum
+    if aggregation_type == _AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == _AggregationType.MAX:
+        return max
+    if aggregation_type == _AggregationType.MIN:
+        return min
+    if aggregation_type == _AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return _AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MIN
+    return _AggregationType.CUSTOM
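GetAggregator maps an aggregation enum to a plain callable that collapses a conversation's per-turn scores into one number. A self-contained sketch of the same dispatch follows; _AggregationType and list_mean below are local stand-ins for the package's versions in azure.ai.evaluation._constants and azure.ai.evaluation._common.math.

# Illustrative sketch, not the package implementation.
from enum import Enum
from typing import Callable, List


class _AggregationType(Enum):
    MEAN = "mean"
    MAX = "max"
    MIN = "min"
    SUM = "sum"


def list_mean(values: List[float]) -> float:
    return sum(values) / len(values)


def get_aggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
    # Same idea as GetAggregator: enum in, aggregation callable out.
    return {
        _AggregationType.MEAN: list_mean,
        _AggregationType.MAX: max,
        _AggregationType.MIN: min,
        _AggregationType.SUM: sum,
    }[aggregation_type]


per_turn_scores = [3.0, 5.0, 4.0]
print(get_aggregator(_AggregationType.MEAN)(per_turn_scores))  # 4.0
print(get_aggregator(_AggregationType.MAX)(per_turn_scores))   # 5.0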
azure/ai/evaluation/_evaluators/_content_safety/__init__.py

@@ -3,8 +3,6 @@
 # ---------------------------------------------------------
 
 from ._content_safety import ContentSafetyEvaluator
-from ._content_safety_base import ContentSafetyEvaluatorBase
-from ._content_safety_chat import ContentSafetyChatEvaluator
 from ._hate_unfairness import HateUnfairnessEvaluator
 from ._self_harm import SelfHarmEvaluator
 from ._sexual import SexualEvaluator
@@ -16,6 +14,4 @@ __all__ = [
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
-    "ContentSafetyChatEvaluator",
-    "ContentSafetyEvaluatorBase",
 ]
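Net effect of the hunk above: ContentSafetyChatEvaluator and ContentSafetyEvaluatorBase are removed from the content-safety subpackage in 1.13.3, while the remaining evaluators keep their exports. A sketch of imports that should still resolve, listing only names visible in this diff and assuming the top-level azure.ai.evaluation namespace continues to re-export them:

# Still valid in 1.13.3 (per the __init__.py hunk above); imports of
# ContentSafetyChatEvaluator or ContentSafetyEvaluatorBase will now fail.
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    HateUnfairnessEvaluator,
    SelfHarmEvaluator,
    SexualEvaluator,
)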