azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py
@@ -3,61 +3,123 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
+import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template
+from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
+from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
+from azure.core.exceptions import HttpResponseError
 
 import jwt
-import numpy as np
 
+from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._user_agent import UserAgentSingleton
+from azure.ai.evaluation._common.utils import is_onedp_project
 from azure.core.credentials import TokenCredential
-from azure.
+from azure.core.exceptions import HttpResponseError
+from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy
 
 from .constants import (
     CommonConstants,
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
-try:
-    version = importlib.metadata.version("azure-ai-evaluation")
-except importlib.metadata.PackageNotFoundError:
-    version = "unknown"
-USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+}
+ML_WORKSPACE = "https://management.azure.com/.default"
+COG_SRV_WORKSPACE = "https://ai.azure.com/.default"
 
-
+INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"
+
+
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    if annotation_task == Tasks.CODE_VULNERABILITY:
+        as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
+        return json.dumps(as_dict)
+    if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
+        as_dict = {
+            "query": data.get("query", ""),
+            "response": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
+def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
     """Get common headers for the HTTP request
 
     :param token: The Azure authentication token.
     :type token: str
+    :param evaluator_name: The evaluator name. Default is None.
+    :type evaluator_name: str
     :return: The common headers.
     :rtype: Dict
     """
+    user_agent = (
+        f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+        if evaluator_name
+        else UserAgentSingleton().value
+    )
     return {
         "Authorization": f"Bearer {token}",
-        "
-        "User-Agent": USER_AGENT,
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        "Connection": "close",
+        "User-Agent": user_agent,
     }
 
 
-
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability_onedp(
+    client: AIProjectClient, token: str, capability: Optional[str] = None
+) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
 
-    :param
-    :type
+    :param client: The AI project client.
+    :type client: AIProjectClient
     :param token: The Azure authentication token.
     :type token: str
     :param capability: The capability to check. Default is None.
@@ -65,37 +127,65 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     :raises Exception: If the service is not available in the region or the capability is not available.
     """
     headers = get_common_headers(token)
-
-
-    async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
-
-        if response.status_code != 200:
-            msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.UNKNOWN,
-                category=ErrorCategory.SERVICE_UNAVAILABLE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        capabilities = response.json()
+    capabilities = client.evaluations.check_annotation(headers=headers)
 
     if capability and capability not in capabilities:
-        msg = f"
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
         raise EvaluationException(
             message=msg,
             internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
             category=ErrorCategory.SERVICE_UNAVAILABLE,
             blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )
 
 
-def
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+    """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param capability: The capability to check. Default is None.
+    :type capability: str
+    :raises Exception: If the service is not available in the region or the capability is not available.
+    """
+    headers = get_common_headers(token)
+    svc_liveness_url = rai_svc_url + "/checkannotation"
+
+    async with get_async_http_client() as client:
+        response = await client.get(svc_liveness_url, headers=headers)
+
+        if response.status_code != 200:
+            msg = (
+                f"RAI service is unavailable in this region, or you lack the necessary permissions "
+                f"to access the AI project. Status Code: {response.status_code}"
+            )
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
+
+        capabilities = response.json()
+        if capability and capability not in capabilities:
+            msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
+
+
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -103,19 +193,20 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
     in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task =
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-
+        include_metric = False
+    elif metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
        include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -131,39 +222,78 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(
+async def submit_request(
+    data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param
-    :type
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
     :return: The operation ID.
     :rtype: str
     """
-
-
-    payload = generate_payload(normalized_user_text, metric)
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
-    headers = get_common_headers(token)
+    headers = get_common_headers(token, evaluator_name)
 
-    async with
-
-            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)
 
-        if
-            print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"],
-
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
+    result = http_response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
 
-
+
+async def submit_request_onedp(
+    client: AIProjectClient,
+    data: dict,
+    metric: str,
+    token: str,
+    annotation_task: str,
+    evaluator_name: str,
+    scan_session_id: Optional[str] = None,
+) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The operation ID.
+    :rtype: str
+    """
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+    headers = get_common_headers(token, evaluator_name)
+    if scan_session_id:
+        headers["x-ms-client-request-id"] = scan_session_id
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+    result = json.loads(response)
     operation_id = result["location"].split("/")[-1]
     return operation_id
 
@@ -191,9 +321,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     headers = get_common_headers(token)
 
     async with get_async_http_client() as client:
-        response = await client.get(
-            url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+        response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)
 
         if response.status_code == 200:
             return response.json()
@@ -207,65 +335,195 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         await asyncio.sleep(sleep_time)
 
 
+async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
+    """Fetch the annotation result from Responsible AI service
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param operation_id: The operation ID.
+    :type operation_id: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The annotation result.
+    :rtype: Dict
+    """
+    start = time.time()
+    request_count = 0
+
+    while True:
+        headers = get_common_headers(token)
+        try:
+            return client.evaluations.operation_results(operation_id, headers=headers)
+        except HttpResponseError:
+            request_count += 1
+            time_elapsed = time.time() - start
+            if time_elapsed > RAIService.TIMEOUT:
+                raise TimeoutError(
+                    f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+                )
+
+            sleep_time = RAIService.SLEEP_TIME**request_count
+            await asyncio.sleep(sleep_time)
+
+
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
-) -> Dict:
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {
-
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
+    }:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if (
+            metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+            and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+        ):
+            batch_response[0] = {
+                EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+            }
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+                result[pm_metric_name + "_total_tokens"] = (
+                    parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_prompt_tokens"] = (
+                    parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_completion_tokens"] = (
+                    parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_finish_reason"] = (
+                    parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_input"] = (
+                    parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_output"] = (
+                    parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+                )
+                result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use
-        result[
-        result[
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to
-            result[
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else
+            # if present else set them to math.nan
+            result[metric_display_name + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
-            result[
-                parsed_response["intrusion"] if "intrusion" in parsed_response else
+            result[metric_display_name + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
-            result[
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else
+            result[metric_display_name + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
+        if (
+            metric_name == EvaluationMetrics.CODE_VULNERABILITY
+            or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+        ):
+            # Add all attributes under the details.
+            details = {}
+            for key, value in parsed_response.items():
+                if key not in {
+                    "label",
+                    "reasoning",
+                    "version",
+                    "totalTokenCount",
+                    "inputTokenCount",
+                    "outputTokenCount",
+                    "finish_reason",
+                    "sample_input",
+                    "sample_output",
+                    "model",
+                }:
+                    details[key.replace("-", "_")] = value
+            result[metric_display_name + "_details"] = details
+        result[metric_display_name + "_total_tokens"] = (
+            parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_prompt_tokens"] = (
+            parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_completion_tokens"] = (
+            parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_finish_reason"] = (
+            parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_input"] = (
+            parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_output"] = (
+            parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+        )
+        result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
 
-def _parse_content_harm_response(
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
@@ -276,6 +534,13 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     except Exception:  # pylint: disable=broad-exception-caught
         harm_response = response[metric_name]
 
+    total_tokens = 0
+    prompt_tokens = 0
+    completion_tokens = 0
+    finish_reason = ""
+    sample_input = ""
+    sample_output = ""
+    model = ""
     if harm_response != "" and isinstance(harm_response, dict):
         # check if "output" is one key in harm_response
         if "output" in harm_response:
@@ -283,11 +548,18 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
 
         # get content harm metric_value
         if "label" in harm_response:
-
+            try:
+                # Handle "n/a" or other non-numeric values
+                if isinstance(harm_response["label"], str) and harm_response["label"].strip().lower() == "n/a":
+                    metric_value = math.nan
+                else:
+                    metric_value = float(harm_response["label"])
+            except (ValueError, TypeError):
+                metric_value = math.nan
         elif "valid" in harm_response:
-            metric_value = 0 if harm_response["valid"] else
+            metric_value = 0 if harm_response["valid"] else math.nan
         else:
-            metric_value =
+            metric_value = math.nan
 
         # get reason
         if "reasoning" in harm_response:
@@ -296,31 +568,77 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
             reason = harm_response["reason"]
         else:
             reason = ""
+
+        # get token_usage
+        if "totalTokenCount" in harm_response:
+            total_tokens = harm_response["totalTokenCount"]
+        else:
+            total_tokens = 0
+        if "inputTokenCount" in harm_response:
+            prompt_tokens = harm_response["inputTokenCount"]
+        else:
+            prompt_tokens = 0
+        if "outputTokenCount" in harm_response:
+            completion_tokens = harm_response["outputTokenCount"]
+        else:
+            completion_tokens = 0
+
+        # get finish_reason
+        if "finish_reason" in harm_response:
+            finish_reason = harm_response["finish_reason"]
+        else:
+            finish_reason = ""
+
+        # get sample_input
+        if "sample_input" in harm_response:
+            sample_input = harm_response["sample_input"]
+        else:
+            sample_input = ""
+
+        # get sample_output
+        if "sample_output" in harm_response:
+            sample_output = harm_response["sample_output"]
+        else:
+            sample_output = ""
+
+        # get model
+        if "model" in harm_response:
+            model = harm_response["model"]
+        else:
+            model = ""
     elif harm_response != "" and isinstance(harm_response, str):
         metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value =
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value =
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value =
+        metric_value = math.nan
         reason = ""
 
     harm_score = metric_value
-
-
+    # We've already handled the "n/a" case by converting to math.nan
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
     result[key + "_score"] = harm_score
     result[key + "_reason"] = reason
+    result[key + "_total_tokens"] = total_tokens
+    result[key + "_prompt_tokens"] = prompt_tokens
+    result[key + "_completion_tokens"] = completion_tokens
+    result[key + "_finish_reason"] = finish_reason
+    result[key + "_sample_input"] = sample_input
+    result[key + "_sample_output"] = sample_output
+    result[key + "_model"] = model
 
     return result
 
@@ -337,31 +655,34 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)
 
-    async with
-        response = await client.get(
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
-            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )
 
     if response.status_code != 200:
-        msg =
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-
-
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )
 
     base_url = urlparse(response.json()["properties"]["discoveryUrl"])
     return f"{base_url.scheme}://{base_url.netloc}"
 
 
-async def get_rai_svc_url(project_scope:
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     """Get the Responsible AI service URL
 
     :param project_scope: The Azure AI project scope details.
@@ -385,7 +706,9 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     return rai_url
 
 
-async def fetch_or_reuse_token(
+async def fetch_or_reuse_token(
+    credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+) -> str:
     """Get token. Fetch a new token if the current token is near expiry
 
     :param credential: The Azure authentication credential.
@@ -395,58 +718,424 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
|
|
|
395
718
|
:type token: str
|
|
396
719
|
:return: The Azure authentication token.
|
|
397
720
|
"""
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
# Decode the token to get its expiration time
|
|
721
|
+
if token:
|
|
722
|
+
# Decode the token to get its expiration time
|
|
723
|
+
try:
|
|
402
724
|
decoded_token = jwt.decode(token, options={"verify_signature": False})
|
|
725
|
+
except jwt.PyJWTError:
|
|
726
|
+
pass
|
|
727
|
+
else:
|
|
403
728
|
exp_time = decoded_token["exp"]
|
|
404
729
|
current_time = time.time()
|
|
405
730
|
|
|
406
|
-
#
|
|
731
|
+
# Return current token if not near expiry
|
|
407
732
|
if (exp_time - current_time) >= 300:
|
|
408
|
-
|
|
409
|
-
except Exception: # pylint: disable=broad-exception-caught
|
|
410
|
-
pass
|
|
733
|
+
return token
|
|
411
734
|
|
|
412
|
-
|
|
413
|
-
token = credential.get_token("https://management.azure.com/.default").token
|
|
414
|
-
|
|
415
|
-
return token
|
|
735
|
+
return credential.get_token(workspace).token
|
|
416
736
|
|
|
417
737
|
|
|
async def evaluate_with_rai_service(
-
+    data: dict,
+    metric_name: str,
+    project_scope: Union[str, AzureAIProject],
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
+) -> Dict[str, Union[str, float]]:
+    """Evaluate the content safety of the response using Responsible AI service
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, annotation_task)
+        operation_id = await submit_request_onedp(
+            client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
+        )
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
+        return result
+    else:
+        # Get RAI service URL from discovery service and check service availability
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, annotation_task)
+
+        # Submit annotation request and fetch result
+        operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
+
+        return result
+
+
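The new `evaluate_with_rai_service` signature accepts either a OneDP project endpoint string or the classic `AzureAIProject` scope with subscription id, resource group, and project name. A hedged usage sketch for the classic path; the module path, metric value, and project details below are placeholders, not a documented public API:

```python
# Illustrative only: imports an internal helper whose module path is assumed.
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service  # path assumed


async def main() -> None:
    result = await evaluate_with_rai_service(
        data={"query": "What is the capital of France?", "response": "Paris."},
        metric_name="violence",  # placeholder metric name
        project_scope={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        credential=DefaultAzureCredential(),
    )
    # Expected shape (values illustrative): {"violence": "...", "violence_score": ..., "violence_reason": "..."}
    print(result)


asyncio.run(main())
```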
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
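As the branches above show, content-harm metrics keep `Tasks.CONTENT_HARM` and a `MetricList`, while `EvaluationMetrics.PROTECTED_MATERIAL` switches the task and drops the list. A small sketch of the two shapes; the import paths and the `"text"` content type are assumptions for illustration:

```python
from azure.ai.evaluation._common.constants import EvaluationMetrics  # path assumed
from azure.ai.evaluation._common.rai_service import generate_payload_multimodal  # path assumed

messages = [
    {"role": "user", "content": "Describe the image."},
    {"role": "assistant", "content": "A cat sitting on a sofa."},
]

# Harm metric: AnnotationTask stays Tasks.CONTENT_HARM and a MetricList is included.
harm_payload = generate_payload_multimodal("text", messages, EvaluationMetrics.VIOLENCE)
assert harm_payload["MetricList"] == [EvaluationMetrics.VIOLENCE]

# Protected material: AnnotationTask switches and MetricList is omitted.
pm_payload = generate_payload_multimodal("text", messages, EvaluationMetrics.PROTECTED_MATERIAL)
assert "MetricList" not in pm_payload
```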
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
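The helper accepts both plain dict messages and the strongly typed messages from the optional azure-ai-inference SDK, converting the latter with `as_dict()` before filtering out system turns. A standalone sketch of that normalization, assuming azure-ai-inference is installed:

```python
# Requires the optional 'azure-ai-inference' package; message contents are illustrative.
from azure.ai.inference.models import AssistantMessage, SystemMessage, UserMessage

typed_messages = [
    SystemMessage(content="You are a helpful assistant."),
    UserMessage(content="Describe the image."),
    AssistantMessage(content="A cat sitting on a sofa."),
]

# Strongly typed messages become plain dicts, mirroring the helper above.
messages = [m.as_dict() for m in typed_messages]

# System turns are excluded from the payload; assistant turns drive content-type detection.
filtered = [m for m in messages if m["role"] != "system"]
assistant_only = [m for m in messages if m["role"] == "assistant"]
print(filtered, assistant_only)
```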
+async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
+
+    # handle inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    ## fetch system and assistant messages from the list of messages
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+
+    ## prepare for request
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+    headers = get_common_headers(token)
+
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+
+    result = json.loads(response)
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+def _build_sync_eval_payload(
+    data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
+) -> Dict:
+    """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
+
+    :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The sync_eval payload ready to send to the API.
+    :rtype: Dict
+    """
+
+    # Build properties/metadata (scenario, category, taxonomy, etc.)
+    properties = {}
+    if data.get("scenario") is not None:
+        properties["scenario"] = data["scenario"]
+    if data.get("risk_sub_type") is not None:
+        properties["category"] = data["risk_sub_type"]
+    if data.get("taxonomy") is not None:
+        properties["taxonomy"] = str(data["taxonomy"])  # Ensure taxonomy is converted to string
+
+    # Prepare context if available
+    context = None
+    if data.get("context") is not None:
+        context = " ".join(c["content"] for c in data["context"]["contexts"])
+
+    # Build QueryResponseInlineMessage object
+    item_content = QueryResponseInlineMessage(
+        query=data.get("query", ""),
+        response=data.get("response", ""),
+        context=context,
+        tools=data.get("tool_calls"),
+        properties=properties if properties else None,
+    )
+
+    # Build the data mapping using mustache syntax {{item.field}}
+    data_mapping = {
+        "query": "{{item.query}}",
+        "response": "{{item.response}}",
+    }
+
+    # Create the sync eval input payload
+    # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
+    sync_eval_payload = {
+        "name": f"Safety Eval - {metric_name}",
+        "data_source": {
+            "type": "jsonl",
+            "source": {"type": "file_content", "content": {"item": item_content}},
+        },
+        "testing_criteria": [
+            {
+                "type": "azure_ai_evaluator",
+                "name": metric_name,
+                "evaluator_name": metric_name,
+                "data_mapping": data_mapping,
+            }
+        ],
+    }
+
+    return sync_eval_payload
+
+
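For a bare query/response pair with no scenario, taxonomy, or tool calls, the builder yields a single inline item plus one `azure_ai_evaluator` testing criterion. An approximate rendering of `_build_sync_eval_payload({"query": "Q", "response": "A"}, "violence", Tasks.CONTENT_HARM)`; the `item` is really a `QueryResponseInlineMessage` instance, shown here by its populated fields:

```python
approximate_payload = {
    "name": "Safety Eval - violence",
    "data_source": {
        "type": "jsonl",
        "source": {
            "type": "file_content",
            # "item" is a QueryResponseInlineMessage(query="Q", response="A",
            # context=None, tools=None, properties=None); fields shown inline.
            "content": {"item": {"query": "Q", "response": "A"}},
        },
    },
    "testing_criteria": [
        {
            "type": "azure_ai_evaluator",
            "name": "violence",
            "evaluator_name": "violence",
            "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"},
        }
    ],
}
```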
+def _parse_sync_eval_result(
+    eval_result, metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
+
+    :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
+    :param metric_name: The evaluation metric name.
+    :type metric_name: str
+    :param metric_display_name: The display name for the metric.
+    :type metric_display_name: Optional[str]
+    :return: The parsed result in standard format compatible with parse_response.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Handle EvalRunOutputItem structure
+    # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
+
+    display_name = metric_display_name or metric_name
+
+    # Handle both dict and object formats
+    if hasattr(eval_result, "results"):
+        results = eval_result.results
+    elif isinstance(eval_result, dict) and "results" in eval_result:
+        results = eval_result["results"]
+    else:
+        return {}
+
+    if not results or len(results) == 0:
+        return {}
+
+    # Find the result for our specific metric
+    target_result = None
+    for result_item in results:
+        if isinstance(result_item, dict):
+            if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
+                target_result = result_item
+                break
+        elif hasattr(result_item, "name") and result_item.name == metric_name:
+            target_result = result_item
+            break
+
+    if not target_result:
+        return {}
+
+    # Extract values from the result item
+    if isinstance(target_result, dict):
+        score = target_result.get("score", math.nan)
+        reason = target_result.get("reason", "")
+        # Also check properties.reasoning for additional reason text
+        if not reason and "properties" in target_result:
+            props = target_result["properties"]
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+    else:
+        score = getattr(target_result, "score", math.nan)
+        reason = getattr(target_result, "reason", "")
+        if not reason and hasattr(target_result, "properties"):
+            props = target_result.properties
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+
+    # Convert score to severity level using existing logic
+    harm_score = score if not math.isnan(score) else math.nan
+    severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
+
+    # Return in the standard format expected by the red team processor
+    return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
+
+
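Given an `EvalRunOutputItem`-shaped dict, the parser picks the entry matching the metric, keeps its score and reason, and maps the score to a severity label via `get_harm_severity_level`. A sketch with a hand-written result; the scores and reasons are illustrative, and `_parse_sync_eval_result` is assumed to be in scope from this module:

```python
# Hand-written EvalRunOutputItem-like dict; values are illustrative.
eval_result = {
    "results": [
        {"name": "violence", "score": 2.0, "reason": "Mild violent reference."},
        {"name": "self_harm", "score": 0.0, "reason": ""},
    ]
}

parsed = _parse_sync_eval_result(eval_result, "violence")
# Expected shape (label depends on get_harm_severity_level's banding):
# {"violence": "<severity label>", "violence_score": 2.0,
#  "violence_reason": "Mild violent reference."}
print(parsed)
```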
+async def evaluate_with_rai_service_sync(
+    data: dict,
+    metric_name: str,
+    project_scope: Union[str, AzureAIProject],
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
):
-    """
+    """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).

-
-
-
-
-
-
-
-
-
-
-
-
-
+    This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
+    approach. It's specifically designed for OneDP projects and provides better integration with
+    the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The EvalRunOutputItem containing the evaluation results.
+    :rtype: EvalRunOutputItem
+    :raises: EvaluationException if project_scope is not a OneDP project
    """
-
-
-
-
+    if not is_onedp_project(project_scope):
+        msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    client = AIProjectClient(
+        endpoint=project_scope,
+        credential=credential,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )

-    #
-
-
-
+    # Build the sync eval payload
+    sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
+    # Call sync_evals.create() with the JSON payload
+    eval_result = client.sync_evals.create(eval=sync_eval_payload)

-    #
-
-    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
-    result = parse_response(annotation_response, metric_name)
+    # Return the raw EvalRunOutputItem for downstream processing
+    return eval_result

-
+
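The sync variant requires a OneDP project endpoint and hands back the raw `EvalRunOutputItem`, which pairs naturally with `_parse_sync_eval_result` above. A hedged usage sketch; the endpoint format, metric value, and internal module path are assumptions:

```python
# Illustrative only: internal helpers, assumed module path, placeholder endpoint.
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.rai_service import (  # path assumed
    _parse_sync_eval_result,
    evaluate_with_rai_service_sync,
)


async def main() -> None:
    eval_item = await evaluate_with_rai_service_sync(
        data={"query": "How do I pick a lock?", "response": "I can't help with that."},
        metric_name="violence",  # placeholder metric name
        project_scope="https://<account>.services.ai.azure.com/api/projects/<project>",  # placeholder endpoint
        credential=DefaultAzureCredential(),
    )
    # Convert the raw EvalRunOutputItem into the usual {metric, score, reason} dict.
    print(_parse_sync_eval_result(eval_item, "violence"))


asyncio.run(main())
```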
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
+        operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
+    else:
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+        # Submit annotation request and fetch result
+        operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
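And a comparable sketch for the multimodal entry point against a classic hub-based project; the conversation content, metric value, and internal module path are placeholders:

```python
# Illustrative only: internal helper with an assumed module path and placeholder project details.
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal  # path assumed

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    },
    {"role": "assistant", "content": "A cat sitting on a sofa."},
]


async def main() -> None:
    result = await evaluate_with_rai_service_multimodal(
        messages=conversation,
        metric_name="violence",  # placeholder metric name
        project_scope={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        credential=DefaultAzureCredential(),
    )
    print(result)


asyncio.run(main())
```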