azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py
@@ -12,16 +12,21 @@ from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
 from string import Template
+from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
+from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
+from azure.core.exceptions import HttpResponseError
 
 import jwt
 
-from
+from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._user_agent import UserAgentSingleton
+from azure.ai.evaluation._common.utils import is_onedp_project
 from azure.core.credentials import TokenCredential
 from azure.core.exceptions import HttpResponseError
-from azure.core.pipeline.policies import AsyncRetryPolicy
+from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy
 
 from .constants import (
     CommonConstants,
@@ -32,15 +37,14 @@ from .constants import (
 )
 from .utils import get_harm_severity_level, retrieve_content_type
 
-try:
-    version = importlib.metadata.version("azure-ai-evaluation")
-except importlib.metadata.PackageNotFoundError:
-    version = "unknown"
-USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
 }
+ML_WORKSPACE = "https://management.azure.com/.default"
+COG_SRV_WORKSPACE = "https://ai.azure.com/.default"
+
+INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"
 
 
 def get_formatted_template(data: dict, annotation_task: str) -> str:
@@ -64,6 +68,16 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
             "context": data.get("context", ""),
         }
         return json.dumps(as_dict)
+    if annotation_task == Tasks.CODE_VULNERABILITY:
+        as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
+        return json.dumps(as_dict)
+    if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
+        as_dict = {
+            "query": data.get("query", ""),
+            "response": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
     as_dict = {
         "query": html.escape(data.get("query", "")),
         "response": html.escape(data.get("response", "")),
@@ -72,21 +86,24 @@
     return user_text.replace("'", '\\"')
 
 
-def get_common_headers(token: str) -> Dict:
+def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
     """Get common headers for the HTTP request
 
     :param token: The Azure authentication token.
     :type token: str
+    :param evaluator_name: The evaluator name. Default is None.
+    :type evaluator_name: str
     :return: The common headers.
     :rtype: Dict
     """
+    user_agent = (
+        f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+        if evaluator_name
+        else UserAgentSingleton().value
+    )
     return {
         "Authorization": f"Bearer {token}",
-        "
-        "User-Agent": USER_AGENT,
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        "Connection": "close",
+        "User-Agent": user_agent,
     }
 
 
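
Note on the hunk above: the 1.0.1 module-level USER_AGENT constant (and the "Connection": "close" workaround header) is gone; the header value now comes from UserAgentSingleton and is tagged with the calling evaluator. A minimal sketch of the tagging logic, with a hard-coded stand-in for UserAgentSingleton().value:

    from typing import Dict, Optional

    BASE_UA = "azure-ai-evaluation/1.13.3"  # stand-in for UserAgentSingleton().value

    def build_headers(token: str, evaluator_name: Optional[str] = None) -> Dict[str, str]:
        # Append an evaluator subtype tag only when a name is supplied.
        user_agent = (
            f"{BASE_UA} (type=evaluator; subtype={evaluator_name})"
            if evaluator_name
            else BASE_UA
        )
        return {"Authorization": f"Bearer {token}", "User-Agent": user_agent}

    print(build_headers("<token>", "ViolenceEvaluator")["User-Agent"])
    # azure-ai-evaluation/1.13.3 (type=evaluator; subtype=ViolenceEvaluator)
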
@@ -96,6 +113,34 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
     )
 
 
+async def ensure_service_availability_onedp(
+    client: AIProjectClient, token: str, capability: Optional[str] = None
+) -> None:
+    """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param token: The Azure authentication token.
+    :type token: str
+    :param capability: The capability to check. Default is None.
+    :type capability: str
+    :raises Exception: If the service is not available in the region or the capability is not available.
+    """
+    headers = get_common_headers(token)
+    capabilities = client.evaluations.check_annotation(headers=headers)
+
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+
 async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
 
@@ -157,6 +202,8 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
     task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
         include_metric = False
+    elif metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+        include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
@@ -175,7 +222,9 @@
     )
 
 
-async def submit_request(
+async def submit_request(
+    data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
     :param data: The data to evaluate.
@@ -188,6 +237,8 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     :type token: str
     :param annotation_task: The annotation task to use.
     :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
     :return: The operation ID.
     :rtype: str
     """
@@ -195,7 +246,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
-    headers = get_common_headers(token)
+    headers = get_common_headers(token, evaluator_name)
 
     async with get_async_http_client_with_timeout() as client:
         http_response = await client.post(url, json=payload, headers=headers)
@@ -208,6 +259,45 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     return operation_id
 
 
+async def submit_request_onedp(
+    client: AIProjectClient,
+    data: dict,
+    metric: str,
+    token: str,
+    annotation_task: str,
+    evaluator_name: str,
+    scan_session_id: Optional[str] = None,
+) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The operation ID.
+    :rtype: str
+    """
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+    headers = get_common_headers(token, evaluator_name)
+    if scan_session_id:
+        headers["x-ms-client-request-id"] = scan_session_id
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+    result = json.loads(response)
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
 async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
     """Fetch the annotation result from Responsible AI service
 
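
Both the legacy and OneDP submit paths read the operation ID as the last path segment of the "location" field in the service response. A short illustration with an invented response body:

    import json

    raw = '{"location": "https://example.invalid/operations/1234-abcd"}'  # hypothetical body
    operation_id = json.loads(raw)["location"].split("/")[-1]
    print(operation_id)  # 1234-abcd
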
@@ -230,8 +320,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         token = await fetch_or_reuse_token(credential, token)
         headers = get_common_headers(token)
 
-        async with
-            response = await client.get(url, headers=headers)
+        async with get_async_http_client() as client:
+            response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)
 
         if response.status_code == 200:
             return response.json()
@@ -245,6 +335,37 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         await asyncio.sleep(sleep_time)
 
 
+async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
+    """Fetch the annotation result from Responsible AI service
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param operation_id: The operation ID.
+    :type operation_id: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The annotation result.
+    :rtype: Dict
+    """
+    start = time.time()
+    request_count = 0
+
+    while True:
+        headers = get_common_headers(token)
+        try:
+            return client.evaluations.operation_results(operation_id, headers=headers)
+        except HttpResponseError:
+            request_count += 1
+            time_elapsed = time.time() - start
+            if time_elapsed > RAIService.TIMEOUT:
+                raise TimeoutError(
+                    f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+                )
+
+            sleep_time = RAIService.SLEEP_TIME**request_count
+            await asyncio.sleep(sleep_time)
+
+
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
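
fetch_result_onedp retries on HttpResponseError with a delay of RAIService.SLEEP_TIME ** request_count until RAIService.TIMEOUT is exceeded. Assuming a sleep base of 5 seconds (the actual constants live in azure/ai/evaluation/_common/constants.py and may differ), the delays grow like this:

    SLEEP_TIME = 5  # assumed value of RAIService.SLEEP_TIME

    for request_count in range(1, 5):
        print(request_count, SLEEP_TIME**request_count)
    # 1 5
    # 2 25
    # 3 125
    # 4 625
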
@@ -267,10 +388,19 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         EvaluationMetrics.PROTECTED_MATERIAL,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     }:
         result = {}
         if not batch_response or len(batch_response[0]) == 0:
             return {}
+        if (
+            metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+            and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+        ):
+            batch_response[0] = {
+                EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+            }
         if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
             pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
             for pm_metric_name in pm_metric_names:
@@ -282,6 +412,25 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
                 result[pm_metric_name + "_reason"] = (
                     parsed_response["reasoning"] if "reasoning" in parsed_response else ""
                 )
+                result[pm_metric_name + "_total_tokens"] = (
+                    parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_prompt_tokens"] = (
+                    parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_completion_tokens"] = (
+                    parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_finish_reason"] = (
+                    parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_input"] = (
+                    parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_output"] = (
+                    parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+                )
+                result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
     if metric_name not in batch_response[0]:
         return {}
@@ -306,6 +455,46 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         result[metric_display_name + "_information_gathering"] = (
             parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
         )
+        if (
+            metric_name == EvaluationMetrics.CODE_VULNERABILITY
+            or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+        ):
+            # Add all attributes under the details.
+            details = {}
+            for key, value in parsed_response.items():
+                if key not in {
+                    "label",
+                    "reasoning",
+                    "version",
+                    "totalTokenCount",
+                    "inputTokenCount",
+                    "outputTokenCount",
+                    "finish_reason",
+                    "sample_input",
+                    "sample_output",
+                    "model",
+                }:
+                    details[key.replace("-", "_")] = value
+            result[metric_display_name + "_details"] = details
+        result[metric_display_name + "_total_tokens"] = (
+            parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_prompt_tokens"] = (
+            parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_completion_tokens"] = (
+            parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_finish_reason"] = (
+            parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_input"] = (
+            parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_output"] = (
+            parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+        )
+        result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
     return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
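
The new "_details" entry keeps every key the annotation service returns except the known bookkeeping fields, normalizing dashes to underscores. Restated as a standalone helper (the sample payload is invented):

    BOOKKEEPING = {
        "label", "reasoning", "version", "totalTokenCount", "inputTokenCount",
        "outputTokenCount", "finish_reason", "sample_input", "sample_output", "model",
    }

    def extract_details(parsed_response: dict) -> dict:
        # Keep only service-specific attributes, with dashes normalized to underscores.
        return {k.replace("-", "_"): v for k, v in parsed_response.items() if k not in BOOKKEEPING}

    print(extract_details({"label": 0, "code-injection": True, "reasoning": "..."}))
    # {'code_injection': True}
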
@@ -345,6 +534,13 @@ def _parse_content_harm_response(
     except Exception:  # pylint: disable=broad-exception-caught
         harm_response = response[metric_name]
 
+    total_tokens = 0
+    prompt_tokens = 0
+    completion_tokens = 0
+    finish_reason = ""
+    sample_input = ""
+    sample_output = ""
+    model = ""
     if harm_response != "" and isinstance(harm_response, dict):
         # check if "output" is one key in harm_response
         if "output" in harm_response:
@@ -352,7 +548,14 @@ def _parse_content_harm_response(
 
         # get content harm metric_value
         if "label" in harm_response:
-
+            try:
+                # Handle "n/a" or other non-numeric values
+                if isinstance(harm_response["label"], str) and harm_response["label"].strip().lower() == "n/a":
+                    metric_value = math.nan
+                else:
+                    metric_value = float(harm_response["label"])
+            except (ValueError, TypeError):
+                metric_value = math.nan
         elif "valid" in harm_response:
             metric_value = 0 if harm_response["valid"] else math.nan
         else:
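
The replacement label handling is equivalent to this small helper: the literal string "n/a" and any value float() rejects both map to math.nan instead of raising:

    import math

    def parse_label(label) -> float:
        try:
            # "n/a" is an expected non-numeric answer from the service.
            if isinstance(label, str) and label.strip().lower() == "n/a":
                return math.nan
            return float(label)
        except (ValueError, TypeError):
            return math.nan

    print(parse_label("3"), parse_label("n/a"), parse_label(None))  # 3.0 nan nan
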
@@ -365,6 +568,44 @@ def _parse_content_harm_response(
             reason = harm_response["reason"]
         else:
             reason = ""
+
+        # get token_usage
+        if "totalTokenCount" in harm_response:
+            total_tokens = harm_response["totalTokenCount"]
+        else:
+            total_tokens = 0
+        if "inputTokenCount" in harm_response:
+            prompt_tokens = harm_response["inputTokenCount"]
+        else:
+            prompt_tokens = 0
+        if "outputTokenCount" in harm_response:
+            completion_tokens = harm_response["outputTokenCount"]
+        else:
+            completion_tokens = 0
+
+        # get finish_reason
+        if "finish_reason" in harm_response:
+            finish_reason = harm_response["finish_reason"]
+        else:
+            finish_reason = ""
+
+        # get sample_input
+        if "sample_input" in harm_response:
+            sample_input = harm_response["sample_input"]
+        else:
+            sample_input = ""
+
+        # get sample_output
+        if "sample_output" in harm_response:
+            sample_output = harm_response["sample_output"]
+        else:
+            sample_output = ""
+
+        # get model
+        if "model" in harm_response:
+            model = harm_response["model"]
+        else:
+            model = ""
     elif harm_response != "" and isinstance(harm_response, str):
         metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
         if metric_value_match:
@@ -383,8 +624,7 @@ def _parse_content_harm_response(
         reason = ""
 
     harm_score = metric_value
-
-        return result
+    # We've already handled the "n/a" case by converting to math.nan
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -392,6 +632,13 @@ def _parse_content_harm_response(
     result[key] = get_harm_severity_level(harm_score)
     result[key + "_score"] = harm_score
     result[key + "_reason"] = reason
+    result[key + "_total_tokens"] = total_tokens
+    result[key + "_prompt_tokens"] = prompt_tokens
+    result[key + "_completion_tokens"] = completion_tokens
+    result[key + "_finish_reason"] = finish_reason
+    result[key + "_sample_input"] = sample_input
+    result[key + "_sample_output"] = sample_output
+    result[key + "_model"] = model
 
     return result
 
@@ -459,7 +706,9 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     return rai_url
 
 
-async def fetch_or_reuse_token(
+async def fetch_or_reuse_token(
+    credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+) -> str:
     """Get token. Fetch a new token if the current token is near expiry
 
     :param credential: The Azure authentication credential.
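
fetch_or_reuse_token now takes the token scope as a parameter: legacy workspace-based projects keep the ARM scope (the default), while OneDP project endpoints use the ai.azure.com scope. In outline:

    ML_WORKSPACE = "https://management.azure.com/.default"  # legacy projects (default)
    COG_SRV_WORKSPACE = "https://ai.azure.com/.default"     # OneDP project endpoints

    # token = credential.get_token(ML_WORKSPACE).token       # legacy path
    # token = credential.get_token(COG_SRV_WORKSPACE).token  # OneDP path
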
@@ -483,47 +732,68 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]
     if (exp_time - current_time) >= 300:
         return token
 
-    return credential.get_token(
+    return credential.get_token(workspace).token
 
 
 async def evaluate_with_rai_service(
     data: dict,
     metric_name: str,
-    project_scope: AzureAIProject,
+    project_scope: Union[str, AzureAIProject],
     credential: TokenCredential,
     annotation_task: str = Tasks.CONTENT_HARM,
     metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
 ) -> Dict[str, Union[str, float]]:
-    """
+    """Evaluate the content safety of the response using Responsible AI service
 
-
-
-
-
-
-
-
-        ~azure.core.credentials.TokenCredential
-
-
-
-
-
-
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
     """
 
-
-
-
-
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, annotation_task)
+        operation_id = await submit_request_onedp(
+            client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
+        )
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
+        return result
+    else:
+        # Get RAI service URL from discovery service and check service availability
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, annotation_task)
 
-
-
-
-
+        # Submit annotation request and fetch result
+        operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
 
-
+        return result
 
 
 def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
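
With project_scope widened to Union[str, AzureAIProject], the same entry point serves both project styles and is_onedp_project picks the branch. A hedged usage sketch; the endpoint URL and project values below are placeholders, and azure-identity must be installed for DefaultAzureCredential:

    from azure.identity import DefaultAzureCredential

    onedp_project = "https://<resource>.services.ai.azure.com/api/projects/<project>"  # placeholder
    legacy_project = {
        "subscription_id": "<sub-id>",
        "resource_group_name": "<rg>",
        "project_name": "<name>",
    }

    # Inside an async context:
    # result = await evaluate_with_rai_service(
    #     data={"query": "...", "response": "..."},
    #     metric_name="violence",
    #     project_scope=onedp_project,  # or legacy_project
    #     credential=DefaultAzureCredential(),
    # )
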
@@ -604,29 +874,268 @@ async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, tok
     return operation_id
 
 
+async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
+
+    # handle inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    ## fetch system and assistant messages from the list of messages
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+
+    ## prepare for request
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+    headers = get_common_headers(token)
+
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+
+    result = json.loads(response)
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+def _build_sync_eval_payload(
+    data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
+) -> Dict:
+    """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
+
+    :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The sync_eval payload ready to send to the API.
+    :rtype: Dict
+    """
+
+    # Build properties/metadata (scenario, category, taxonomy, etc.)
+    properties = {}
+    if data.get("scenario") is not None:
+        properties["scenario"] = data["scenario"]
+    if data.get("risk_sub_type") is not None:
+        properties["category"] = data["risk_sub_type"]
+    if data.get("taxonomy") is not None:
+        properties["taxonomy"] = str(data["taxonomy"])  # Ensure taxonomy is converted to string
+
+    # Prepare context if available
+    context = None
+    if data.get("context") is not None:
+        context = " ".join(c["content"] for c in data["context"]["contexts"])
+
+    # Build QueryResponseInlineMessage object
+    item_content = QueryResponseInlineMessage(
+        query=data.get("query", ""),
+        response=data.get("response", ""),
+        context=context,
+        tools=data.get("tool_calls"),
+        properties=properties if properties else None,
+    )
+
+    # Build the data mapping using mustache syntax {{item.field}}
+    data_mapping = {
+        "query": "{{item.query}}",
+        "response": "{{item.response}}",
+    }
+
+    # Create the sync eval input payload
+    # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
+    sync_eval_payload = {
+        "name": f"Safety Eval - {metric_name}",
+        "data_source": {
+            "type": "jsonl",
+            "source": {"type": "file_content", "content": {"item": item_content}},
+        },
+        "testing_criteria": [
+            {
+                "type": "azure_ai_evaluator",
+                "name": metric_name,
+                "evaluator_name": metric_name,
+                "data_mapping": data_mapping,
+            }
+        ],
+    }
+
+    return sync_eval_payload
+
+
+def _parse_sync_eval_result(
+    eval_result, metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
+
+    :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
+    :param metric_name: The evaluation metric name.
+    :type metric_name: str
+    :param metric_display_name: The display name for the metric.
+    :type metric_display_name: Optional[str]
+    :return: The parsed result in standard format compatible with parse_response.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Handle EvalRunOutputItem structure
+    # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
+
+    display_name = metric_display_name or metric_name
+
+    # Handle both dict and object formats
+    if hasattr(eval_result, "results"):
+        results = eval_result.results
+    elif isinstance(eval_result, dict) and "results" in eval_result:
+        results = eval_result["results"]
+    else:
+        return {}
+
+    if not results or len(results) == 0:
+        return {}
+
+    # Find the result for our specific metric
+    target_result = None
+    for result_item in results:
+        if isinstance(result_item, dict):
+            if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
+                target_result = result_item
+                break
+        elif hasattr(result_item, "name") and result_item.name == metric_name:
+            target_result = result_item
+            break
+
+    if not target_result:
+        return {}
+
+    # Extract values from the result item
+    if isinstance(target_result, dict):
+        score = target_result.get("score", math.nan)
+        reason = target_result.get("reason", "")
+        # Also check properties.reasoning for additional reason text
+        if not reason and "properties" in target_result:
+            props = target_result["properties"]
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+    else:
+        score = getattr(target_result, "score", math.nan)
+        reason = getattr(target_result, "reason", "")
+        if not reason and hasattr(target_result, "properties"):
+            props = target_result.properties
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+
+    # Convert score to severity level using existing logic
+    harm_score = score if not math.isnan(score) else math.nan
+    severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
+
+    # Return in the standard format expected by the red team processor
+    return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
+
+
+async def evaluate_with_rai_service_sync(
+    data: dict,
+    metric_name: str,
+    project_scope: Union[str, AzureAIProject],
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
+):
+    """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).
+
+    This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
+    approach. It's specifically designed for OneDP projects and provides better integration with
+    the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The EvalRunOutputItem containing the evaluation results.
+    :rtype: EvalRunOutputItem
+    :raises: EvaluationException if project_scope is not a OneDP project
+    """
+    if not is_onedp_project(project_scope):
+        msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    client = AIProjectClient(
+        endpoint=project_scope,
+        credential=credential,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )
+
+    # Build the sync eval payload
+    sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
+    # Call sync_evals.create() with the JSON payload
+    eval_result = client.sync_evals.create(eval=sync_eval_payload)
+
+    # Return the raw EvalRunOutputItem for downstream processing
+    return eval_result
+
+
 async def evaluate_with_rai_service_multimodal(
-    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
 ):
     """ "Evaluate the content safety of the response using Responsible AI service
-
-
-
-
-
-
-
-
-        ~azure.core.credentials.TokenCredential
-
-
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
     """
 
-
-
-
-
-
-
-
-
-
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
+        operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
+    else:
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+        # Submit annotation request and fetch result
+        operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
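
For reference, the payload assembled by _build_sync_eval_payload reduces to the shape below once the QueryResponseInlineMessage item is serialized (metric name and item values here are illustrative):

    sync_eval_payload = {
        "name": "Safety Eval - violence",
        "data_source": {
            "type": "jsonl",
            "source": {"type": "file_content", "content": {"item": {"query": "...", "response": "..."}}},
        },
        "testing_criteria": [
            {
                "type": "azure_ai_evaluator",
                "name": "violence",
                "evaluator_name": "violence",
                "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"},
            }
        ],
    }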