azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,8 @@ from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
 from string import Template
-from azure.ai.evaluation._common.onedp._client import AIProjectClient
+from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
+from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
 from azure.core.exceptions import HttpResponseError

 import jwt
@@ -411,6 +412,25 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
             result[pm_metric_name + "_reason"] = (
                 parsed_response["reasoning"] if "reasoning" in parsed_response else ""
             )
+            result[pm_metric_name + "_total_tokens"] = (
+                parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+            )
+            result[pm_metric_name + "_prompt_tokens"] = (
+                parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+            )
+            result[pm_metric_name + "_completion_tokens"] = (
+                parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+            )
+            result[pm_metric_name + "_finish_reason"] = (
+                parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+            )
+            result[pm_metric_name + "_sample_input"] = (
+                parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+            )
+            result[pm_metric_name + "_sample_output"] = (
+                parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+            )
+            result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
     if metric_name not in batch_response[0]:
         return {}
@@ -442,9 +462,39 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         # Add all attributes under the details.
         details = {}
         for key, value in parsed_response.items():
-            if key not in {"label", "reasoning", "version"}:
+            if key not in {
+                "label",
+                "reasoning",
+                "version",
+                "totalTokenCount",
+                "inputTokenCount",
+                "outputTokenCount",
+                "finish_reason",
+                "sample_input",
+                "sample_output",
+                "model",
+            }:
                 details[key.replace("-", "_")] = value
         result[metric_display_name + "_details"] = details
+        result[metric_display_name + "_total_tokens"] = (
+            parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_prompt_tokens"] = (
+            parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_completion_tokens"] = (
+            parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_finish_reason"] = (
+            parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_input"] = (
+            parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_output"] = (
+            parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+        )
+        result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
     return _parse_content_harm_response(batch_response, metric_name, metric_display_name)

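Note: the two parse_response hunks above attach token usage and annotator diagnostics to every metric result. A minimal sketch of the resulting key shape, assuming a hypothetical service response (the field-to-suffix mapping is taken from the hunks; the metric name and all values are illustrative, and the library itself uses explicit `"key" in parsed_response` checks rather than `.get`):

```python
# Field-to-result-key mapping added in the hunks above; everything else here is hypothetical.
service_field_to_suffix = {
    "totalTokenCount": "_total_tokens",
    "inputTokenCount": "_prompt_tokens",
    "outputTokenCount": "_completion_tokens",
    "finish_reason": "_finish_reason",
    "sample_input": "_sample_input",
    "sample_output": "_sample_output",
    "model": "_model",
}

parsed_response = {"label": False, "reasoning": "...", "totalTokenCount": 412, "inputTokenCount": 380}
metric = "protected_material"  # example metric name, not tied to this diff

result = {metric + suffix: parsed_response.get(field, "") for field, suffix in service_field_to_suffix.items()}
# result == {
#     "protected_material_total_tokens": 412,
#     "protected_material_prompt_tokens": 380,
#     "protected_material_completion_tokens": "",
#     "protected_material_finish_reason": "",
#     "protected_material_sample_input": "",
#     "protected_material_sample_output": "",
#     "protected_material_model": "",
# }
```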
@@ -484,6 +534,13 @@ def _parse_content_harm_response(
     except Exception:  # pylint: disable=broad-exception-caught
         harm_response = response[metric_name]

+    total_tokens = 0
+    prompt_tokens = 0
+    completion_tokens = 0
+    finish_reason = ""
+    sample_input = ""
+    sample_output = ""
+    model = ""
     if harm_response != "" and isinstance(harm_response, dict):
         # check if "output" is one key in harm_response
         if "output" in harm_response:
@@ -511,6 +568,44 @@ def _parse_content_harm_response(
             reason = harm_response["reason"]
         else:
             reason = ""
+
+        # get token_usage
+        if "totalTokenCount" in harm_response:
+            total_tokens = harm_response["totalTokenCount"]
+        else:
+            total_tokens = 0
+        if "inputTokenCount" in harm_response:
+            prompt_tokens = harm_response["inputTokenCount"]
+        else:
+            prompt_tokens = 0
+        if "outputTokenCount" in harm_response:
+            completion_tokens = harm_response["outputTokenCount"]
+        else:
+            completion_tokens = 0
+
+        # get finish_reason
+        if "finish_reason" in harm_response:
+            finish_reason = harm_response["finish_reason"]
+        else:
+            finish_reason = ""
+
+        # get sample_input
+        if "sample_input" in harm_response:
+            sample_input = harm_response["sample_input"]
+        else:
+            sample_input = ""
+
+        # get sample_output
+        if "sample_output" in harm_response:
+            sample_output = harm_response["sample_output"]
+        else:
+            sample_output = ""
+
+        # get model
+        if "model" in harm_response:
+            model = harm_response["model"]
+        else:
+            model = ""
     elif harm_response != "" and isinstance(harm_response, str):
         metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
         if metric_value_match:
@@ -537,6 +632,13 @@ def _parse_content_harm_response(
     result[key] = get_harm_severity_level(harm_score)
     result[key + "_score"] = harm_score
     result[key + "_reason"] = reason
+    result[key + "_total_tokens"] = total_tokens
+    result[key + "_prompt_tokens"] = prompt_tokens
+    result[key + "_completion_tokens"] = completion_tokens
+    result[key + "_finish_reason"] = finish_reason
+    result[key + "_sample_input"] = sample_input
+    result[key + "_sample_output"] = sample_output
+    result[key + "_model"] = model

     return result

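Note: the content-harm path now carries the same diagnostics per harm key, defaulting to 0 or "" when the service omits a field. A sketch of the enriched result for a single harm metric, with illustrative values only:

```python
# Illustrative result shape for one content-harm metric; severity, score, and reason existed
# before this release, the remaining keys are new. All values below are made up.
result = {
    "violence": "Very low",            # severity level derived from the numeric score
    "violence_score": 0,
    "violence_reason": "No violent content detected.",
    "violence_total_tokens": 412,      # new: token usage reported by the RAI service
    "violence_prompt_tokens": 380,
    "violence_completion_tokens": 32,
    "violence_finish_reason": "stop",
    "violence_sample_input": "",       # new: optional sample input/output echoed by the service
    "violence_sample_output": "",
    "violence_model": "",              # new: annotator model identifier, when reported
}
```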
@@ -802,6 +904,201 @@ async def submit_multimodal_request_onedp(client: AIProjectClient, messages, met
     return operation_id


+def _build_sync_eval_payload(
+    data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
+) -> Dict:
+    """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
+
+    :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The sync_eval payload ready to send to the API.
+    :rtype: Dict
+    """
+
+    # Build properties/metadata (category, taxonomy, etc.)
+    properties = {}
+    if data.get("risk_sub_type") is not None:
+        properties["category"] = data["risk_sub_type"]
+    if data.get("taxonomy") is not None:
+        properties["taxonomy"] = str(data["taxonomy"])  # Ensure taxonomy is converted to string
+
+    # Prepare context if available
+    context = None
+    if data.get("context") is not None:
+        context = " ".join(c["content"] for c in data["context"]["contexts"])
+
+    # Build QueryResponseInlineMessage object
+    item_content = QueryResponseInlineMessage(
+        query=data.get("query", ""),
+        response=data.get("response", ""),
+        context=context,
+        tools=data.get("tool_calls"),
+        properties=properties if properties else None,
+    )
+
+    # Build the data mapping using mustache syntax {{item.field}}
+    data_mapping = {
+        "query": "{{item.query}}",
+        "response": "{{item.response}}",
+    }
+
+    # Create the sync eval input payload
+    # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
+    sync_eval_payload = {
+        "name": f"Safety Eval - {metric_name}",
+        "data_source": {
+            "type": "jsonl",
+            "source": {"type": "file_content", "content": {"item": item_content}},
+        },
+        "testing_criteria": [
+            {
+                "type": "azure_ai_evaluator",
+                "name": metric_name,
+                "evaluator_name": metric_name,
+                "data_mapping": data_mapping,
+            }
+        ],
+    }
+
+    return sync_eval_payload
+
+
+def _parse_sync_eval_result(
+    eval_result, metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
+
+    :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
+    :param metric_name: The evaluation metric name.
+    :type metric_name: str
+    :param metric_display_name: The display name for the metric.
+    :type metric_display_name: Optional[str]
+    :return: The parsed result in standard format compatible with parse_response.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Handle EvalRunOutputItem structure
+    # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
+
+    display_name = metric_display_name or metric_name
+
+    # Handle both dict and object formats
+    if hasattr(eval_result, "results"):
+        results = eval_result.results
+    elif isinstance(eval_result, dict) and "results" in eval_result:
+        results = eval_result["results"]
+    else:
+        return {}
+
+    if not results or len(results) == 0:
+        return {}
+
+    # Find the result for our specific metric
+    target_result = None
+    for result_item in results:
+        if isinstance(result_item, dict):
+            if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
+                target_result = result_item
+                break
+        elif hasattr(result_item, "name") and result_item.name == metric_name:
+            target_result = result_item
+            break
+
+    if not target_result:
+        return {}
+
+    # Extract values from the result item
+    if isinstance(target_result, dict):
+        score = target_result.get("score", math.nan)
+        reason = target_result.get("reason", "")
+        # Also check properties.reasoning for additional reason text
+        if not reason and "properties" in target_result:
+            props = target_result["properties"]
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+    else:
+        score = getattr(target_result, "score", math.nan)
+        reason = getattr(target_result, "reason", "")
+        if not reason and hasattr(target_result, "properties"):
+            props = target_result.properties
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+
+    # Convert score to severity level using existing logic
+    harm_score = score if not math.isnan(score) else math.nan
+    severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
+
+    # Return in the standard format expected by the red team processor
+    return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
+
+
+async def evaluate_with_rai_service_sync(
+    data: dict,
+    metric_name: str,
+    project_scope: Union[str, AzureAIProject],
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
+):
+    """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).
+
+    This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
+    approach. It's specifically designed for OneDP projects and provides better integration with
+    the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The EvalRunOutputItem containing the evaluation results.
+    :rtype: EvalRunOutputItem
+    :raises: EvaluationException if project_scope is not a OneDP project
+    """
+    if not is_onedp_project(project_scope):
+        msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    client = AIProjectClient(
+        endpoint=project_scope,
+        credential=credential,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )
+
+    # Build the sync eval payload
+    sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
+    # Call sync_evals.create() with the JSON payload
+    eval_result = client.sync_evals.create(eval=sync_eval_payload)
+
+    # Return the raw EvalRunOutputItem for downstream processing
+    return eval_result
+
+
 async def evaluate_with_rai_service_multimodal(
     messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
 ):