azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,8 @@ from ast import literal_eval
12
12
  from typing import Dict, List, Optional, Union, cast
13
13
  from urllib.parse import urlparse
14
14
  from string import Template
15
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
15
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
16
+ from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
16
17
  from azure.core.exceptions import HttpResponseError
17
18
 
18
19
  import jwt
@@ -411,6 +412,25 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
411
412
  result[pm_metric_name + "_reason"] = (
412
413
  parsed_response["reasoning"] if "reasoning" in parsed_response else ""
413
414
  )
415
+ result[pm_metric_name + "_total_tokens"] = (
416
+ parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
417
+ )
418
+ result[pm_metric_name + "_prompt_tokens"] = (
419
+ parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
420
+ )
421
+ result[pm_metric_name + "_completion_tokens"] = (
422
+ parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
423
+ )
424
+ result[pm_metric_name + "_finish_reason"] = (
425
+ parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
426
+ )
427
+ result[pm_metric_name + "_sample_input"] = (
428
+ parsed_response["sample_input"] if "sample_input" in parsed_response else ""
429
+ )
430
+ result[pm_metric_name + "_sample_output"] = (
431
+ parsed_response["sample_output"] if "sample_output" in parsed_response else ""
432
+ )
433
+ result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
414
434
  return result
415
435
  if metric_name not in batch_response[0]:
416
436
  return {}
@@ -442,9 +462,39 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
442
462
  # Add all attributes under the details.
443
463
  details = {}
444
464
  for key, value in parsed_response.items():
445
- if key not in {"label", "reasoning", "version"}:
465
+ if key not in {
466
+ "label",
467
+ "reasoning",
468
+ "version",
469
+ "totalTokenCount",
470
+ "inputTokenCount",
471
+ "outputTokenCount",
472
+ "finish_reason",
473
+ "sample_input",
474
+ "sample_output",
475
+ "model",
476
+ }:
446
477
  details[key.replace("-", "_")] = value
447
478
  result[metric_display_name + "_details"] = details
479
+ result[metric_display_name + "_total_tokens"] = (
480
+ parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
481
+ )
482
+ result[metric_display_name + "_prompt_tokens"] = (
483
+ parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
484
+ )
485
+ result[metric_display_name + "_completion_tokens"] = (
486
+ parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
487
+ )
488
+ result[metric_display_name + "_finish_reason"] = (
489
+ parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
490
+ )
491
+ result[metric_display_name + "_sample_input"] = (
492
+ parsed_response["sample_input"] if "sample_input" in parsed_response else ""
493
+ )
494
+ result[metric_display_name + "_sample_output"] = (
495
+ parsed_response["sample_output"] if "sample_output" in parsed_response else ""
496
+ )
497
+ result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
448
498
  return result
449
499
  return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
450
500
 
@@ -484,6 +534,13 @@ def _parse_content_harm_response(
484
534
  except Exception: # pylint: disable=broad-exception-caught
485
535
  harm_response = response[metric_name]
486
536
 
537
+ total_tokens = 0
538
+ prompt_tokens = 0
539
+ completion_tokens = 0
540
+ finish_reason = ""
541
+ sample_input = ""
542
+ sample_output = ""
543
+ model = ""
487
544
  if harm_response != "" and isinstance(harm_response, dict):
488
545
  # check if "output" is one key in harm_response
489
546
  if "output" in harm_response:
@@ -511,6 +568,44 @@ def _parse_content_harm_response(
511
568
  reason = harm_response["reason"]
512
569
  else:
513
570
  reason = ""
571
+
572
+ # get token_usage
573
+ if "totalTokenCount" in harm_response:
574
+ total_tokens = harm_response["totalTokenCount"]
575
+ else:
576
+ total_tokens = 0
577
+ if "inputTokenCount" in harm_response:
578
+ prompt_tokens = harm_response["inputTokenCount"]
579
+ else:
580
+ prompt_tokens = 0
581
+ if "outputTokenCount" in harm_response:
582
+ completion_tokens = harm_response["outputTokenCount"]
583
+ else:
584
+ completion_tokens = 0
585
+
586
+ # get finish_reason
587
+ if "finish_reason" in harm_response:
588
+ finish_reason = harm_response["finish_reason"]
589
+ else:
590
+ finish_reason = ""
591
+
592
+ # get sample_input
593
+ if "sample_input" in harm_response:
594
+ sample_input = harm_response["sample_input"]
595
+ else:
596
+ sample_input = ""
597
+
598
+ # get sample_output
599
+ if "sample_output" in harm_response:
600
+ sample_output = harm_response["sample_output"]
601
+ else:
602
+ sample_output = ""
603
+
604
+ # get model
605
+ if "model" in harm_response:
606
+ model = harm_response["model"]
607
+ else:
608
+ model = ""
514
609
  elif harm_response != "" and isinstance(harm_response, str):
515
610
  metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
516
611
  if metric_value_match:
@@ -537,6 +632,13 @@ def _parse_content_harm_response(
537
632
  result[key] = get_harm_severity_level(harm_score)
538
633
  result[key + "_score"] = harm_score
539
634
  result[key + "_reason"] = reason
635
+ result[key + "_total_tokens"] = total_tokens
636
+ result[key + "_prompt_tokens"] = prompt_tokens
637
+ result[key + "_completion_tokens"] = completion_tokens
638
+ result[key + "_finish_reason"] = finish_reason
639
+ result[key + "_sample_input"] = sample_input
640
+ result[key + "_sample_output"] = sample_output
641
+ result[key + "_model"] = model
540
642
 
541
643
  return result
542
644
 
@@ -802,6 +904,201 @@ async def submit_multimodal_request_onedp(client: AIProjectClient, messages, met
802
904
  return operation_id
803
905
 
804
906
 
907
+ def _build_sync_eval_payload(
908
+ data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
909
+ ) -> Dict:
910
+ """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
911
+
912
+ :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
913
+ :type data: dict
914
+ :param metric_name: The evaluation metric to use.
915
+ :type metric_name: str
916
+ :param annotation_task: The annotation task to use.
917
+ :type annotation_task: str
918
+ :param scan_session_id: The scan session ID to use for the evaluation.
919
+ :type scan_session_id: Optional[str]
920
+ :return: The sync_eval payload ready to send to the API.
921
+ :rtype: Dict
922
+ """
923
+
924
+ # Build properties/metadata (category, taxonomy, etc.)
925
+ properties = {}
926
+ if data.get("risk_sub_type") is not None:
927
+ properties["category"] = data["risk_sub_type"]
928
+ if data.get("taxonomy") is not None:
929
+ properties["taxonomy"] = str(data["taxonomy"]) # Ensure taxonomy is converted to string
930
+
931
+ # Prepare context if available
932
+ context = None
933
+ if data.get("context") is not None:
934
+ context = " ".join(c["content"] for c in data["context"]["contexts"])
935
+
936
+ # Build QueryResponseInlineMessage object
937
+ item_content = QueryResponseInlineMessage(
938
+ query=data.get("query", ""),
939
+ response=data.get("response", ""),
940
+ context=context,
941
+ tools=data.get("tool_calls"),
942
+ properties=properties if properties else None,
943
+ )
944
+
945
+ # Build the data mapping using mustache syntax {{item.field}}
946
+ data_mapping = {
947
+ "query": "{{item.query}}",
948
+ "response": "{{item.response}}",
949
+ }
950
+
951
+ # Create the sync eval input payload
952
+ # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
953
+ sync_eval_payload = {
954
+ "name": f"Safety Eval - {metric_name}",
955
+ "data_source": {
956
+ "type": "jsonl",
957
+ "source": {"type": "file_content", "content": {"item": item_content}},
958
+ },
959
+ "testing_criteria": [
960
+ {
961
+ "type": "azure_ai_evaluator",
962
+ "name": metric_name,
963
+ "evaluator_name": metric_name,
964
+ "data_mapping": data_mapping,
965
+ }
966
+ ],
967
+ }
968
+
969
+ return sync_eval_payload
970
+
971
+
972
+ def _parse_sync_eval_result(
973
+ eval_result, metric_name: str, metric_display_name: Optional[str] = None
974
+ ) -> Dict[str, Union[str, float]]:
975
+ """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
976
+
977
+ :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
978
+ :param metric_name: The evaluation metric name.
979
+ :type metric_name: str
980
+ :param metric_display_name: The display name for the metric.
981
+ :type metric_display_name: Optional[str]
982
+ :return: The parsed result in standard format compatible with parse_response.
983
+ :rtype: Dict[str, Union[str, float]]
984
+ """
985
+ # Handle EvalRunOutputItem structure
986
+ # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
987
+
988
+ display_name = metric_display_name or metric_name
989
+
990
+ # Handle both dict and object formats
991
+ if hasattr(eval_result, "results"):
992
+ results = eval_result.results
993
+ elif isinstance(eval_result, dict) and "results" in eval_result:
994
+ results = eval_result["results"]
995
+ else:
996
+ return {}
997
+
998
+ if not results or len(results) == 0:
999
+ return {}
1000
+
1001
+ # Find the result for our specific metric
1002
+ target_result = None
1003
+ for result_item in results:
1004
+ if isinstance(result_item, dict):
1005
+ if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
1006
+ target_result = result_item
1007
+ break
1008
+ elif hasattr(result_item, "name") and result_item.name == metric_name:
1009
+ target_result = result_item
1010
+ break
1011
+
1012
+ if not target_result:
1013
+ return {}
1014
+
1015
+ # Extract values from the result item
1016
+ if isinstance(target_result, dict):
1017
+ score = target_result.get("score", math.nan)
1018
+ reason = target_result.get("reason", "")
1019
+ # Also check properties.reasoning for additional reason text
1020
+ if not reason and "properties" in target_result:
1021
+ props = target_result["properties"]
1022
+ if isinstance(props, dict):
1023
+ reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
1024
+ else:
1025
+ score = getattr(target_result, "score", math.nan)
1026
+ reason = getattr(target_result, "reason", "")
1027
+ if not reason and hasattr(target_result, "properties"):
1028
+ props = target_result.properties
1029
+ if isinstance(props, dict):
1030
+ reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
1031
+
1032
+ # Convert score to severity level using existing logic
1033
+ harm_score = score if not math.isnan(score) else math.nan
1034
+ severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
1035
+
1036
+ # Return in the standard format expected by the red team processor
1037
+ return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
1038
+
1039
+
1040
+ async def evaluate_with_rai_service_sync(
1041
+ data: dict,
1042
+ metric_name: str,
1043
+ project_scope: Union[str, AzureAIProject],
1044
+ credential: TokenCredential,
1045
+ annotation_task: str = Tasks.CONTENT_HARM,
1046
+ metric_display_name=None,
1047
+ evaluator_name=None,
1048
+ scan_session_id: Optional[str] = None,
1049
+ ):
1050
+ """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).
1051
+
1052
+ This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
1053
+ approach. It's specifically designed for OneDP projects and provides better integration with
1054
+ the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
1055
+
1056
+ :param data: The data to evaluate.
1057
+ :type data: dict
1058
+ :param metric_name: The evaluation metric to use.
1059
+ :type metric_name: str
1060
+ :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
1061
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
1062
+ :type project_scope: Union[str, AzureAIProject]
1063
+ :param credential: The Azure authentication credential.
1064
+ :type credential: ~azure.core.credentials.TokenCredential
1065
+ :param annotation_task: The annotation task to use.
1066
+ :type annotation_task: str
1067
+ :param metric_display_name: The display name of metric to use.
1068
+ :type metric_display_name: str
1069
+ :param evaluator_name: The evaluator name to use.
1070
+ :type evaluator_name: str
1071
+ :param scan_session_id: The scan session ID to use for the evaluation.
1072
+ :type scan_session_id: Optional[str]
1073
+ :return: The EvalRunOutputItem containing the evaluation results.
1074
+ :rtype: EvalRunOutputItem
1075
+ :raises: EvaluationException if project_scope is not a OneDP project
1076
+ """
1077
+ if not is_onedp_project(project_scope):
1078
+ msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
1079
+ raise EvaluationException(
1080
+ message=msg,
1081
+ internal_message=msg,
1082
+ target=ErrorTarget.RAI_CLIENT,
1083
+ category=ErrorCategory.INVALID_VALUE,
1084
+ blame=ErrorBlame.USER_ERROR,
1085
+ )
1086
+
1087
+ client = AIProjectClient(
1088
+ endpoint=project_scope,
1089
+ credential=credential,
1090
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
1091
+ )
1092
+
1093
+ # Build the sync eval payload
1094
+ sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
1095
+ # Call sync_evals.create() with the JSON payload
1096
+ eval_result = client.sync_evals.create(eval=sync_eval_payload)
1097
+
1098
+ # Return the raw EvalRunOutputItem for downstream processing
1099
+ return eval_result
1100
+
1101
+
805
1102
  async def evaluate_with_rai_service_multimodal(
806
1103
  messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
807
1104
  ):
@@ -21,34 +21,122 @@ from azure.ai.evaluation._model_configurations import (
21
21
  )
22
22
 
23
23
  from . import constants
24
+ from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
24
25
 
25
26
  _nltk_data_download_lock = threading.Lock()
26
27
 
27
28
  T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
28
29
 
29
30
 
30
- def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
31
+ def get_harm_severity_level(harm_score: Union[float, int], evaluator: Optional[str] = None) -> Union[str, float]:
31
32
  """Generate harm severity level based on harm score.
32
33
 
33
34
  :param harm_score: The harm score to be evaluated.
34
- :type harm_score: int
35
+ :type harm_score: Union[float, int]
36
+ :param evaluator: Optional evaluator name to determine scoring pattern. If not provided, uses default 0-7 scale.
37
+ :type evaluator: Optional[str]
35
38
  :return: The harm severity level. If harm score is None or numpy.nan, returns numpy.nan.
36
- :rtype: str
39
+ :rtype: Union[str, float]
37
40
  """
38
- HARM_SEVERITY_LEVEL_MAPPING = {
39
- constants.HarmSeverityLevel.VeryLow: [0, 1],
40
- constants.HarmSeverityLevel.Low: [2, 3],
41
- constants.HarmSeverityLevel.Medium: [4, 5],
42
- constants.HarmSeverityLevel.High: [6, 7],
43
- }
44
- if math.isnan(harm_score) or harm_score is None:
41
+ if harm_score is None or (isinstance(harm_score, float) and math.isnan(harm_score)):
42
+ return math.nan
43
+
44
+ # Get the scoring pattern for this evaluator
45
+ pattern = get_evaluator_scoring_pattern(evaluator)
46
+ config = SCORING_PATTERN_CONFIG.get(pattern)
47
+
48
+ if not config:
49
+ # Fallback to default 0-7 mapping
50
+ HARM_SEVERITY_LEVEL_MAPPING = {
51
+ constants.HarmSeverityLevel.VeryLow: [0, 1],
52
+ constants.HarmSeverityLevel.Low: [2, 3],
53
+ constants.HarmSeverityLevel.Medium: [4, 5],
54
+ constants.HarmSeverityLevel.High: [6, 7],
55
+ }
56
+ for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
57
+ if harm_score_range[0] <= harm_score <= harm_score_range[1]:
58
+ return harm_level.value
45
59
  return math.nan
46
- for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
47
- if harm_score_range[0] <= harm_score <= harm_score_range[1]:
60
+
61
+ # Use pattern-specific severity mapping
62
+ severity_mapping = config.get("severity_mapping", {})
63
+ for harm_level, score_range in severity_mapping.items():
64
+ if score_range[0] <= harm_score <= score_range[-1]:
48
65
  return harm_level.value
66
+
49
67
  return math.nan
50
68
 
51
69
 
70
+ def get_evaluator_scoring_pattern(evaluator: Optional[str]) -> EvaluatorScoringPattern:
71
+ """Get the scoring pattern for a given evaluator.
72
+
73
+ :param evaluator: The name of the evaluator.
74
+ :type evaluator: Optional[str]
75
+ :return: The scoring pattern for the evaluator, defaults to SCALE_0_7 if not found.
76
+ :rtype: EvaluatorScoringPattern
77
+ """
78
+ if evaluator is None:
79
+ return EvaluatorScoringPattern.SCALE_0_7
80
+
81
+ # Normalize evaluator name (remove underscores, lowercase)
82
+ normalized_name = evaluator.lower().replace("_", "")
83
+
84
+ # Try exact match first
85
+ if evaluator in EVALUATOR_SCORING_PATTERNS:
86
+ return EVALUATOR_SCORING_PATTERNS[evaluator]
87
+
88
+ # Try normalized match
89
+ for key, pattern in EVALUATOR_SCORING_PATTERNS.items():
90
+ if key.replace("_", "").lower() == normalized_name:
91
+ return pattern
92
+
93
+ # Default to 0-7 scale for unknown evaluators
94
+ return EvaluatorScoringPattern.SCALE_0_7
95
+
96
+
97
+ def get_default_threshold_for_evaluator(evaluator: str) -> int:
98
+ """Get the default attack success threshold for a given evaluator.
99
+
100
+ :param evaluator: The name of the evaluator.
101
+ :type evaluator: str
102
+ :return: The default threshold value.
103
+ :rtype: int
104
+ """
105
+ pattern = get_evaluator_scoring_pattern(evaluator)
106
+ config = SCORING_PATTERN_CONFIG.get(pattern, {})
107
+ return config.get("default_threshold", 3)
108
+
109
+
110
+ def convert_binary_to_numeric(value: Union[str, bool, int]) -> int:
111
+ """Convert binary evaluator outputs to numeric scores.
112
+
113
+ :param value: The binary value ("safe"/"unsafe", "true"/"false", True/False, 0/1).
114
+ :type value: Union[str, bool, int]
115
+ :return: 0 for safe/true, 1 for unsafe/false.
116
+ :rtype: int
117
+ """
118
+ if isinstance(value, bool):
119
+ return 0 if value else 1
120
+
121
+ if isinstance(value, int):
122
+ return value
123
+
124
+ if isinstance(value, str):
125
+ value_lower = value.lower().strip()
126
+ # For "safe"/"unsafe" pattern
127
+ if value_lower == "safe":
128
+ return 0
129
+ if value_lower == "unsafe":
130
+ return 1
131
+ # For "true"/"false" pattern
132
+ if value_lower == "true":
133
+ return 0
134
+ if value_lower == "false":
135
+ return 1
136
+
137
+ raise ValueError(f"Unable to convert value '{value}' to numeric score")
138
+
139
+
52
140
  def ensure_nltk_data_downloaded():
53
141
  """Download NLTK data packages if not already downloaded."""
54
142
  nltk_data = [
@@ -492,36 +580,69 @@ def _extract_text_from_content(content):
492
580
  return text
493
581
 
494
582
 
495
- def _get_conversation_history(query, include_system_messages=False):
496
- all_user_queries = []
497
- cur_user_query = []
498
- all_agent_responses = []
499
- cur_agent_response = []
583
+ def filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
584
+ """Filters the tool definitions to only include those that were actually used in the messages lists."""
585
+ try:
586
+ used_tool_names = set()
587
+ any_tools_used = False
588
+ for msgs in msgs_lists:
589
+ for msg in msgs:
590
+ if msg.get("role") == "assistant" and "content" in msg:
591
+ for content in msg.get("content", []):
592
+ if content.get("type") == "tool_call":
593
+ any_tools_used = True
594
+ if "tool_call" in content and "function" in content["tool_call"]:
595
+ used_tool_names.add(content["tool_call"]["function"])
596
+ elif "name" in content:
597
+ used_tool_names.add(content["name"])
598
+
599
+ filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
600
+ if any_tools_used and not filtered_tools:
601
+ if logger:
602
+ logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
603
+ filtered_tools = tool_definitions
604
+
605
+ return filtered_tools
606
+ except Exception as e:
607
+ if logger:
608
+ logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
609
+ return tool_definitions
610
+
611
+
612
+ def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
613
+ all_user_queries, all_agent_responses = [], []
614
+ cur_user_query, cur_agent_response = [], []
500
615
  system_message = None
616
+
501
617
  for msg in query:
502
- if not "role" in msg:
618
+ role = msg.get("role")
619
+ if not role:
503
620
  continue
504
- if include_system_messages and msg["role"] == "system" and "content" in msg:
621
+ if include_system_messages and role == "system":
505
622
  system_message = msg.get("content", "")
506
- if msg["role"] == "user" and "content" in msg:
507
- if cur_agent_response != []:
508
- all_agent_responses.append(cur_agent_response)
623
+
624
+ elif role == "user" and "content" in msg:
625
+ if cur_agent_response:
626
+ formatted_agent_response = _get_agent_response(
627
+ cur_agent_response, include_tool_messages=include_tool_messages
628
+ )
629
+ all_agent_responses.append([formatted_agent_response])
509
630
  cur_agent_response = []
510
631
  text_in_msg = _extract_text_from_content(msg["content"])
511
632
  if text_in_msg:
512
633
  cur_user_query.append(text_in_msg)
513
634
 
514
- if msg["role"] == "assistant" and "content" in msg:
515
- if cur_user_query != []:
635
+ elif role in ("assistant", "tool"):
636
+ if cur_user_query:
516
637
  all_user_queries.append(cur_user_query)
517
638
  cur_user_query = []
518
- text_in_msg = _extract_text_from_content(msg["content"])
519
- if text_in_msg:
520
- cur_agent_response.append(text_in_msg)
521
- if cur_user_query != []:
639
+ cur_agent_response.append(msg)
640
+
641
+ if cur_user_query:
522
642
  all_user_queries.append(cur_user_query)
523
- if cur_agent_response != []:
524
- all_agent_responses.append(cur_agent_response)
643
+ if cur_agent_response:
644
+ formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
645
+ all_agent_responses.append([formatted_agent_response])
525
646
 
526
647
  if len(all_user_queries) != len(all_agent_responses) + 1:
527
648
  raise EvaluationException(
@@ -531,8 +652,9 @@ def _get_conversation_history(query, include_system_messages=False):
531
652
  category=ErrorCategory.INVALID_VALUE,
532
653
  blame=ErrorBlame.USER_ERROR,
533
654
  )
655
+
534
656
  result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
535
- if include_system_messages:
657
+ if include_system_messages and system_message:
536
658
  result["system_message"] = system_message
537
659
  return result
538
660
 
@@ -540,7 +662,7 @@ def _get_conversation_history(query, include_system_messages=False):
540
662
  def _pretty_format_conversation_history(conversation_history):
541
663
  """Formats the conversation history for better readability."""
542
664
  formatted_history = ""
543
- if "system_message" in conversation_history and conversation_history["system_message"] is not None:
665
+ if conversation_history.get("system_message"):
544
666
  formatted_history += "SYSTEM_PROMPT:\n"
545
667
  formatted_history += " " + conversation_history["system_message"] + "\n\n"
546
668
  for i, (user_query, agent_response) in enumerate(
@@ -548,22 +670,34 @@ def _pretty_format_conversation_history(conversation_history):
548
670
  ):
549
671
  formatted_history += f"User turn {i+1}:\n"
550
672
  for msg in user_query:
551
- formatted_history += " " + "\n ".join(msg)
552
- formatted_history += "\n\n"
673
+ if isinstance(msg, list):
674
+ for submsg in msg:
675
+ formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
676
+ else:
677
+ formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
678
+ formatted_history += "\n"
553
679
  if agent_response:
554
680
  formatted_history += f"Agent turn {i+1}:\n"
555
681
  for msg in agent_response:
556
- formatted_history += " " + "\n ".join(msg)
557
- formatted_history += "\n\n"
682
+ if isinstance(msg, list):
683
+ for submsg in msg:
684
+ formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
685
+ else:
686
+ formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
687
+ formatted_history += "\n"
558
688
  return formatted_history
559
689
 
560
690
 
561
- def reformat_conversation_history(query, logger=None, include_system_messages=False):
691
+ def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
562
692
  """Reformats the conversation history to a more compact representation."""
563
693
  try:
564
- conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
694
+ conversation_history = _get_conversation_history(
695
+ query,
696
+ include_system_messages=include_system_messages,
697
+ include_tool_messages=include_tool_messages,
698
+ )
565
699
  return _pretty_format_conversation_history(conversation_history)
566
- except:
700
+ except Exception as e:
567
701
  # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
568
702
  # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
569
703
  # From our tests the negative impact on IntentResolution is: