azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,21 @@
- # coding=utf-8
- # --------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License. See License.txt in the project root for license information.
- # --------------------------------------------------------------------------
- """Customize generated code here.
-
- Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
- """
- from typing import List
-
- __all__: List[str] = []  # Add all objects you want publicly available to users at this package level
-
-
- def patch_sdk():
-     """Do not remove from this file.
-
-     `patch_sdk` is a last resort escape hatch that allows you to do customizations
-     you can't accomplish using the techniques described in
-     https://aka.ms/azsdk/python/dpcodegen/python/customize
-     """
+ # coding=utf-8
+ # --------------------------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # Licensed under the MIT License. See License.txt in the project root for license information.
+ # --------------------------------------------------------------------------
+ """Customize generated code here.
+
+ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
+ """
+ from typing import List
+
+ __all__: List[str] = []  # Add all objects you want publicly available to users at this package level
+
+
+ def patch_sdk():
+     """Do not remove from this file.
+
+     `patch_sdk` is a last resort escape hatch that allows you to do customizations
+     you can't accomplish using the techniques described in
+     https://aka.ms/azsdk/python/dpcodegen/python/customize
+     """
@@ -21,10 +21,11 @@ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
  from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._common.utils import is_onedp_project
  from azure.core.credentials import TokenCredential
  from azure.core.exceptions import HttpResponseError
- from azure.core.pipeline.policies import AsyncRetryPolicy
+ from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy

  from .constants import (
      CommonConstants,
@@ -35,20 +36,16 @@ from .constants import (
  )
  from .utils import get_harm_severity_level, retrieve_content_type

- try:
-     version = importlib.metadata.version("azure-ai-evaluation")
- except importlib.metadata.PackageNotFoundError:
-     version = "unknown"
- USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

  USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
      "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
  }
- ML_WORKSPACE = "https://management.azure.com/.default"
+ ML_WORKSPACE = "https://management.azure.com/.default"
  COG_SRV_WORKSPACE = "https://ai.azure.com/.default"

  INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"

+
  def get_formatted_template(data: dict, annotation_task: str) -> str:
      """Given the task and input data, produce a formatted string that will serve as the main
      payload for the RAI service. Requires specific per-task logic.
@@ -71,16 +68,13 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
          }
          return json.dumps(as_dict)
      if annotation_task == Tasks.CODE_VULNERABILITY:
-         as_dict = {
-             "context": data.get("query", ""),
-             "completion": data.get("response", "")
-         }
+         as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
          return json.dumps(as_dict)
      if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
          as_dict = {
              "query": data.get("query", ""),
              "response": data.get("response", ""),
-             "context": data.get("context", "")
+             "context": data.get("context", ""),
          }
          return json.dumps(as_dict)
      as_dict = {
@@ -101,7 +95,11 @@ def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict
      :return: The common headers.
      :rtype: Dict
      """
-     user_agent = f"{USER_AGENT} (type=evaluator; subtype={evaluator_name})" if evaluator_name else USER_AGENT
+     user_agent = (
+         f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+         if evaluator_name
+         else UserAgentSingleton().value
+     )
      return {
          "Authorization": f"Bearer {token}",
          "User-Agent": user_agent,
@@ -113,7 +111,10 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
          retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
      )

- async def ensure_service_availability_onedp(client: AIProjectClient, token: str, capability: Optional[str] = None) -> None:
+
+ async def ensure_service_availability_onedp(
+     client: AIProjectClient, token: str, capability: Optional[str] = None
+ ) -> None:
      """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

      :param client: The AI project client.
@@ -126,7 +127,7 @@ async def ensure_service_availability_onedp(client: AIProjectClient, token: str,
      """
      headers = get_common_headers(token)
      capabilities = client.evaluations.check_annotation(headers=headers)
-
+
      if capability and capability not in capabilities:
          msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
          raise EvaluationException(
@@ -137,7 +138,8 @@ async def ensure_service_availability_onedp(client: AIProjectClient, token: str,
              blame=ErrorBlame.USER_ERROR,
              tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
          )
-
+
+
  async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
      """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

@@ -257,12 +259,13 @@


  async def submit_request_onedp(
-     client: AIProjectClient,
-     data: dict,
-     metric: str,
-     token: str,
-     annotation_task: str,
-     evaluator_name: str
+     client: AIProjectClient,
+     data: dict,
+     metric: str,
+     token: str,
+     annotation_task: str,
+     evaluator_name: str,
+     scan_session_id: Optional[str] = None,
  ) -> str:
      """Submit request to Responsible AI service for evaluation and return operation ID

@@ -278,12 +281,16 @@
      :type annotation_task: str
      :param evaluator_name: The evaluator name.
      :type evaluator_name: str
+     :param scan_session_id: The scan session ID to use for the evaluation.
+     :type scan_session_id: Optional[str]
      :return: The operation ID.
      :rtype: str
      """
      normalized_user_text = get_formatted_template(data, annotation_task)
      payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
      headers = get_common_headers(token, evaluator_name)
+     if scan_session_id:
+         headers["client_request_id"] = scan_session_id
      response = client.evaluations.submit_annotation(payload, headers=headers)
      result = json.loads(response)
      operation_id = result["location"].split("/")[-1]
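
Note: when the new scan_session_id argument is supplied, it travels to the service as a client_request_id header, so annotation calls can be correlated with a red-team scan run. A hedged sketch of the header assembly (common_headers below is a simplified stand-in for the real get_common_headers):

    import uuid
    from typing import Dict, Optional

    def common_headers(token: str) -> Dict[str, str]:
        # Stand-in for get_common_headers(token, evaluator_name).
        return {"Authorization": f"Bearer {token}"}

    def submit_headers(token: str, scan_session_id: Optional[str] = None) -> Dict[str, str]:
        headers = common_headers(token)
        if scan_session_id:
            # Mirrors the hunk above: only set when a scan session is active.
            headers["client_request_id"] = scan_session_id
        return headers

    print(submit_headers("token", str(uuid.uuid4())))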
@@ -326,6 +333,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
          sleep_time = RAIService.SLEEP_TIME**request_count
          await asyncio.sleep(sleep_time)

+
  async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
      """Fetch the annotation result from Responsible AI service

@@ -349,11 +357,14 @@
          request_count += 1
          time_elapsed = time.time() - start
          if time_elapsed > RAIService.TIMEOUT:
-             raise TimeoutError(f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds")
+             raise TimeoutError(
+                 f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+             )

          sleep_time = RAIService.SLEEP_TIME**request_count
          await asyncio.sleep(sleep_time)

+
  def parse_response(  # pylint: disable=too-many-branches,too-many-statements
      batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
  ) -> Dict[str, Union[str, float]]:
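
Note: both fetch_result variants poll with exponential backoff, sleeping RAIService.SLEEP_TIME ** request_count seconds between attempts and aborting once elapsed time exceeds RAIService.TIMEOUT. A toy version of that loop; the values 2 and 300 are illustrative assumptions, not the library's actual constants:

    import time

    SLEEP_TIME = 2   # assumed stand-in for RAIService.SLEEP_TIME
    TIMEOUT = 300    # assumed stand-in for RAIService.TIMEOUT

    def poll(fetch_once):
        """Call fetch_once() until it returns a result or the deadline passes."""
        start = time.time()
        request_count = 0
        while True:
            result = fetch_once()
            if result is not None:
                return result
            request_count += 1
            time_elapsed = time.time() - start
            if time_elapsed > TIMEOUT:
                raise TimeoutError(
                    f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
                )
            # Backoff grows geometrically: 2, 4, 8, ... seconds.
            time.sleep(SLEEP_TIME**request_count)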
@@ -382,10 +393,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
      result = {}
      if not batch_response or len(batch_response[0]) == 0:
          return {}
-     if metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]:
-         batch_response[0] = {
-             EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
-         }
+     if (
+         metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+         and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+     ):
+         batch_response[0] = {
+             EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+         }
      if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
          pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
          for pm_metric_name in pm_metric_names:
@@ -421,7 +435,10 @@
          result[metric_display_name + "_information_gathering"] = (
              parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
          )
-     if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+     if (
+         metric_name == EvaluationMetrics.CODE_VULNERABILITY
+         or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+     ):
          # Add all attributes under the details.
          details = {}
          for key, value in parsed_response.items():
@@ -587,7 +604,9 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
      return rai_url


- async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE) -> str:
+ async def fetch_or_reuse_token(
+     credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+ ) -> str:
      """Get token. Fetch a new token if the current token is near expiry

      :param credential: The Azure authentication credential.
@@ -622,6 +641,7 @@ async def evaluate_with_rai_service(
      annotation_task: str = Tasks.CONTENT_HARM,
      metric_display_name=None,
      evaluator_name=None,
+     scan_session_id: Optional[str] = None,
  ) -> Dict[str, Union[str, float]]:
      """Evaluate the content safety of the response using Responsible AI service

@@ -629,8 +649,8 @@
      :type data: dict
      :param metric_name: The evaluation metric to use.
      :type metric_name: str
-     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
-         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
      :type project_scope: Union[str, AzureAIProject]
      :param credential: The Azure authentication credential.
      :type credential: ~azure.core.credentials.TokenCredential
@@ -640,15 +660,23 @@
      :type metric_display_name: str
      :param evaluator_name: The evaluator name to use.
      :type evaluator_name: str
+     :param scan_session_id: The scan session ID to use for the evaluation.
+     :type scan_session_id: Optional[str]
      :return: The parsed annotation result.
      :rtype: Dict[str, Union[str, float]]
      """

      if is_onedp_project(project_scope):
-         client = AIProjectClient(endpoint=project_scope, credential=credential)
+         client = AIProjectClient(
+             endpoint=project_scope,
+             credential=credential,
+             user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+         )
          token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
          await ensure_service_availability_onedp(client, token, annotation_task)
-         operation_id = await submit_request_onedp(client, data, metric_name, token, annotation_task, evaluator_name)
+         operation_id = await submit_request_onedp(
+             client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
+         )
          annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
          result = parse_response(annotation_response, metric_name, metric_display_name)
          return result
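
Note: on the one-DP (project endpoint) path the client now carries an explicit UserAgentPolicy, so every pipeline request reports the evaluation SDK's user agent. A sketch of that construction with a placeholder endpoint and an assumed base user-agent string (AIProjectClient here is the vendored client from azure.ai.evaluation._common.onedp, so its construction is left commented):

    from azure.core.pipeline.policies import UserAgentPolicy
    from azure.identity import DefaultAzureCredential

    endpoint = "https://<resource>.services.ai.azure.com/api/projects/<project>"  # placeholder
    policy = UserAgentPolicy(base_user_agent="azure-ai-evaluation/1.9.0")  # assumed value
    # client = AIProjectClient(endpoint=endpoint,
    #                          credential=DefaultAzureCredential(),
    #                          user_agent_policy=policy)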
@@ -665,6 +693,7 @@

      return result

+
  def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
      """Generate the payload for the annotation request
      :param content_type: The type of the content representing multimodal or images.
@@ -696,6 +725,7 @@ def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dic
          "AnnotationTask": task,
      }

+
  async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
      """Submit request to Responsible AI service for evaluation and return operation ID
      :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
@@ -741,8 +771,9 @@
      operation_id = result["location"].split("/")[-1]
      return operation_id

+
  async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
-
+
      # handle inference sdk strongly type messages
      if len(messages) > 0 and not isinstance(messages[0], dict):
          try:
@@ -758,37 +789,42 @@
      ## fetch system and assistant messages from the list of messages
      filtered_messages = [message for message in messages if message["role"] != "system"]
      assistant_messages = [message for message in messages if message["role"] == "assistant"]
-
+
      ## prepare for request
      content_type = retrieve_content_type(assistant_messages, metric)
      payload = generate_payload_multimodal(content_type, filtered_messages, metric)
      headers = get_common_headers(token)
-
+
      response = client.evaluations.submit_annotation(payload, headers=headers)
-
+
      result = json.loads(response)
      operation_id = result["location"].split("/")[-1]
      return operation_id

+
  async def evaluate_with_rai_service_multimodal(
      messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
  ):
      """ "Evaluate the content safety of the response using Responsible AI service
-     :param messages: The normalized list of messages.
-     :type messages: str
-     :param metric_name: The evaluation metric to use.
-     :type metric_name: str
-     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
-         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
-     :type project_scope: Union[str, AzureAIProject]
-     :param credential: The Azure authentication credential.
-     :type credential: ~azure.core.credentials.TokenCredential
-     :return: The parsed annotation result.
-     :rtype: List[List[Dict]]
+     :param messages: The normalized list of messages.
+     :type messages: str
+     :param metric_name: The evaluation metric to use.
+     :type metric_name: str
+     :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :type project_scope: Union[str, AzureAIProject]
+     :param credential: The Azure authentication credential.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: The parsed annotation result.
+     :rtype: List[List[Dict]]
      """

      if is_onedp_project(project_scope):
-         client = AIProjectClient(endpoint=project_scope, credential=credential)
+         client = AIProjectClient(
+             endpoint=project_scope,
+             credential=credential,
+             user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+         )
          token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
          await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
          operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
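
Note: the multimodal path strips system messages from the payload but keys the content type off the assistant messages. A small sketch of that filtering on a toy conversation:

    # Mirrors the role filtering in submit_multimodal_request_onedp above.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [{"type": "text", "text": "Describe this image."}]},
        {"role": "assistant", "content": [{"type": "text", "text": "It shows a cat."}]},
    ]

    filtered_messages = [m for m in messages if m["role"] != "system"]
    assistant_messages = [m for m in messages if m["role"] == "assistant"]

    print(len(filtered_messages), len(assistant_messages))  # 2 1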
@@ -803,4 +839,4 @@ async def evaluate_with_rai_service_multimodal(
      operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
      annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
      result = parse_response(annotation_response, metric_name)
-     return result
+     return result
@@ -24,7 +24,7 @@ except ImportError:
      _patch_all = []
  from ._patch import patch_sdk as _patch_sdk

- # Export GeneratedRAIClient as alias of MachineLearningServicesClient for backward compatibility
+ # Export GeneratedRAIClient as alias of MachineLearningServicesClient for backward compatibility

  __all__ = [
      "MachineLearningServicesClient",
@@ -112,7 +112,12 @@ def build_rai_svc_get_jail_break_dataset_with_type_request( # pylint: disable=n


  def build_rai_svc_get_attack_objectives_request(  # pylint: disable=name-too-long
-     *, risk_types: Optional[List[str]] = None, lang: Optional[str] = None, strategy: Optional[str] = None, **kwargs: Any
+     *,
+     risk_types: Optional[List[str]] = None,
+     risk_categories: Optional[List[str]] = None,
+     lang: Optional[str] = None,
+     strategy: Optional[str] = None,
+     **kwargs: Any
  ) -> HttpRequest:
      _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
      _params = case_insensitive_dict(kwargs.pop("params", {}) or {})
@@ -127,6 +132,10 @@ def build_rai_svc_get_attack_objectives_request( # pylint: disable=name-too-lon
      _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str")
      if risk_types is not None:
          _params["riskTypes"] = [_SERIALIZER.query("risk_types", q, "str") if q is not None else "" for q in risk_types]
+     if risk_categories is not None:
+         _params["riskCategory"] = [
+             _SERIALIZER.query("risk_categories", q, "str") if q is not None else "" for q in risk_categories
+         ]
      if lang is not None:
          _params["lang"] = _SERIALIZER.query("lang", lang, "str")
      if strategy is not None:
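
Note: each entry in risk_categories serializes to a repeated riskCategory query parameter next to the existing riskTypes, lang, and strategy parameters. Illustratively (the api-version value below is an assumption, not the service's actual version):

    from urllib.parse import urlencode

    params = [
        ("api-version", "2023-11-01-preview"),  # assumed version
        ("riskCategory", "violence"),
        ("strategy", "baseline"),
    ]
    print(urlencode(params))
    # api-version=2023-11-01-preview&riskCategory=violence&strategy=baseline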
@@ -573,6 +582,7 @@ class RAISvcOperations:
      def get_attack_objectives(
          self,
          *,
+         risk_category: str,
          risk_types: Optional[List[str]] = None,
          lang: Optional[str] = None,
          strategy: Optional[str] = None,
@@ -580,6 +590,8 @@
      ) -> List[_models.AttackObjective]:
          """Get the attack objectives.

+         :keyword risk_category: Risk category for the attack objectives. Required.
+         :paramtype risk_category: str
          :keyword risk_types: Risk types for the attack objectives dataset. Default value is None.
          :paramtype risk_types: list[str]
          :keyword lang: The language for the attack objectives dataset, defaults to 'en'. Default value
@@ -605,6 +617,7 @@
          cls: ClsType[List[_models.AttackObjective]] = kwargs.pop("cls", None)

          _request = build_rai_svc_get_attack_objectives_request(
+             risk_categories=[risk_category],
              risk_types=risk_types,
              lang=lang,
              strategy=strategy,
@@ -13,7 +13,7 @@ from azure.storage.blob import ContainerClient
  from typing_extensions import NotRequired, Required, TypeGuard
  from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
  from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._model_configurations import (
      AzureAIProject,
      AzureOpenAIModelConfiguration,
@@ -126,6 +126,7 @@ def construct_prompty_model_config(

      return prompty_model_config

+
  def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
      """Check if the Azure AI project is an OneDP project.

@@ -138,6 +139,7 @@ def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
          return True
      return False

+
  def validate_azure_ai_project(o: object) -> AzureAIProject:
      fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}

@@ -291,7 +293,8 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:

      return cast(T_TypedDict, o)

- def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+
+ def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
      """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].

      :param score: The score to check.
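
Note: per the signature and docstring above, check_score_is_valid accepts a string or number and tests that it parses to a value within [min_score, max_score] (1-5 by default). A usage sketch; the module path is the private one shown in this diff, and the exact edge-case behavior is the library's to define:

    from azure.ai.evaluation._common.utils import check_score_is_valid

    print(check_score_is_valid("4"))       # True: parses to 4.0, inside [1, 5]
    print(check_score_is_valid(7))         # False: outside the default range
    print(check_score_is_valid("n/a"))     # False: not convertible to a number
    print(check_score_is_valid(8, 0, 10))  # True: custom range [0, 10]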
@@ -310,6 +313,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5)

      return min_score <= numeric_score <= max_score

+
  def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
      """Parse the output of prompt-based quality evaluators that return a score and reason.

@@ -481,6 +485,123 @@ def validate_conversation(conversation):
          ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
      )

+
+ def _extract_text_from_content(content):
+     text = []
+     for msg in content:
+         if "text" in msg:
+             text.append(msg["text"])
+     return text
+
+
+ def _get_conversation_history(query):
+     all_user_queries = []
+     cur_user_query = []
+     all_agent_responses = []
+     cur_agent_response = []
+     for msg in query:
+         if not "role" in msg:
+             continue
+         if msg["role"] == "user" and "content" in msg:
+             if cur_agent_response != []:
+                 all_agent_responses.append(cur_agent_response)
+                 cur_agent_response = []
+             text_in_msg = _extract_text_from_content(msg["content"])
+             if text_in_msg:
+                 cur_user_query.append(text_in_msg)
+
+         if msg["role"] == "assistant" and "content" in msg:
+             if cur_user_query != []:
+                 all_user_queries.append(cur_user_query)
+                 cur_user_query = []
+             text_in_msg = _extract_text_from_content(msg["content"])
+             if text_in_msg:
+                 cur_agent_response.append(text_in_msg)
+     if cur_user_query != []:
+         all_user_queries.append(cur_user_query)
+     if cur_agent_response != []:
+         all_agent_responses.append(cur_agent_response)
+
+     if len(all_user_queries) != len(all_agent_responses) + 1:
+         raise EvaluationException(
+             message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+             internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+             target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.USER_ERROR,
+         )
+
+     return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+
+
+ def _pretty_format_conversation_history(conversation_history):
+     """Formats the conversation history for better readability."""
+     formatted_history = ""
+     for i, (user_query, agent_response) in enumerate(
+         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
+     ):
+         formatted_history += f"User turn {i+1}:\n"
+         for msg in user_query:
+             formatted_history += " " + "\n ".join(msg)
+         formatted_history += "\n\n"
+         if agent_response:
+             formatted_history += f"Agent turn {i+1}:\n"
+             for msg in agent_response:
+                 formatted_history += " " + "\n ".join(msg)
+             formatted_history += "\n\n"
+     return formatted_history
+
+
+ def reformat_conversation_history(query, logger=None):
+     """Reformats the conversation history to a more compact representation."""
+     try:
+         conversation_history = _get_conversation_history(query)
+         return _pretty_format_conversation_history(conversation_history)
+     except:
+         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
+         # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
+         # From our tests the negative impact on IntentResolution is:
+         # Higher intra model variance (0.142 vs 0.046)
+         # Higher inter model variance (0.345 vs 0.607)
+         # Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+         # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+         if logger:
+             logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+         return query
+
+
+ def _get_agent_response(agent_response_msgs):
+     """Extracts the text from the agent response content."""
+     agent_response_text = []
+     for msg in agent_response_msgs:
+         if "role" in msg and msg["role"] == "assistant" and "content" in msg:
+             text = _extract_text_from_content(msg["content"])
+             if text:
+                 agent_response_text.extend(text)
+     return agent_response_text
+
+
+ def reformat_agent_response(response, logger=None):
+     try:
+         if response is None or response == []:
+             return ""
+         agent_response = _get_agent_response(response)
+         if agent_response == []:
+             # If no message could be extracted, likely the format changed, fallback to the original response in that case
+             if logger:
+                 logger.warning(
+                     f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
+                 )
+             return response
+         return "\n".join(agent_response)
+     except:
+         # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
+         # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+         if logger:
+             logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+         return response
+
+
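
Note: the helpers added above collapse an agent message list into alternating "User turn N" / "Agent turn N" text blocks, and fall back to the raw input whenever parsing fails (with the measured accuracy cost documented in the comments). A toy input/output sketch of the expected behavior:

    query = [
        {"role": "user", "content": [{"type": "text", "text": "What is 2 + 2?"}]},
        {"role": "assistant", "content": [{"type": "text", "text": "4"}]},
        {"role": "user", "content": [{"type": "text", "text": "And times 3?"}]},
    ]
    # _get_conversation_history(query) returns
    #   {"user_queries": [[["What is 2 + 2?"]], [["And times 3?"]]],
    #    "agent_responses": [[["4"]]]}
    # (two user turns, one agent turn, satisfying the N == M + 1 check), and
    # _pretty_format_conversation_history renders roughly:
    #   User turn 1:
    #    What is 2 + 2?
    #   Agent turn 1:
    #    4
    #   User turn 2:
    #    And times 3?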
  def upload(path: str, container_client: ContainerClient, logger=None):
      """Upload files or directories to Azure Blob Storage using a container client.

@@ -509,7 +630,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
      local_paths = []

      if os.path.isdir(path):
-         for (root, _, filenames) in os.walk(path):
+         for root, _, filenames in os.walk(path):
              upload_path = ""
              if root != path:
                  rel_path = os.path.relpath(root, path)
@@ -81,6 +81,7 @@ class _AggregationType(enum.Enum):
      SUM = "sum"
      CUSTOM = "custom"

+
  class TokenScope(str, enum.Enum):
      """Defines the scope of the token used to access Azure resources."""

@@ -114,4 +115,4 @@

  AOAI_COLUMN_NAME = "aoai"
  DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
- DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
+ DEFAULT_AOAI_API_VERSION = "2025-04-01-preview"  # Unfortunately relying on preview version for now.
@@ -1,3 +1,3 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
+ # ---------------------------------------------------------