azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release.
Files changed (69)
  1. azure/ai/evaluation/__init__.py +1 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
  3. azure/ai/evaluation/_aoai/label_grader.py +2 -2
  4. azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
  5. azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
  6. azure/ai/evaluation/_common/__init__.py +3 -1
  7. azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
  8. azure/ai/evaluation/_common/onedp/operations/_operations.py +4 -2
  9. azure/ai/evaluation/_common/rai_service.py +7 -6
  10. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  11. azure/ai/evaluation/_converters/_models.py +76 -6
  12. azure/ai/evaluation/_eval_mapping.py +2 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +15 -17
  14. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  16. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  24. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
  27. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  28. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
  29. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
  30. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  31. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  32. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  33. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
  34. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
  35. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  36. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +14 -4
  37. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  38. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
  39. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
  40. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
  41. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  42. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  43. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  46. azure/ai/evaluation/red_team/_agent/_agent_functions.py +264 -0
  47. azure/ai/evaluation/red_team/_agent/_agent_tools.py +503 -0
  48. azure/ai/evaluation/red_team/_agent/_agent_utils.py +69 -0
  49. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +237 -0
  50. azure/ai/evaluation/red_team/_attack_strategy.py +2 -0
  51. azure/ai/evaluation/red_team/_red_team.py +572 -207
  52. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  53. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +570 -0
  54. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  55. azure/ai/evaluation/red_team/_utils/constants.py +5 -1
  56. azure/ai/evaluation/red_team/_utils/metric_mapping.py +2 -2
  57. azure/ai/evaluation/red_team/_utils/strategy_utils.py +2 -0
  58. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -2
  59. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
  62. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +3 -0
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +15 -7
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
  65. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/METADATA +35 -3
  66. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/RECORD +69 -61
  67. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/NOTICE.txt +0 -0
  68. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/WHEEL +0 -0
  69. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py (new file)
@@ -0,0 +1,121 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import logging
+import uuid
+import os
+import json
+import traceback
+import asyncio
+import re
+from typing import Dict, Optional, Any, Tuple, List
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation.simulator._model_tools._generated_rai_client import GeneratedRAIClient
+from pyrit.models import PromptRequestResponse, construct_response_from_request
+from pyrit.prompt_target import PromptChatTarget
+from .constants import USER_AGENT
+from .metric_mapping import get_metric_from_risk_category
+from .._attack_objective_generator import RiskCategory
+
+
+class RAIServiceEvalChatTarget(PromptChatTarget):
+    """A class to handle chat-based interactions with the RAI service for evaluation purposes."""
+
+    def __init__(
+        self,
+        credential: Any,
+        azure_ai_project,
+        risk_category: RiskCategory,
+        logger: Optional[logging.Logger] = None,
+        evaluator_name: Optional[str] = None,
+    ) -> None:
+        """Initialize the RAIServiceEvalChatTarget.
+
+        :param endpoint: The endpoint URL for the RAI service.
+        :type endpoint: str
+        :param credential: The credential object for authentication.
+        :type credential: Any
+        """
+        super().__init__()
+        self.logger = logger
+        self.risk_category = risk_category
+        self.evaluator_name = evaluator_name
+        self.credential = credential
+        self.azure_ai_project = azure_ai_project
+
+    async def send_prompt_async(self, *, prompt_request: PromptRequestResponse, objective: str = "") -> PromptRequestResponse:
+        self.logger.info("Starting send_prompt_async operation")
+        self._validate_request(prompt_request=prompt_request)
+
+        thing_to_eval = prompt_request.request_pieces[0].to_dict()['original_value']
+
+        thing_to_eval_qr = {
+            "query":"",
+            "response": thing_to_eval
+        }
+
+        metric_name = get_metric_from_risk_category(self.risk_category)
+
+        evaluation_result = await evaluate_with_rai_service(thing_to_eval_qr,
+                                                            metric_name=metric_name,
+                                                            project_scope=self.azure_ai_project,
+                                                            credential=self.credential)
+        self.logger.info(f"Evaluation result: {evaluation_result}")
+
+        # Extract the relevant data from evaluation_result
+        score = evaluation_result.get(f"{metric_name}_score", 0)
+        reason = evaluation_result.get(f"{metric_name}_reason", "")
+        threshold = evaluation_result.get(f"{metric_name}_threshold", 3)
+        result = evaluation_result.get(metric_name, "")
+
+        # Calculate pass/fail using the same logic as in _evaluate_conversation
+        # Convert to boolean then to string for PyRIT's true/false score
+        # Ensure we're working with numeric values for comparison
+        boolean_result = float(score) > float(threshold)
+
+        # Format the response in a way that PyRIT's scorer can parse
+        # Use "true"/"false" string for score_value which is required for true_false score type
+        scorer_compatible_response = {
+            "score_value": boolean_result, # PyRIT requires lowercase "true" or "false" for true_false scores
+            "description": f"Threshold: {threshold}, Result: {result}",
+            "rationale": reason,
+            "metadata": {
+                "raw_score": score,
+                "threshold": threshold,
+                "result": result,
+                "risk_category": self.risk_category
+            }
+        }
+
+        # Convert to JSON string
+        response_json = json.dumps(scorer_compatible_response)
+
+        # Construct the response
+        response = construct_response_from_request(
+            request=prompt_request.request_pieces[0],
+            response_text_pieces=[response_json],
+        )
+        self.logger.info(f"Constructed response: {response}")
+        return response
+
+
+    def is_json_response_supported(self) -> bool:
+        """Check if JSON response is supported.
+
+        :return: True if JSON response is supported, False otherwise
+        """
+        # This target supports JSON responses
+        return True
+
+    def _validate_request(self, *, prompt_request: PromptRequestResponse) -> None:
+        """Validate the request.
+
+        :param prompt_request: The prompt request
+        """
+        if len(prompt_request.request_pieces) != 1:
+            raise ValueError("This target only supports a single prompt request piece.")
+
+        if prompt_request.request_pieces[0].converted_value_data_type != "text":
+            raise ValueError("This target only supports text prompt input.")
+
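Usage note (a minimal sketch, not part of the diff): the hunk above adds a PyRIT PromptChatTarget that scores a single text piece with the RAI service and returns a scorer-compatible JSON verdict. Driving it could look roughly like the following; the azure_ai_project keys, RiskCategory.Violence, and DefaultAzureCredential are assumptions for illustration, so substitute real project values.

import asyncio
import logging

from azure.identity import DefaultAzureCredential
from pyrit.models import PromptRequestPiece, PromptRequestResponse

from azure.ai.evaluation.red_team._attack_objective_generator import RiskCategory
from azure.ai.evaluation.red_team._utils._rai_service_eval_chat_target import RAIServiceEvalChatTarget


async def main() -> None:
    # Pass an explicit logger: send_prompt_async logs through self.logger unconditionally.
    target = RAIServiceEvalChatTarget(
        credential=DefaultAzureCredential(),
        azure_ai_project={
            # Hypothetical placeholder scope values for illustration.
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        },
        risk_category=RiskCategory.Violence,  # assumed member of the enum in this release
        logger=logging.getLogger(__name__),
    )

    # _validate_request requires exactly one text request piece.
    request = PromptRequestResponse(
        request_pieces=[PromptRequestPiece(role="user", original_value="text to evaluate")]
    )

    response = await target.send_prompt_async(prompt_request=request)
    # The single response piece carries the JSON payload
    # (score_value, description, rationale, metadata).
    print(response.request_pieces[0].converted_value)


asyncio.run(main())

One detail worth noting: although the inline comment says PyRIT needs lowercase "true"/"false", passing the raw Python boolean still satisfies that, because json.dumps serializes True and False as lowercase true and false in the emitted JSON.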