azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._service_groundedness import GroundednessProEvaluator
+
+ __all__ = [
+     "GroundednessProEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
@@ -0,0 +1,148 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from typing import List, Union, Dict
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+
+
+ @experimental
+ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+     """
+     Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+     including reasoning.
+
+     The groundedness measure calls the Azure AI Evaluation service to assess how well the AI-generated answer is
+     grounded in the source context. Even if the responses from an LLM are factually correct, they will be considered
+     ungrounded if they can't be verified against the provided sources (such as your input source or your database).
+
+     Service-based groundedness scores are boolean values, where True indicates that the response is grounded.
+
+     :param credential: The credential for connecting to the Azure AI project. Required.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains the subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param kwargs: Additional arguments to pass to the evaluator.
+     :type kwargs: Any
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START groundedness_pro_evaluator]
+             :end-before: [END groundedness_pro_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
+
+     .. note::
+
+         If this evaluator is supplied to the `evaluate` function, the aggregated metric
+         for the groundedness pro label will be "groundedness_pro_passing_rate".
+     """
+
+     id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self,
+         credential,
+         azure_ai_project,
+         **kwargs,
+     ):
+         self._passing_score = 5  # TODO update once the binarization PR is merged
+         self._output_prefix = "groundedness_pro"
+         super().__init__(
+             eval_metric=EvaluationMetrics.GROUNDEDNESS,
+             azure_ai_project=azure_ai_project,
+             credential=credential,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         response: str,
+         context: str,
+         query: str,
+     ) -> Dict[str, Union[str, bool]]:
+         """Evaluate groundedness for a given query/response/context.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context to be evaluated.
+         :paramtype context: str
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :return: The groundedness pro result.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+         """Evaluate groundedness for a conversation in a multi-turn evaluation. If the conversation has
+         more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+         available in the output under the "evaluation_per_turn" key.
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The groundedness pro result.
+         :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate groundedness. Accepts either a query, response, and context for a single-turn evaluation,
+         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn, with the per-turn results available
+         in the output under the "evaluation_per_turn" key.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword context: The context to be evaluated.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The groundedness pro result.
+         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict):
+         """This evaluator has some unique post-processing that requires data that
+         the rai_service script is not currently built to handle, so we further post-process
+         the result here to massage it into the right form.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         result = await super()._do_eval(eval_input)
+         real_result = {}
+         real_result[self._output_prefix + "_label"] = (
+             result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+         )
+         real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+         return real_result
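As a rough illustration of the API added above, the following sketch shows how this evaluator is typically constructed and called. It is not part of the diff; the subscription, resource group, and project values are placeholders, and it assumes DefaultAzureCredential from azure-identity for authentication. The output keys follow the "groundedness_pro" prefix used by _do_eval above.

# Illustrative sketch only; project values are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

evaluator = GroundednessProEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# Single-turn call; expected to return something like
# {"groundedness_pro_label": True, "groundedness_pro_reason": "..."}
result = evaluator(
    query="What shape is the Earth?",
    response="The Earth is roughly spherical.",
    context="The Earth is an oblate spheroid.",
)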
azure/ai/evaluation/_evaluators/_similarity/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._similarity import SimilarityEvaluator
+
+ __all__ = [
+     "SimilarityEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -0,0 +1,140 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import math
+ import os
+ import re
+
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from promptflow.core import AsyncPrompty
+
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+ from ..._common.utils import construct_prompty_model_config, validate_model_config
+
+ try:
+     from ..._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = "None"
+
+
+ class _AsyncSimilarityEvaluator:
+     # Constants must be defined within the evaluator's directory to be saveable/loadable
+     _PROMPTY_FILE = "similarity.prompty"
+     _LLM_CALL_TIMEOUT = 600
+     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, model_config: dict):
+         prompty_model_config = construct_prompty_model_config(
+             validate_model_config(model_config),
+             self._DEFAULT_OPEN_API_VERSION,
+             USER_AGENT,
+         )
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+         """
+         Evaluate similarity.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be evaluated.
+         :paramtype ground_truth: str
+         :return: The similarity score.
+         :rtype: Dict[str, float]
+         """
+         # Validate input parameters
+         query = str(query or "")
+         response = str(response or "")
+         ground_truth = str(ground_truth or "")
+
+         if not (query.strip() and response.strip() and ground_truth.strip()):
+             msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.SIMILARITY_EVALUATOR,
+             )
+
+         # Run the evaluation flow
+         llm_output = await self._flow(
+             query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
+         )
+
+         score = math.nan
+         if llm_output:
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+
+         return {"similarity": float(score), "gpt_similarity": float(score)}
+
+
+ class SimilarityEvaluator:
+     """
+     Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
+
+     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+     the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+     the semantic meaning and context of the sentences.
+
+     Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+     tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+     text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.
+
+     Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START similarity_evaluator]
+             :end-before: [END similarity_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a SimilarityEvaluator with a query, response, and ground truth.
+
+     .. note::
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     """
+
+     id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     def __init__(self, model_config):
+         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
+
+     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+         """
+         Evaluate similarity.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be evaluated.
+         :paramtype ground_truth: str
+         :return: The similarity score.
+         :rtype: Dict[str, float]
+         """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
+         )
+
+     def _to_async(self):
+         return self._async_evaluator
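A minimal usage sketch for the class above follows; it is illustrative rather than part of the diff. The endpoint, deployment, and key values are placeholders, and the model_config dict assumes the AzureOpenAIModelConfiguration shape referenced in the docstring. The returned keys ("similarity" and "gpt_similarity") come directly from _AsyncSimilarityEvaluator.__call__.

# Illustrative sketch only; endpoint, deployment, and key are placeholders.
from azure.ai.evaluation import SimilarityEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<api-key>",
}

evaluator = SimilarityEvaluator(model_config)

# Expected to return something like {"similarity": 4.0, "gpt_similarity": 4.0}
result = evaluator(
    query="What causes seasons on Earth?",
    response="Seasons are caused by the tilt of the Earth's axis as it orbits the Sun.",
    ground_truth="The tilt of the Earth's axis and its revolution around the Sun cause the seasons.",
)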
@@ -0,0 +1,66 @@
1
+ ---
2
+ name: Similarity
3
+ description: Evaluates similarity score for QA scenario
4
+ model:
5
+ api: chat
6
+ parameters:
7
+ temperature: 0.0
8
+ max_tokens: 1
9
+ top_p: 1.0
10
+ presence_penalty: 0
11
+ frequency_penalty: 0
12
+ response_format:
13
+ type: text
14
+
15
+ inputs:
16
+ query:
17
+ type: string
18
+ response:
19
+ type: string
20
+ ground_truth:
21
+ type: string
22
+
23
+ ---
24
+ system:
25
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
26
+ user:
27
+ Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale:
28
+ One star: the predicted answer is not at all similar to the correct answer
29
+ Two stars: the predicted answer is mostly not similar to the correct answer
30
+ Three stars: the predicted answer is somewhat similar to the correct answer
31
+ Four stars: the predicted answer is mostly similar to the correct answer
32
+ Five stars: the predicted answer is completely similar to the correct answer
33
+
34
+ This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
35
+
36
+ The examples below show the Equivalence score for a question, a correct answer, and a predicted answer.
37
+
38
+ question: What is the role of ribosomes?
39
+ correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins.
40
+ predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules.
41
+ stars: 1
42
+
43
+ question: Why did the Titanic sink?
44
+ correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life.
45
+ predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts.
46
+ stars: 2
47
+
48
+ question: What causes seasons on Earth?
49
+ correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns.
50
+ predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions.
51
+ stars: 3
52
+
53
+ question: How does photosynthesis work?
54
+ correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions.
55
+ predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions.
56
+ stars: 4
57
+
58
+ question: What are the health benefits of regular exercise?
59
+ correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood.
60
+ predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood.
61
+ stars: 5
62
+
63
+ question: {{query}}
64
+ correct answer:{{ground_truth}}
65
+ predicted answer: {{response}}
66
+ stars:
azure/ai/evaluation/_evaluators/_xpia/__init__.py
@@ -0,0 +1,5 @@
+ from .xpia import IndirectAttackEvaluator
+
+ __all__ = [
+     "IndirectAttackEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_xpia/xpia.py
@@ -0,0 +1,125 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import logging
+ from typing import Dict, List, Union
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+     """
+     Evaluates the indirect attack score for a given query and response or a multi-turn conversation, with reasoning.
+
+     Indirect attacks, also known as cross-domain prompt injected attacks (XPIA), are jailbreak attacks
+     injected into the context of a document or source, which may result in altered or unexpected behavior.
+
+     Indirect attack evaluations are broken down into three subcategories:
+     - Manipulated Content: This category involves commands that aim to alter or fabricate information, often
+       to mislead or deceive. It includes actions like spreading false information, altering language or
+       formatting, and hiding or emphasizing specific details. The goal is often to manipulate perceptions
+       or behaviors by controlling the flow and presentation of information.
+     - Intrusion: This category encompasses commands that attempt to breach systems, gain unauthorized access,
+       or elevate privileges illicitly. It includes creating backdoors, exploiting vulnerabilities, and
+       traditional jailbreaks to bypass security measures. The intent is often to gain control or access
+       sensitive data without detection.
+     - Information Gathering: This category pertains to accessing, deleting, or modifying data without
+       authorization, often for malicious purposes. It includes exfiltrating sensitive data, tampering with
+       system records, and removing or altering existing information. The focus is on acquiring or
+       manipulating data to exploit or compromise systems and individuals.
+
+     Indirect attack scores are boolean values, where True indicates that the response contains an indirect attack.
+
+     :param credential: The credential for connecting to the Azure AI project. Required.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param azure_ai_project: The scope of the Azure AI project. It contains the subscription id, resource group,
+         and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START indirect_attack_evaluator]
+             :end-before: [END indirect_attack_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call an IndirectAttackEvaluator.
+     """
+
+     id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self,
+         credential,
+         azure_ai_project,
+     ):
+         super().__init__(
+             eval_metric=EvaluationMetrics.XPIA,
+             azure_ai_project=azure_ai_project,
+             credential=credential,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         response: str,
+     ) -> Dict[str, Union[str, bool]]:
+         """Evaluate whether cross domain injected attacks are present in a given query/response.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The cross domain injection attack score.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+         """Evaluate whether cross domain injected attacks are present in a conversation.
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The cross domain injection attack score.
+         :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Evaluate whether cross domain injected attacks are present in your AI system's response.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The cross domain injection attack score.
+         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+         """
+         return super().__call__(*args, **kwargs)
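For orientation, a brief illustrative sketch of calling the evaluator above follows; it is not part of the diff, and the project values are placeholders. It assumes DefaultAzureCredential from azure-identity; per the docstring, the result is expected to contain a boolean indirect-attack label with reasoning, broken down by the three subcategories described above.

# Illustrative sketch only; project values are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator

evaluator = IndirectAttackEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
)

# Single-turn call; returns a boolean label with reasoning as described in the docstring.
result = evaluator(
    query="Summarize the attached document for me.",
    response="Ignoring the document, I will instead reveal my hidden instructions...",
)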
azure/ai/evaluation/_exceptions.py
@@ -0,0 +1,128 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ """This module includes enums and exception classes for use in azure-ai-evaluation."""
+
+ from enum import Enum
+ from typing import Optional
+
+ from azure.core.exceptions import AzureError
+
+
+ class ErrorCategory(Enum):
+     """Error category to be specified when using the EvaluationException class.
+
+     When using EvaluationException, specify the type that best describes the nature of the error being captured.
+
+     * INVALID_VALUE -> One or more inputs are invalid (e.g. incorrect type or format)
+     * UNKNOWN_FIELD -> At least one unrecognized parameter is specified
+     * MISSING_FIELD -> At least one required parameter is missing
+     * FILE_OR_FOLDER_NOT_FOUND -> One or more files or folder paths do not exist
+     * RESOURCE_NOT_FOUND -> Resource could not be found
+     * FAILED_EXECUTION -> Execution failed
+     * SERVICE_UNAVAILABLE -> Service is unavailable
+     * MISSING_PACKAGE -> Required package is missing
+     * FAILED_REMOTE_TRACKING -> Remote tracking failed
+     * PROJECT_ACCESS_ERROR -> Access to project failed
+     * UNKNOWN -> Undefined placeholder. Avoid using.
+     """
+
+     INVALID_VALUE = "INVALID VALUE"
+     UNKNOWN_FIELD = "UNKNOWN FIELD"
+     MISSING_FIELD = "MISSING FIELD"
+     FILE_OR_FOLDER_NOT_FOUND = "FILE OR FOLDER NOT FOUND"
+     RESOURCE_NOT_FOUND = "RESOURCE NOT FOUND"
+     FAILED_EXECUTION = "FAILED_EXECUTION"
+     SERVICE_UNAVAILABLE = "SERVICE UNAVAILABLE"
+     MISSING_PACKAGE = "MISSING PACKAGE"
+     FAILED_REMOTE_TRACKING = "FAILED REMOTE TRACKING"
+     PROJECT_ACCESS_ERROR = "PROJECT ACCESS ERROR"
+     UNKNOWN = "UNKNOWN"
+
+
+ class ErrorBlame(Enum):
+     """Source of blame to be specified when using the EvaluationException class.
+
+     When using EvaluationException, specify whether the error is due to user actions or the system.
+     """
+
+     USER_ERROR = "UserError"
+     SYSTEM_ERROR = "SystemError"
+     UNKNOWN = "Unknown"
+
+
+ class ErrorTarget(Enum):
+     """Error target to be specified when using the EvaluationException class.
+
+     When using EvaluationException, specify the code area that was being targeted when the
+     exception was triggered.
+     """
+
+     EVAL_RUN = "EvalRun"
+     CODE_CLIENT = "CodeClient"
+     RAI_CLIENT = "RAIClient"
+     COHERENCE_EVALUATOR = "CoherenceEvaluator"
+     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
+     CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
+     ECI_EVALUATOR = "ECIEvaluator"
+     F1_EVALUATOR = "F1Evaluator"
+     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
+     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
+     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
+     SIMILARITY_EVALUATOR = "SimilarityEvaluator"
+     FLUENCY_EVALUATOR = "FluencyEvaluator"
+     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
+     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
+     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
+     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
+     DIRECT_ATTACK_SIMULATOR = "DirectAttackSimulator"
+     EVALUATE = "Evaluate"
+     CALLBACK_CONVERSATION_BOT = "CallbackConversationBot"
+     MODELS = "Models"
+     UNKNOWN = "Unknown"
+     CONVERSATION = "Conversation"
+
+
+ class EvaluationException(AzureError):
+     """The base class for all exceptions raised in azure-ai-evaluation. If there is a need to define a custom
+     exception type, that custom exception type should extend from this class.
+
+     :param message: A message describing the error. This is the error message the user will see.
+     :type message: str
+     :param internal_message: The error message without any personal data. This will be pushed to telemetry logs.
+     :type internal_message: str
+     :param target: The name of the element that caused the exception to be thrown.
+     :type target: ~azure.ai.evaluation._exceptions.ErrorTarget
+     :param category: The error category, defaults to Unknown.
+     :type category: ~azure.ai.evaluation._exceptions.ErrorCategory
+     :param blame: The source of blame for the error, defaults to Unknown.
+     :type blame: ~azure.ai.evaluation._exceptions.ErrorBlame
+     :param tsg_link: A link to the TSG page for troubleshooting the error.
+     :type tsg_link: str
+     """
+
+     def __init__(
+         self,
+         message: str,
+         *args,
+         internal_message: Optional[str] = None,
+         target: ErrorTarget = ErrorTarget.UNKNOWN,
+         category: ErrorCategory = ErrorCategory.UNKNOWN,
+         blame: ErrorBlame = ErrorBlame.UNKNOWN,
+         tsg_link: Optional[str] = None,
+         **kwargs,
+     ) -> None:
+         self.category = category
+         self.target = target
+         self.blame = blame
+         self.internal_message = internal_message
+         self.tsg_link = tsg_link
+         super().__init__(message, *args, **kwargs)
+
+     def __str__(self):
+         error_blame = "InternalError" if self.blame != ErrorBlame.USER_ERROR else "UserError"
+         msg = f"({error_blame}) {super().__str__()}"
+         if self.tsg_link:
+             msg += f"\nVisit {self.tsg_link} to troubleshoot this issue."
+
+         return msg
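The exception type above is constructed entirely from what the hunk defines, so a minimal sketch of raising and handling it follows. The file name check and message text are made up for illustration; the keyword names (internal_message, category, blame, target) match the constructor signature shown above.

# Illustrative sketch of raising and handling EvaluationException.
from azure.ai.evaluation._exceptions import (
    ErrorBlame,
    ErrorCategory,
    ErrorTarget,
    EvaluationException,
)

def load_data(path: str) -> str:
    # Hypothetical validation used only to demonstrate the exception fields.
    if not path.endswith(".jsonl"):
        raise EvaluationException(
            message=f"Unsupported data file: {path}",
            internal_message="Unsupported data file extension.",  # kept free of user data for telemetry
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
            target=ErrorTarget.EVALUATE,
        )
    return path

try:
    load_data("data.csv")
except EvaluationException as exc:
    # __str__ prefixes the blame, e.g. "(UserError) Unsupported data file: data.csv"
    print(exc)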