azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.

Files changed (105)
  1. azure/ai/evaluation/__init__.py +5 -31
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +120 -300
  4. azure/ai/evaluation/_common/utils.py +23 -381
  5. azure/ai/evaluation/_constants.py +6 -19
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
  9. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
  10. azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
  11. azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +28 -82
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +132 -203
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +1 -2
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
  60. azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
  61. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
  62. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
  63. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  64. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
  65. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
  66. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  67. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  68. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  69. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  70. azure/ai/evaluation/simulator/_simulator.py +207 -277
  71. azure/ai/evaluation/simulator/_tracing.py +4 -4
  72. azure/ai/evaluation/simulator/_utils.py +13 -31
  73. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
  74. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
  75. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  78. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  79. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  80. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  81. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  82. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  84. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  85. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  86. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  87. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  88. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  89. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  91. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  92. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  93. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  94. azure/ai/evaluation/_vendor/__init__.py +0 -3
  95. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  96. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  97. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  98. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  100. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  101. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  102. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  103. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  104. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  105. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty
@@ -0,0 +1,54 @@
+ ---
+ name: Groundedness
+ description: Evaluates groundedness score for QA scenario
+ model:
+   api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+   parameters:
+     temperature: 0.0
+     max_tokens: 1
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   response:
+     type: string
+   context:
+     type: string
+
+ ---
+ system:
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ user:
+ You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
+ 1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+ 2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+ 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
+ Independent Examples:
+ ## Example Task #1 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #1 Output:
+ 1
+ ## Example Task #2 Input:
+ {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
+ ## Example Task #2 Output:
+ 5
+ ## Example Task #3 Input:
+ {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
+ ## Example Task #3 Output:
+ 5
+ ## Example Task #4 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #4 Output:
+ 1
+ ## Actual Task Input:
+ {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}}
+ Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
+ Actual Task Output:
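
This prompty backs the prompt-based groundedness metric. As a rough sketch of how the corresponding evaluator is typically driven in 1.0.0b2 (the model_config values are placeholders, and the gpt_-prefixed output key follows the convention visible in the QAEvaluator hunk further below):

    from azure.ai.evaluation import GroundednessEvaluator

    # Placeholder model configuration; any Azure OpenAI chat deployment works here.
    model_config = {
        "azure_endpoint": "<your-azure-openai-endpoint>",
        "api_key": "<your-api-key>",
        "azure_deployment": "<your-deployment-name>",
    }

    groundedness = GroundednessEvaluator(model_config)
    result = groundedness(
        response="Tokyo is the capital of Japan.",
        context="Tokyo is the capital of Japan.",
    )
    # Expected shape, per the 1-to-5 scale defined in the prompty: {"gpt_groundedness": 5.0}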
azure/ai/evaluation/_evaluators/_meteor/_meteor.py
@@ -1,10 +1,11 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ import nltk
  from nltk.translate.meteor_score import meteor_score
  from promptflow._utils.async_utils import async_run_allowing_running_loop

- from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+ from azure.ai.evaluation._common.utils import nltk_tokenize


  class _AsyncMeteorScoreEvaluator:
@@ -13,7 +14,10 @@ class _AsyncMeteorScoreEvaluator:
          self._beta = beta
          self._gamma = gamma

-         ensure_nltk_data_downloaded()
+         try:
+             nltk.find("corpora/wordnet.zip")
+         except LookupError:
+             nltk.download("wordnet")

      async def __call__(self, *, ground_truth: str, response: str, **kwargs):
          reference_tokens = nltk_tokenize(ground_truth)
@@ -34,7 +38,7 @@ class _AsyncMeteorScoreEvaluator:

  class MeteorScoreEvaluator:
      """
-     Calculates the METEOR score for a given response and ground truth.
+     Evaluator that computes the METEOR Score between two strings.

      The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
      comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -42,12 +46,6 @@ class MeteorScoreEvaluator:
      word stems to more accurately capture meaning and language variations. In addition to machine translation and
      text summarization, paraphrase detection is an optimal use case for the METEOR score.

-     Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
-     n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
-     tasks like machine translation, text summarization, and text generation.
-
-     The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
-
      :param alpha: The METEOR score alpha parameter. Default is 0.9.
      :type alpha: float
      :param beta: The METEOR score beta parameter. Default is 3.0.
@@ -55,18 +53,27 @@ class MeteorScoreEvaluator:
      :param gamma: The METEOR score gamma parameter. Default is 0.5.
      :type gamma: float

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START meteor_score_evaluator]
-             :end-before: [END meteor_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
-     """
+     .. code-block:: python
+
+         eval_fn = MeteorScoreEvaluator(
+             alpha=0.9,
+             beta=3.0,
+             gamma=0.5
+         )
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")

-     id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "meteor_score": 0.62
+         }
+     """

      def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
          self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
@@ -80,7 +87,7 @@ class MeteorScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The METEOR score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
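
For orientation, the evaluator above is a thin wrapper over nltk's meteor_score plus a lazy corpus download. A minimal standalone sketch of the same computation (the punkt download is an assumption added here for tokenization; the diff itself only checks for wordnet):

    import nltk
    from nltk.translate.meteor_score import meteor_score

    # Download required NLTK data once if it is not already present.
    for resource, path in [("wordnet", "corpora/wordnet.zip"), ("punkt", "tokenizers/punkt")]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource)

    reference = nltk.word_tokenize("The capital of Japan is Tokyo.")
    hypothesis = nltk.word_tokenize("Tokyo is the capital of Japan.")

    # Same default parameters as MeteorScoreEvaluator; the result is a float in [0, 1].
    score = meteor_score([reference], hypothesis, alpha=0.9, beta=3.0, gamma=0.5)
    print(round(score, 2))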
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
@@ -1,113 +1,104 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop

- from typing import Dict, List, Optional, Union
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

- from typing_extensions import overload, override

- from azure.ai.evaluation._common._experimental import experimental
- from azure.ai.evaluation._common.constants import EvaluationMetrics
- from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
- from azure.ai.evaluation._model_configurations import Conversation
+ class _AsyncProtectedMaterialEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential

+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.

- @experimental
- class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
-     """
-     Evaluates the protected material score for a given query and response or a multi-turn conversation, with reasoning.
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result

-     Protected material is any text that is under copyright, including song lyrics, recipes, and articles. Protected
-     material evaluation leverages the Azure AI Content Safety Protected Material for Text service to perform the
-     classification.

-     The protected material score is a boolean value, where True indicates that protected material was detected.
+ class ProtectedMaterialEvaluator:
+     """
+     Initialize a protected material evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.

-     :param credential: The credential required for connecting to the Azure AI project.
-     :type credential: ~azure.core.credentials.TokenCredential
-     :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-         resource group, and project name.
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
      :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START protected_material_evaluator]
-             :end-before: [END protected_material_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a ProtectedMaterialEvaluator.
-     """
+     .. code-block:: python

-     id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
-
-     @override
-     def __init__(
-         self,
-         credential,
-         azure_ai_project,
-     ):
-         super().__init__(
-             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
-             azure_ai_project=azure_ai_project,
-             credential=credential,
-         )
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")

-     @overload
-     def __call__(
-         self,
-         *,
-         query: str,
-         response: str,
-     ) -> Dict[str, Union[str, bool]]:
-         """Evaluate a given query/response pair for protected material
+     **Output format**

-         :keyword query: The query to be evaluated.
-         :paramtype query: str
-         :keyword response: The response to be evaluated.
-         :paramtype response: str
-         :return: The protected material score.
-         :rtype: Dict[str, Union[str, bool]]
-         """
+     .. code-block:: python

-     @overload
-     def __call__(
-         self,
-         *,
-         conversation: Conversation,
-     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
-         """Evaluate a conversation for protected material
-
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The protected material score.
-         :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]
-         """
+         {
+             "protected_material_label": "False",
+             "protected_material_reason": "This query does not contain any protected material."
+         }
+     """
+
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialEvaluator(azure_ai_project, credential)

-     @override
-     def __call__(
-         self,
-         *,
-         query: Optional[str] = None,
-         response: Optional[str] = None,
-         conversation=None,
-         **kwargs,
-     ):
+     def __call__(self, *, query: str, response: str, **kwargs):
          """
-         Evaluate if protected material is present in your AI system's response.
+         Evaluates protected material content.

          :keyword query: The query to be evaluated.
-         :paramtype query: Optional[str]
+         :paramtype query: str
          :keyword response: The response to be evaluated.
-         :paramtype response: Optional[str]
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages". Conversation turns are expected
-             to be dictionaries with keys "content" and "role".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The fluency score.
-         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+         :paramtype response: str
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
          """
-         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
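
The hunk above changes the evaluator's public call surface between the two versions. A hedged side-by-side sketch based only on the signatures shown here (inputs are placeholders; the two constructions would not normally appear in the same script):

    from azure.ai.evaluation import ProtectedMaterialEvaluator
    from azure.identity import DefaultAzureCredential

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    # 1.0.0 (left side): credential comes first and multi-turn conversations are supported.
    evaluator = ProtectedMaterialEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = evaluator(conversation={"messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
    ]})

    # 1.0.0b2 (right side): credential is optional and only single-turn query/response pairs are accepted.
    evaluator = ProtectedMaterialEvaluator(azure_ai_project)
    result = evaluator(query="What is the capital of France?", response="Paris.")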
azure/ai/evaluation/_evaluators/_protected_materials/__init__.py
@@ -0,0 +1,5 @@
+ from ._protected_materials import ProtectedMaterialsEvaluator
+
+ __all__ = [
+     "ProtectedMaterialsEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
@@ -0,0 +1,104 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+
+ class _AsyncProtectedMaterialsEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential
+
+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result
+
+
+ class ProtectedMaterialsEvaluator:
+     """
+     Initialize a protected materials evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]
+
+     **Usage**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialsEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "label": "False",
+             "reasoning": "This query does not contain any protected material."
+         }
+     """
+
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential)
+
+     def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates protected materials content.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -3,7 +3,6 @@
  # ---------------------------------------------------------

  from concurrent.futures import as_completed
- from typing import Callable, Dict, List, Union

  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

@@ -22,33 +21,39 @@ class QAEvaluator:
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
-     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
-     :param kwargs: Additional arguments to pass to the evaluator.
-     :type kwargs: Any
+     :return: A function that evaluates and generates metrics for "question-answering" scenario.
+     :rtype: Callable

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START qa_evaluator]
-             :end-before: [END qa_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a QAEvaluator.
+     .. code-block:: python

-     .. note::
+         eval_fn = QAEvaluator(model_config)
+         result = qa_eval(
+             query="Tokyo is the capital of which country?",
+             response="Japan",
+             context="Tokyo is the capital of Japan.",
+             ground_truth="Japan"
+         )

-         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old keys with the `gpt_` prefix are still be present in the output;
-         however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
-     """
+     **Output format**
+
+     .. code-block:: python

-     id = "qa"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+         {
+             "gpt_groundedness": 3.5,
+             "gpt_relevance": 4.0,
+             "gpt_coherence": 1.5,
+             "gpt_fluency": 4.0,
+             "gpt_similarity": 3.0,
+             "f1_score": 0.42
+         }
+     """

-     def __init__(self, model_config, **kwargs):
-         self._parallel = kwargs.pop("_parallel", False)
+     def __init__(self, model_config: dict, parallel: bool = True):
+         self._parallel = parallel

-         self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
+         self._evaluators = [
              GroundednessEvaluator(model_config),
              RelevanceEvaluator(model_config),
              CoherenceEvaluator(model_config),
@@ -69,10 +74,12 @@ class QAEvaluator:
          :paramtype context: str
          :keyword ground_truth: The ground truth to be evaluated.
          :paramtype ground_truth: str
+         :keyword parallel: Whether to evaluate in parallel. Defaults to True.
+         :paramtype parallel: bool
          :return: The scores for QA scenario.
-         :rtype: Dict[str, Union[str, float]]
+         :rtype: dict
          """
-         results: Dict[str, Union[str, float]] = {}
+         results = {}
          if self._parallel:
              with ThreadPoolExecutor() as executor:
                  futures = {
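
The QAEvaluator hunk above also changes how parallel execution is configured. A brief sketch of the constructor difference, based only on the signatures shown here (model_config is a placeholder):

    from azure.ai.evaluation import QAEvaluator

    model_config = {
        "azure_endpoint": "<your-azure-openai-endpoint>",
        "api_key": "<your-api-key>",
        "azure_deployment": "<your-deployment-name>",
    }

    # 1.0.0 (left side): parallelism is opt-in via a private keyword and off by default.
    qa_eval = QAEvaluator(model_config, _parallel=True)

    # 1.0.0b2 (right side): parallelism is a public parameter and on by default.
    qa_eval = QAEvaluator(model_config, parallel=True)

    result = qa_eval(
        query="Tokyo is the capital of which country?",
        response="Japan",
        context="Tokyo is the capital of Japan.",
        ground_truth="Japan",
    )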