azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the source registry flags this version of azure-ai-evaluation as potentially problematic.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
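
The renames above change the public import surface as well as the internals: the beta wheel ships both a ProtectedMaterialEvaluator and a plural ProtectedMaterialsEvaluator, while 1.0.0 keeps only the singular class (see the hunks below). A minimal sketch of how downstream code might tolerate both wheels, assuming the plural name is importable from the package root in the beta; the first import is the part both versions guarantee:

    # Hedged sketch: prefer the singular evaluator, which both wheels export; fall back to
    # the beta-only plural alias only if the singular name is unavailable.
    try:
        from azure.ai.evaluation import ProtectedMaterialEvaluator  # present in 1.0.0 and 1.0.0b1
    except ImportError:
        # 1.0.0b1 also ships ProtectedMaterialsEvaluator (different documented output keys, see below).
        from azure.ai.evaluation import ProtectedMaterialsEvaluator as ProtectedMaterialEvaluator
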
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty
@@ -0,0 +1,54 @@
+ ---
+ name: Groundedness
+ description: Evaluates groundedness score for QA scenario
+ model:
+   api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+   parameters:
+     temperature: 0.0
+     max_tokens: 1
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   response:
+     type: string
+   context:
+     type: string
+
+ ---
+ system:
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ user:
+ You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
+ 1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+ 2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+ 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
+ Independent Examples:
+ ## Example Task #1 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #1 Output:
+ 1
+ ## Example Task #2 Input:
+ {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
+ ## Example Task #2 Output:
+ 5
+ ## Example Task #3 Input:
+ {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
+ ## Example Task #3 Output:
+ 5
+ ## Example Task #4 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #4 Output:
+ 1
+ ## Actual Task Input:
+ {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}}
+ Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
+ Actual Task Output:
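
The prompty above declares two inputs, response and context, and reads its Azure OpenAI settings from environment variables. A minimal sketch of how those inputs are typically supplied through the evaluator that wraps this template; the endpoint, deployment, and example strings are placeholders, not values from the diff:

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<api-key>",                                   # placeholder
    }

    groundedness = GroundednessEvaluator(model_config)
    result = groundedness(
        response="Tokyo is the capital of Japan.",
        context="Tokyo is the capital of Japan.",
    )
    # 1.0.0b1 reports the 1-5 rating as "gpt_groundedness"; 1.0.0 adds an un-prefixed
    # "groundedness" key alongside it (see the removed QAEvaluator note further down).
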
azure/ai/evaluation/_evaluators/_meteor/_meteor.py
@@ -1,10 +1,10 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ import nltk
  from nltk.translate.meteor_score import meteor_score
  from promptflow._utils.async_utils import async_run_allowing_running_loop
-
- from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+ from azure.ai.evaluation._common.utils import nltk_tokenize


  class _AsyncMeteorScoreEvaluator:
@@ -13,7 +13,10 @@ class _AsyncMeteorScoreEvaluator:
          self._beta = beta
          self._gamma = gamma

-         ensure_nltk_data_downloaded()
+         try:
+             nltk.find("corpora/wordnet.zip")
+         except LookupError:
+             nltk.download("wordnet")

      async def __call__(self, *, ground_truth: str, response: str, **kwargs):
          reference_tokens = nltk_tokenize(ground_truth)
@@ -34,7 +37,7 @@

  class MeteorScoreEvaluator:
      """
-     Calculates the METEOR score for a given response and ground truth.
+     Evaluator that computes the METEOR Score between two strings.

      The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
      comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -42,12 +45,6 @@ class MeteorScoreEvaluator:
      word stems to more accurately capture meaning and language variations. In addition to machine translation and
      text summarization, paraphrase detection is an optimal use case for the METEOR score.

-     Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
-     n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
-     tasks like machine translation, text summarization, and text generation.
-
-     The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
-
      :param alpha: The METEOR score alpha parameter. Default is 0.9.
      :type alpha: float
      :param beta: The METEOR score beta parameter. Default is 3.0.
@@ -55,18 +52,27 @@
      :param gamma: The METEOR score gamma parameter. Default is 0.5.
      :type gamma: float

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START meteor_score_evaluator]
-             :end-before: [END meteor_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
-     """
+     .. code-block:: python
+
+         eval_fn = MeteorScoreEvaluator(
+             alpha=0.9,
+             beta=3.0,
+             gamma=0.5
+         )
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")
+
+     **Output format**
+
+     .. code-block:: python

-     id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+         {
+             "meteor_score": 0.62
+         }
+     """

      def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
          self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
@@ -80,7 +86,7 @@ class MeteorScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The METEOR score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
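
The first two hunks swap a shared ensure_nltk_data_downloaded() helper (1.0.0) for an inline WordNet check (1.0.0b1). A standalone sketch of the same guard, useful before a large batch run so the corpus download does not happen lazily inside the first evaluation; nltk.data.find and nltk.download are standard NLTK calls, and the corpus path mirrors the one in the hunk:

    import nltk

    def ensure_wordnet() -> None:
        """Download the WordNet corpus once so METEOR's synonym matching has its data available."""
        try:
            nltk.data.find("corpora/wordnet.zip")
        except LookupError:
            nltk.download("wordnet")

    ensure_wordnet()
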
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
@@ -1,113 +1,104 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._model_configurations import AzureAIProject

- from typing import Dict, List, Optional, Union

- from typing_extensions import overload, override
+ class _AsyncProtectedMaterialEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential

- from azure.ai.evaluation._common._experimental import experimental
- from azure.ai.evaluation._common.constants import EvaluationMetrics
- from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
- from azure.ai.evaluation._model_configurations import Conversation
+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result


- @experimental
- class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+ class ProtectedMaterialEvaluator:
      """
-     Evaluates the protected material score for a given query and response or a multi-turn conversation, with reasoning.
+     Initialize a protected material evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.

-     Protected material is any text that is under copyright, including song lyrics, recipes, and articles. Protected
-     material evaluation leverages the Azure AI Content Safety Protected Material for Text service to perform the
-     classification.
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]

-     The protected material score is a boolean value, where True indicates that protected material was detected.
+     **Usage**

-     :param credential: The credential required for connecting to the Azure AI project.
-     :type credential: ~azure.core.credentials.TokenCredential
-     :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-         resource group, and project name.
-     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+     **Output format**

-     .. admonition:: Example:
+     .. code-block:: python

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START protected_material_evaluator]
-             :end-before: [END protected_material_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a ProtectedMaterialEvaluator.
+         {
+             "protected_material_label": "False",
+             "protected_material_reason": "This query does not contain any protected material."
+         }
      """

-     id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
-
-     @override
-     def __init__(
-         self,
-         credential,
-         azure_ai_project,
-     ):
-         super().__init__(
-             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
-             azure_ai_project=azure_ai_project,
-             credential=credential,
-         )
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialEvaluator(azure_ai_project, credential)

-     @overload
-     def __call__(
-         self,
-         *,
-         query: str,
-         response: str,
-     ) -> Dict[str, Union[str, bool]]:
-         """Evaluate a given query/response pair for protected material
+     def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates protected material content.

          :keyword query: The query to be evaluated.
          :paramtype query: str
          :keyword response: The response to be evaluated.
          :paramtype response: str
-         :return: The protected material score.
-         :rtype: Dict[str, Union[str, bool]]
-         """
-
-     @overload
-     def __call__(
-         self,
-         *,
-         conversation: Conversation,
-     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
-         """Evaluate a conversation for protected material
-
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The protected material score.
-         :rtype: Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]
-         """
-
-     @override
-     def __call__(
-         self,
-         *,
-         query: Optional[str] = None,
-         response: Optional[str] = None,
-         conversation=None,
-         **kwargs,
-     ):
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
          """
-         Evaluate if protected material is present in your AI system's response.
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)

-         :keyword query: The query to be evaluated.
-         :paramtype query: Optional[str]
-         :keyword response: The response to be evaluated.
-         :paramtype response: Optional[str]
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages". Conversation turns are expected
-             to be dictionaries with keys "content" and "role".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The fluency score.
-         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
-         """
-         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+     def _to_async(self):
+         return self._async_evaluator
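
The breaking part of this hunk is the constructor: 1.0.0b1 takes azure_ai_project first with an optional credential, while 1.0.0 requires credential as the first parameter. A small sketch that stays compatible with both orderings by passing everything by keyword; DefaultAzureCredential is one possible credential, not something this diff prescribes:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    evaluator = ProtectedMaterialEvaluator(
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),  # required first positional in 1.0.0, optional in 1.0.0b1
    )
    result = evaluator(query="What is the capital of France?", response="Paris.")

The shape of the result also differs: the beta documents string fields protected_material_label and protected_material_reason, whereas 1.0.0 returns a boolean label and additionally accepts a conversation input.
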
azure/ai/evaluation/_evaluators/_protected_materials/__init__.py
@@ -0,0 +1,5 @@
+ from ._protected_materials import ProtectedMaterialsEvaluator
+
+ __all__ = [
+     "ProtectedMaterialsEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
@@ -0,0 +1,104 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._model_configurations import AzureAIProject
+
+
+ class _AsyncProtectedMaterialsEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential
+
+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result
+
+
+ class ProtectedMaterialsEvaluator:
+     """
+     Initialize a protected materials evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]
+
+     **Usage**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialsEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "label": "False",
+             "reasoning": "This query does not contain any protected material."
+         }
+     """
+
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential)
+
+     def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates protected materials content.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
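
Apart from the class name, this file is essentially line-for-line the singular evaluator above; the user-visible difference is the documented output keys (label/reasoning here versus protected_material_label/protected_material_reason). A tiny, hypothetical adapter (not part of the package) for code that has to consume results from either class:

    def normalize_protected_material(result: dict) -> dict:
        """Map either documented key set onto the singular evaluator's names."""
        label = result.get("protected_material_label", result.get("label"))
        reason = result.get("protected_material_reason", result.get("reasoning"))
        return {"protected_material_label": label, "protected_material_reason": reason}
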
azure/ai/evaluation/_evaluators/_qa/_qa.py
@@ -3,7 +3,7 @@
  # ---------------------------------------------------------

  from concurrent.futures import as_completed
- from typing import Callable, Dict, List, Union
+ from typing import Union

  from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

@@ -11,6 +11,7 @@ from .._coherence import CoherenceEvaluator
  from .._f1_score import F1ScoreEvaluator
  from .._fluency import FluencyEvaluator
  from .._groundedness import GroundednessEvaluator
+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from .._relevance import RelevanceEvaluator
  from .._similarity import SimilarityEvaluator

@@ -22,33 +23,41 @@ class QAEvaluator:
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
-     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
-     :param kwargs: Additional arguments to pass to the evaluator.
-     :type kwargs: Any
+     :return: A function that evaluates and generates metrics for "question-answering" scenario.
+     :rtype: Callable

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START qa_evaluator]
-             :end-before: [END qa_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a QAEvaluator.
+     .. code-block:: python

-     .. note::
+         eval_fn = QAEvaluator(model_config)
+         result = qa_eval(
+             query="Tokyo is the capital of which country?",
+             response="Japan",
+             context="Tokyo is the capital of Japan.",
+             ground_truth="Japan"
+         )

-         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old keys with the `gpt_` prefix are still be present in the output;
-         however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
-     """
+     **Output format**
+
+     .. code-block:: python

-     id = "qa"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+         {
+             "gpt_groundedness": 3.5,
+             "gpt_relevance": 4.0,
+             "gpt_coherence": 1.5,
+             "gpt_fluency": 4.0,
+             "gpt_similarity": 3.0,
+             "f1_score": 0.42
+         }
+     """

-     def __init__(self, model_config, **kwargs):
-         self._parallel = kwargs.pop("_parallel", False)
+     def __init__(
+         self, model_config: dict, parallel: bool = True
+     ):
+         self._parallel = parallel

-         self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
+         self._evaluators = [
              GroundednessEvaluator(model_config),
              RelevanceEvaluator(model_config),
              CoherenceEvaluator(model_config),
@@ -69,15 +78,22 @@ class QAEvaluator:
          :paramtype context: str
          :keyword ground_truth: The ground truth to be evaluated.
          :paramtype ground_truth: str
+         :keyword parallel: Whether to evaluate in parallel. Defaults to True.
+         :paramtype parallel: bool
          :return: The scores for QA scenario.
-         :rtype: Dict[str, Union[str, float]]
+         :rtype: dict
          """
-         results: Dict[str, Union[str, float]] = {}
+         results = {}
          if self._parallel:
              with ThreadPoolExecutor() as executor:
                  futures = {
                      executor.submit(
-                         evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
+                         evaluator,
+                         query=query,
+                         response=response,
+                         context=context,
+                         ground_truth=ground_truth,
+                         **kwargs
                      ): evaluator
                      for evaluator in self._evaluators
                  }
@@ -87,7 +103,9 @@
                      results.update(future.result())
          else:
              for evaluator in self._evaluators:
-                 result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
+                 result = evaluator(
+                     query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
+                 )
                  results.update(result)

          return results
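
Two differences matter for callers of QAEvaluator: the beta exposes parallel execution as a public parallel=True constructor argument, while 1.0.0 hides it behind a private _parallel kwarg defaulting to False, and the beta reports only gpt_-prefixed metric names, while 1.0.0 adds un-prefixed keys next to them. A short sketch that sticks to what both wheels accept; the model_config values are placeholders, as in the groundedness sketch above:

    from azure.ai.evaluation import QAEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<api-key>",                                   # placeholder
    }

    qa_eval = QAEvaluator(model_config)  # positional model_config works on both wheels
    # qa_eval = QAEvaluator(model_config, parallel=False)  # 1.0.0b1 only; 1.0.0 uses the private _parallel kwarg

    result = qa_eval(
        query="Tokyo is the capital of which country?",
        response="Japan",
        context="Tokyo is the capital of Japan.",
        ground_truth="Japan",
    )
    f1 = result["f1_score"]  # present in both; LLM-judged metrics are gpt_-prefixed in the beta
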