azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (100)
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
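
The hunks excerpted below appear to correspond to items 41-53 of this list (fluency.prompty through the _qa package __init__). As a quick orientation, here is a minimal sketch of how one of the new string-metric evaluators is invoked, taken from the GleuScoreEvaluator docstring shown further down; the top-level import path is an assumption based on the +60-line azure/ai/evaluation/__init__.py listed above.

.. code-block:: python

    # Sketch only: assumes azure/ai/evaluation/__init__.py re-exports GleuScoreEvaluator.
    from azure.ai.evaluation import GleuScoreEvaluator

    eval_fn = GleuScoreEvaluator()
    result = eval_fn(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"gleu_score": 0.41}, per the docstring in _gleu.py below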
azure/ai/evaluation/_evaluators/_fluency/fluency.prompty
@@ -0,0 +1,61 @@
+ ---
+ name: Fluency
+ description: Evaluates fluency score for QA scenario
+ model:
+   api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+   parameters:
+     temperature: 0.0
+     max_tokens: 1
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   query:
+     type: string
+   response:
+     type: string
+
+ ---
+ system:
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ user:
+ Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale:
+ One star: the answer completely lacks fluency
+ Two stars: the answer mostly lacks fluency
+ Three stars: the answer is partially fluent
+ Four stars: the answer is mostly fluent
+ Five stars: the answer has perfect fluency
+
+ This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+ question: What did you have for breakfast today?
+ answer: Breakfast today, me eating cereal and orange juice very good.
+ stars: 1
+
+ question: How do you feel when you travel alone?
+ answer: Alone travel, nervous, but excited also. I feel adventure and like its time.
+ stars: 2
+
+ question: When was the last time you went on a family vacation?
+ answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun.
+ stars: 3
+
+ question: What is your favorite thing about your job?
+ answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories.
+ stars: 4
+
+ question: Can you describe your morning routine?
+ answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am.
+ stars: 5
+
+ question: {{query}}
+ answer: {{response}}
+ stars:
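
The prompty above defines only the grading prompt (inputs: query and response; output: a 1-5 integer). The FluencyEvaluator that wraps it lives in _fluency/_fluency.py (+122 lines, listed above but not shown in this excerpt). Assuming it follows the same pattern as the GroundednessEvaluator further down, a model_config constructor plus keyword-only call arguments matching the prompty inputs, a minimal, hypothetical sketch:

.. code-block:: python

    # Sketch only: FluencyEvaluator's exact signature is not shown in this diff excerpt.
    from azure.ai.evaluation import FluencyEvaluator  # assumed top-level re-export

    # Hypothetical Azure OpenAI settings mirroring the prompty's azure_openai configuration.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment>",
    }

    eval_fn = FluencyEvaluator(model_config)
    result = eval_fn(
        query="What is your favorite thing about your job?",
        response="I enjoy learning from the diverse people I meet every day.",
    )
    # Expected shape, by analogy with the groundedness evaluator: {"gpt_fluency": <1-5>}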
azure/ai/evaluation/_evaluators/_gleu/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._gleu import GleuScoreEvaluator
+
+ __all__ = [
+     "GleuScoreEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_gleu/_gleu.py
@@ -0,0 +1,71 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from nltk.translate.gleu_score import sentence_gleu
+
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.utils import nltk_tokenize
+
+
+ class _AsyncGleuScoreEvaluator:
+     def __init__(self):
+         pass
+
+     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
+         reference_tokens = nltk_tokenize(ground_truth)
+         hypothesis_tokens = nltk_tokenize(response)
+
+         score = sentence_gleu([reference_tokens], hypothesis_tokens)
+
+         return {
+             "gleu_score": score,
+         }
+
+
+ class GleuScoreEvaluator:
+     """
+     Evaluator that computes the GLEU score between two strings.
+
+     The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
+     evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
+     sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
+     use cases such as machine translation, text summarization, and text generation.
+
+     **Usage**
+
+     .. code-block:: python
+
+         eval_fn = GleuScoreEvaluator()
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "gleu_score": 0.41
+         }
+     """
+
+     def __init__(self):
+         self._async_evaluator = _AsyncGleuScoreEvaluator()
+
+     def __call__(self, *, ground_truth: str, response: str, **kwargs):
+         """
+         Evaluate the GLEU score between the response and the ground truth.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The GLEU score.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
+         )
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_groundedness/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._groundedness import GroundednessEvaluator
+
+ __all__ = [
+     "GroundednessEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -0,0 +1,123 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import os
+ import re
+ from typing import Union
+
+ import numpy as np
+
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from promptflow.core import AsyncPrompty
+
+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ..._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )
+
+ try:
+     from ..._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = None
+
+
+ class _AsyncGroundednessEvaluator:
+     # Constants must be defined within eval's directory to be saveable/loadable
+     PROMPTY_FILE = "groundedness.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, model_config: dict):
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
+             USER_AGENT,
+         )
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, "groundedness.prompty")
+         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+     async def __call__(self, *, response: str, context: str, **kwargs):
+         # Validate input parameters
+         response = str(response or "")
+         context = str(context or "")
+
+         if not response.strip() or not context.strip():
+             msg = "Both 'response' and 'context' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.F1_EVALUATOR,
+             )
+
+         # Run the evaluation flow
+         llm_output = await self._flow(response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+
+         score = np.nan
+         if llm_output:
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+
+         return {"gpt_groundedness": float(score)}
+
+
+ class GroundednessEvaluator:
+     """
+     Initialize a groundedness evaluator configured for a specific Azure OpenAI model.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     **Usage**
+
+     .. code-block:: python
+
+         eval_fn = GroundednessEvaluator(model_config)
+         result = eval_fn(
+             response="The capital of Japan is Tokyo.",
+             context="Tokyo is Japan's capital, known for its blend of traditional culture \
+                 and technological advancements.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "gpt_groundedness": 5
+         }
+     """
+
+     def __init__(self, model_config: dict):
+         self._async_evaluator = _AsyncGroundednessEvaluator(model_config)
+
+     def __call__(self, *, response: str, context: str, **kwargs):
+         """
+         Evaluate groundedness of the response in the context.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context in which the response is evaluated.
+         :paramtype context: str
+         :return: The groundedness score.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, response=response, context=context, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty
@@ -0,0 +1,54 @@
+ ---
+ name: Groundedness
+ description: Evaluates groundedness score for QA scenario
+ model:
+   api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
+   parameters:
+     temperature: 0.0
+     max_tokens: 1
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   response:
+     type: string
+   context:
+     type: string
+
+ ---
+ system:
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ user:
+ You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating:
+ 1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+ 2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+ 3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation.
+ Independent Examples:
+ ## Example Task #1 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #1 Output:
+ 1
+ ## Example Task #2 Input:
+ {"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."}
+ ## Example Task #2 Output:
+ 5
+ ## Example Task #3 Input:
+ {"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."}
+ ## Example Task #3 Output:
+ 5
+ ## Example Task #4 Input:
+ {"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."}
+ ## Example Task #4 Output:
+ 1
+ ## Actual Task Input:
+ {"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{response}}}
+ Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question.
+ Actual Task Output:
azure/ai/evaluation/_evaluators/_meteor/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._meteor import MeteorScoreEvaluator
+
+ __all__ = [
+     "MeteorScoreEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_meteor/_meteor.py
@@ -0,0 +1,96 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import nltk
+ from nltk.translate.meteor_score import meteor_score
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.utils import nltk_tokenize
+
+
+ class _AsyncMeteorScoreEvaluator:
+     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
+         self._alpha = alpha
+         self._beta = beta
+         self._gamma = gamma
+
+         try:
+             nltk.find("corpora/wordnet.zip")
+         except LookupError:
+             nltk.download("wordnet")
+
+     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
+         reference_tokens = nltk_tokenize(ground_truth)
+         hypothesis_tokens = nltk_tokenize(response)
+
+         score = meteor_score(
+             [reference_tokens],
+             hypothesis_tokens,
+             alpha=self._alpha,
+             beta=self._beta,
+             gamma=self._gamma,
+         )
+
+         return {
+             "meteor_score": score,
+         }
+
+
+ class MeteorScoreEvaluator:
+     """
+     Evaluator that computes the METEOR Score between two strings.
+
+     The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
+     comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
+     other metrics like BLEU by considering synonyms, stemming, and paraphrasing. METEOR score considers synonyms and
+     word stems to more accurately capture meaning and language variations. In addition to machine translation and
+     text summarization, paraphrase detection is an optimal use case for the METEOR score.
+
+     :param alpha: The METEOR score alpha parameter. Default is 0.9.
+     :type alpha: float
+     :param beta: The METEOR score beta parameter. Default is 3.0.
+     :type beta: float
+     :param gamma: The METEOR score gamma parameter. Default is 0.5.
+     :type gamma: float
+
+     **Usage**
+
+     .. code-block:: python
+
+         eval_fn = MeteorScoreEvaluator(
+             alpha=0.9,
+             beta=3.0,
+             gamma=0.5
+         )
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "meteor_score": 0.62
+         }
+     """
+
+     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
+         self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
+
+     def __call__(self, *, ground_truth: str, response: str, **kwargs):
+         """
+         Evaluate the METEOR score between the response and the ground truth.
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The METEOR score.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
+         )
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_protected_material/__init__.py
@@ -0,0 +1,5 @@
+ from ._protected_material import ProtectedMaterialEvaluator
+
+ __all__ = [
+     "ProtectedMaterialEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py
@@ -0,0 +1,104 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._model_configurations import AzureAIProject
+
+
+ class _AsyncProtectedMaterialEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential
+
+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result
+
+
+ class ProtectedMaterialEvaluator:
+     """
+     Initialize a protected material evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]
+
+     **Usage**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "protected_material_label": "False",
+             "protected_material_reason": "This query does not contain any protected material."
+         }
+     """
+
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialEvaluator(azure_ai_project, credential)
+
+     def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates protected material content.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_protected_materials/__init__.py
@@ -0,0 +1,5 @@
+ from ._protected_materials import ProtectedMaterialsEvaluator
+
+ __all__ = [
+     "ProtectedMaterialsEvaluator",
+ ]
azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py
@@ -0,0 +1,104 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._model_configurations import AzureAIProject
+
+
+ class _AsyncProtectedMaterialsEvaluator:
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._azure_ai_project = azure_ai_project
+         self._credential = credential
+
+     async def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Any
+         """
+         # Validate inputs
+         # Raises value error if failed, so execution alone signifies success.
+         if not (query and query.strip() and query != "None") or not (
+             response and response.strip() and response != "None"
+         ):
+             msg = "Both 'query' and 'response' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
+             )
+
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service(
+             metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
+             query=query,
+             response=response,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result
+
+
+ class ProtectedMaterialsEvaluator:
+     """
+     Initialize a protected materials evaluator to detect whether protected material
+     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
+
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param credential: The credential for connecting to Azure AI project.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
+     :rtype: Dict[str, str]
+
+     **Usage**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         eval_fn = ProtectedMaterialsEvaluator(azure_ai_project)
+         result = eval_fn(query="What is the capital of France?", response="Paris.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "label": "False",
+             "reasoning": "This query does not contain any protected material."
+         }
+     """
+
+     def __init__(self, azure_ai_project: dict, credential=None):
+         self._async_evaluator = _AsyncProtectedMaterialsEvaluator(azure_ai_project, credential)
+
+     def __call__(self, *, query: str, response: str, **kwargs):
+         """
+         Evaluates protected materials content.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :return: A dictionary containing a boolean label and reasoning.
+         :rtype: dict
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
+
+     def _to_async(self):
+         return self._async_evaluator
azure/ai/evaluation/_evaluators/_qa/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._qa import QAEvaluator
+
+ __all__ = [
+     "QAEvaluator",
+ ]
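
This __init__.py only re-exports QAEvaluator; its implementation (_qa/_qa.py, +111 lines) is listed above but not included in this excerpt. Assuming it is a composite that fans out to the individual quality evaluators added in this release, a hypothetical sketch of the call shape:

.. code-block:: python

    # Sketch only: QAEvaluator's inputs and output keys are assumptions, not shown in this excerpt.
    from azure.ai.evaluation import QAEvaluator  # assumed top-level re-export

    qa_eval = QAEvaluator(model_config)  # model_config as in the fluency sketch above
    result = qa_eval(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is Japan's capital and largest city.",
        ground_truth="Tokyo is the capital of Japan.",
    )
    # Assumed keys, one per wrapped evaluator, e.g.:
    # gpt_groundedness, gpt_relevance, gpt_coherence, gpt_fluency, gpt_similarity, f1_score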