azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
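
The hunks reproduced below cover a representative subset of these files: the relevance, ROUGE, and similarity evaluators and their prompty templates. For readers who want to reproduce this kind of comparison locally, a minimal sketch follows; it assumes both wheels have already been downloaded, and the local filenames are illustrative:

    # Minimal sketch: compare the file lists of two locally downloaded wheels.
    # The filenames are illustrative; point them at wherever the wheels were saved.
    import zipfile

    old_whl = "azure_ai_evaluation-1.0.0-py3-none-any.whl"
    new_whl = "azure_ai_evaluation-1.0.0b1-py3-none-any.whl"

    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_files, new_files = set(old.namelist()), set(new.namelist())

    print("only in 1.0.0:", sorted(old_files - new_files))
    print("only in 1.0.0b1:", sorted(new_files - old_files))

Extracting both archives and running a recursive text diff over the two trees then yields per-file changes like those summarized above.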
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -3,112 +3,129 @@
  # ---------------------------------------------------------

  import os
- from typing import Dict, Union, List
+ import re
+ from typing import Union

- from typing_extensions import overload, override
+ import numpy as np

- from azure.ai.evaluation._model_configurations import Conversation
- from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from promptflow.core import AsyncPrompty

+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ..._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )

- class RelevanceEvaluator(PromptyEvaluatorBase):
-     """
-     Evaluates relevance score for a given query and response or a multi-turn conversation, including reasoning.
+ try:
+     from ..._user_agent import USER_AGENT
+ except ImportError:
+     USER_AGENT = None
+
+
+ class _AsyncRelevanceEvaluator:
+     # Constants must be defined within eval's directory to be save/loadable
+     PROMPTY_FILE = "relevance.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(self, model_config: dict):
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)

-     The relevance measure assesses the ability of answers to capture the key points of the context.
-     High relevance scores signify the AI system's understanding of the input and its capability to produce coherent
-     and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might
-     be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Use the relevance
-     metric when evaluating the AI system's performance in understanding the input and generating contextually
-     appropriate responses.
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

-     Relevance scores range from 1 to 5, with 1 being the worst and 5 being the best.
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
+             USER_AGENT,
+         )
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+     async def __call__(self, *, query: str, response: str, context: str, **kwargs):
+         # Validate input parameters
+         query = str(query or "")
+         response = str(response or "")
+         context = str(context or "")
+
+         if not (query.strip() and response.strip() and context.strip()):
+             msg = "'query', 'response' and 'context' must be non-empty strings."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 error_category=ErrorCategory.MISSING_FIELD,
+                 error_blame=ErrorBlame.USER_ERROR,
+                 error_target=ErrorTarget.RELEVANCE_EVALUATOR,
+             )
+
+         # Run the evaluation flow
+         llm_output = await self._flow(
+             query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+         )
+
+         score = np.nan
+         if llm_output:
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+
+         return {"gpt_relevance": float(score)}
+
+
+ class RelevanceEvaluator:
+     """
+     Initialize a relevance evaluator configured for a specific Azure OpenAI model.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START relevance_evaluator]
-             :end-before: [END relevance_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
+     .. code-block:: python

-     .. note::
+         eval_fn = RelevanceEvaluator(model_config)
+         result = eval_fn(
+             query="What is the capital of Japan?",
+             response="The capital of Japan is Tokyo.",
+             context="Tokyo is Japan's capital, known for its blend of traditional culture \
+                 and technological advancements.")

-         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-     """
+     **Output format**

-     # Constants must be defined within eval's directory to be save/loadable
-     _PROMPTY_FILE = "relevance.prompty"
-     _RESULT_KEY = "relevance"
+     .. code-block:: python

-     id = "azureml://registries/azureml/models/Relevance-Evaluator/versions/4"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+         {
+             "gpt_relevance": 3.0
+         }
+     """

-     @override
-     def __init__(self, model_config):
-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
-
-     @overload
-     def __call__(
-         self,
-         *,
-         query: str,
-         response: str,
-     ) -> Dict[str, Union[str, float]]:
-         """Evaluate groundedness for given input of query, response, context
+     def __init__(self, model_config: dict):
+         self._async_evaluator = _AsyncRelevanceEvaluator(model_config)
+
+     def __call__(self, *, query: str, response: str, context: str, **kwargs):
+         """
+         Evaluate relevance.

          :keyword query: The query to be evaluated.
          :paramtype query: str
          :keyword response: The response to be evaluated.
          :paramtype response: str
+         :keyword context: The context to be evaluated.
+         :paramtype context: str
          :return: The relevance score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
+         return async_run_allowing_running_loop(
+             self._async_evaluator, query=query, response=response, context=context, **kwargs
+         )

-     @overload
-     def __call__(
-         self,
-         *,
-         conversation: Conversation,
-     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
-         """Evaluate relevance for a conversation
-
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The relevance score.
-         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
-         """
-
-     @override
-     def __call__(  # pylint: disable=docstring-missing-param
-         self,
-         *args,
-         **kwargs,
-     ):
-         """Evaluate relevance. Accepts either a query and response for a single evaluation,
-         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
-         the evaluator will aggregate the results of each turn.
-
-         :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
-         :paramtype query: Optional[str]
-         :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
-         :paramtype response: Optional[str]
-         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-             key "messages", and potentially a global context under the key "context". Conversation turns are expected
-             to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-         :return: The relevance score.
-         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
-         """
-         return super().__call__(*args, **kwargs)
+     def _to_async(self):
+         return self._async_evaluator
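
In 1.0.0 (the left-hand side of this hunk), RelevanceEvaluator is built on PromptyEvaluatorBase, accepts either a query/response pair or a whole conversation, and reports both a `relevance` key and the legacy `gpt_relevance` key. In 1.0.0b1 it wraps `_AsyncRelevanceEvaluator`, requires `query`, `response`, and `context`, and reports only `gpt_relevance`. A minimal usage sketch of the 1.0.0b1 shape, with placeholder configuration values that are not taken from this diff:

    # Usage sketch for the 1.0.0b1 RelevanceEvaluator shown above.
    # The model_config values are placeholders, not values from this diff.
    from azure.ai.evaluation import RelevanceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "api_key": "<api-key>",  # placeholder
        "azure_deployment": "<deployment-name>",  # placeholder
    }

    evaluator = RelevanceEvaluator(model_config)
    result = evaluator(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is Japan's capital.",
    )
    print(result)  # e.g. {"gpt_relevance": 5.0}

Code written against 1.0.0 therefore has to pass `context` explicitly and read the `gpt_relevance` key when run against 1.0.0b1.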
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -3,9 +3,14 @@ name: Relevance
  description: Evaluates relevance score for QA scenario
  model:
    api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
-     max_tokens: 800
+     max_tokens: 1
      top_p: 1.0
      presence_penalty: 0
      frequency_penalty: 0
@@ -17,84 +22,48 @@ inputs:
      type: string
    response:
      type: string
+   context:
+     type: string

  ---
  system:
- # Instruction
- ## Goal
- ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
- - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
- - **Data**: Your input data include QUERY and RESPONSE.
- - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
-
+ You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
  user:
- # Definition
- **Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information.
-
- # Ratings
- ## [Relevance: 1] (Irrelevant Response)
- **Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed.
-
- **Examples:**
- **Query:** What is the team preparing for?
- **Response:** I went grocery shopping yesterday evening.
-
- **Query:** When will the company's new product line launch?
- **Response:** International travel can be very rewarding and educational.
-
- ## [Relevance: 2] (Incorrect Response)
- **Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information.
-
- **Examples:**
- **Query:** When was the merger between the two firms finalized?
- **Response:** The merger was finalized on April 10th.
-
- **Query:** Where and when will the solar eclipse be visible?
- **Response:** The solar eclipse will be visible in Asia on December 14th.
-
- ## [Relevance: 3] (Incomplete Response)
- **Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The restaurant offers Italian food like pasta.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy and climate change.
-
- ## [Relevance: 4] (Complete Response)
- **Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices.
-
- ## [Relevance: 5] (Comprehensive Response with Insights)
- **Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding.
-
- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.
-
- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues.
-
-
-
- # Data
- QUERY: {{query}}
- RESPONSE: {{response}}
-
-
- # Tasks
- ## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
- - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
- - **Explanation**: a very short explanation of why you think the input Data should get that Score.
- - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
-
-
- ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
- # Output
+ Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
+ One star: the answer completely lacks relevance
+ Two stars: the answer mostly lacks relevance
+ Three stars: the answer is partially relevant
+ Four stars: the answer is mostly relevant
+ Five stars: the answer has perfect relevance
+
+ This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+ context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
+ question: What field did Marie Curie excel in?
+ answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
+ stars: 1
+
+ context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
+ question: Where were The Beatles formed?
+ answer: The band The Beatles began their journey in London, England, and they changed the history of music.
+ stars: 2
+
+ context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
+ question: What are the main goals of Perseverance Mars rover mission?
+ answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
+ stars: 3
+
+ context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
+ question: What are the main components of the Mediterranean diet?
+ answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
+ stars: 4
+
+ context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
+ question: What are the main attractions of the Queen's Royal Castle?
+ answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
+ stars: 5
+
+ context: {{context}}
+ question: {{query}}
+ answer: {{response}}
+ stars:
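
The prompty change also explains the `max_tokens` drop from 800 to 1: the 1.0.0 prompt asks for a tagged chain of thought, explanation, and score, while the 1.0.0b1 prompt ends with `stars:` and expects a single integer, which `_AsyncRelevanceEvaluator` then extracts with a one-digit regex. A small sketch of that parsing step, with an illustrative completion:

    # Score extraction as implemented in _AsyncRelevanceEvaluator above: with
    # max_tokens: 1 the prompty returns a single token such as "4", and the
    # first digit found becomes the score (np.nan if no digit is present).
    import re

    import numpy as np

    llm_output = "4"  # illustrative single-token completion

    score = np.nan
    if llm_output:
        match = re.search(r"\d", llm_output)
        if match:
            score = float(match.group())

    print({"gpt_relevance": float(score)})  # {'gpt_relevance': 4.0}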
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,12 +3,12 @@
  # ---------------------------------------------------------
  from enum import Enum

- from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from rouge_score import rouge_scorer

- from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+ from promptflow._utils.async_utils import async_run_allowing_running_loop


- class RougeType(Enum):
+ class RougeType(str, Enum):
      """
      Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
      """
@@ -37,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
          self._rouge_type = rouge_type

      async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-         metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+         metrics = scorer.score(ground_truth, response)[self._rouge_type]
          return {
              "rouge_precision": metrics.precision,
              "rouge_recall": metrics.recall,
@@ -48,33 +48,33 @@ class _AsyncRougeScoreEvaluator:

  class RougeScoreEvaluator:
      """
-     Calculates the ROUGE score for a given response and ground truth.
+     Evaluator for computes the ROUGE scores between two strings.

-     The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
-     generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
-     ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
-     the generated text matches the reference text. Rouge type options are "rouge1" (Unigram overlap), "rouge2"
-     (Bigram overlap), "rouge3" (Trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), "rougeL"
-     (L-graph overlap)
+     ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic
+     summarization and machine translation. It measures the overlap between generated text and reference summaries.
+     ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text
+     summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text
+     coherence and relevance are critical.

-     Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
-     other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
-     information from the reference text.
+     **Usage**

-     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+     .. code-block:: python

-     .. admonition:: Example:
+         eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
+         result = eval_fn(
+             response="Tokyo is the capital of Japan.",
+             ground_truth="The capital of Japan is Tokyo.")

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START rouge_score_evaluator]
-             :end-before: [END rouge_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
-     """
+     **Output format**

-     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     .. code-block:: python
+
+         {
+             "rouge_precision": 1.0,
+             "rouge_recall": 1.0,
+             "rouge_f1_score": 1.0
+         }
+     """

      def __init__(self, rouge_type: RougeType):
          self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)
@@ -88,7 +88,7 @@ class RougeScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The ROUGE score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
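
Two things change in the ROUGE evaluator: 1.0.0b1 imports `rouge_scorer` from the external rouge-score package rather than the vendored copy, and `RougeType` becomes a `str` mixin enum. The latter is what lets the evaluator drop the `.value` accessors: a str-mixin member is itself a string, so it can be passed as a rouge type and used as a result key directly. A minimal sketch, assuming the member value follows the rouge-score naming convention ("rouge1"):

    # Why RougeType(str, Enum) removes the need for ".value": a str-mixin enum
    # member is a string, so rouge_scorer accepts it as a rouge type and the
    # returned dict can be indexed with the member itself.
    # The member value "rouge1" is assumed here; it is not shown in this hunk.
    from enum import Enum

    from rouge_score import rouge_scorer


    class RougeType(str, Enum):
        ROUGE_1 = "rouge1"


    scorer = rouge_scorer.RougeScorer(rouge_types=[RougeType.ROUGE_1])
    metrics = scorer.score("The capital of Japan is Tokyo.",
                           "Tokyo is the capital of Japan.")[RougeType.ROUGE_1]
    print(metrics.precision, metrics.recall, metrics.fmeasure)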
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -2,53 +2,54 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- import math
  import os
  import re
+ from typing import Union
+
+ import numpy as np

  from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
  from promptflow.core import AsyncPrompty

- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
- from ..._common.utils import construct_prompty_model_config, validate_model_config
+ from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ..._common.utils import (
+     check_and_add_api_version_for_aoai_model_config,
+     check_and_add_user_agent_for_aoai_model_config,
+ )

  try:
      from ..._user_agent import USER_AGENT
  except ImportError:
-     USER_AGENT = "None"
+     USER_AGENT = None


  class _AsyncSimilarityEvaluator:
      # Constants must be defined within eval's directory to be save/loadable
-     _PROMPTY_FILE = "similarity.prompty"
-     _LLM_CALL_TIMEOUT = 600
-     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+     PROMPTY_FILE = "similarity.prompty"
+     LLM_CALL_TIMEOUT = 600
+     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

      def __init__(self, model_config: dict):
-         prompty_model_config = construct_prompty_model_config(
-             validate_model_config(model_config),
-             self._DEFAULT_OPEN_API_VERSION,
+         check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+         # https://github.com/encode/httpx/discussions/2959
+         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+         check_and_add_user_agent_for_aoai_model_config(
+             model_config,
+             prompty_model_config,
              USER_AGENT,
          )

          current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
          self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

      async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-         """
-         Evaluate similarity.
-
-         :keyword query: The query to be evaluated.
-         :paramtype query: str
-         :keyword response: The response to be evaluated.
-         :paramtype response: str
-         :keyword ground_truth: The ground truth to be evaluated.
-         :paramtype ground_truth: str
-         :return: The similarity score.
-         :rtype: Dict[str, float]
-         """
          # Validate input parameters
          query = str(query or "")
          response = str(response or "")
@@ -66,57 +67,46 @@ class _AsyncSimilarityEvaluator:

          # Run the evaluation flow
          llm_output = await self._flow(
-             query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
+             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
          )

-         score = math.nan
+         score = np.nan
          if llm_output:
              match = re.search(r"\d", llm_output)
              if match:
                  score = float(match.group())

-         return {"similarity": float(score), "gpt_similarity": float(score)}
+         return {"gpt_similarity": float(score)}


  class SimilarityEvaluator:
      """
-     Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
-
-     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
-     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
-     the ground truth and the model's prediction, which are high-dimensional vector representations capturing
-     the semantic meaning and context of the sentences.
-
-     Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
-     tasks where you have access to ground truth responses. Similarity enables you to assess the generated
-     text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.
-
-     Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.
+     Initialize a similarity evaluator configured for a specific Azure OpenAI model.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]

-     .. admonition:: Example:
+     **Usage**

-         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-             :start-after: [START rouge_score_evaluator]
-             :end-before: [END rouge_score_evaluator]
-             :language: python
-             :dedent: 8
-             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+     .. code-block:: python

-     .. note::
+         eval_fn = SimilarityEvaluator(model_config)
+         result = eval_fn(
+             query="What is the capital of Japan?",
+             response="The capital of Japan is Tokyo.",
+             ground_truth="Tokyo is Japan's capital.")

-         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
-         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
-     """
+     **Output format**

-     id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
-     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     .. code-block:: python

-     def __init__(self, model_config):
+         {
+             "gpt_similarity": 3.0
+         }
+     """
+
+     def __init__(self, model_config: dict):
          self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

      def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
@@ -130,7 +120,7 @@ class SimilarityEvaluator:
          :keyword ground_truth: The ground truth to be evaluated.
          :paramtype ground_truth: str
          :return: The similarity score.
-         :rtype: Dict[str, float]
+         :rtype: dict
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
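
As with relevance, the output shape differs across the two wheels: 1.0.0 returns both `similarity` and the legacy `gpt_similarity` key, while 1.0.0b1 returns only `gpt_similarity`. A small sketch of reading the score in a way that tolerates either shape:

    # Hedged sketch: read the similarity score regardless of which wheel is installed.
    # 1.0.0 returns {"similarity": ..., "gpt_similarity": ...};
    # 1.0.0b1 returns only {"gpt_similarity": ...}.
    def get_similarity_score(result: dict) -> float:
        return result.get("similarity", result.get("gpt_similarity"))


    print(get_similarity_score({"gpt_similarity": 3.0}))  # 1.0.0b1-style output
    print(get_similarity_score({"similarity": 3.0, "gpt_similarity": 3.0}))  # 1.0.0-style output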
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty

@@ -3,6 +3,11 @@ name: Similarity
  description: Evaluates similarity score for QA scenario
  model:
    api: chat
+   configuration:
+     type: azure_openai
+     azure_deployment: ${env:AZURE_DEPLOYMENT}
+     api_key: ${env:AZURE_OPENAI_API_KEY}
+     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
      max_tokens: 1
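
Finally, both prompty files gain an inline `configuration` block in 1.0.0b1 whose values resolve from environment variables. The evaluator classes still pass their own configuration when loading the prompty with `AsyncPrompty.load`, so this block presumably matters mainly when a prompty file is used standalone; a sketch of setting the expected variables, with placeholder values:

    # Placeholder values for the environment variables referenced by the new
    # ${env:...} configuration block; relevant mainly when a prompty file is
    # used outside the evaluator classes, which supply their own configuration.
    import os

    os.environ["AZURE_DEPLOYMENT"] = "<deployment-name>"
    os.environ["AZURE_OPENAI_API_KEY"] = "<api-key>"
    os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<your-resource>.openai.azure.com"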