azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (79)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/_common/constants.py +5 -0
  3. azure/ai/evaluation/_common/math.py +11 -0
  4. azure/ai/evaluation/_common/rai_service.py +172 -35
  5. azure/ai/evaluation/_common/utils.py +162 -23
  6. azure/ai/evaluation/_constants.py +6 -6
  7. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  8. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
  10. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  11. azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
  12. azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
  13. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
  14. azure/ai/evaluation/_evaluate/_utils.py +40 -7
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
  17. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
  19. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
  20. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  34. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  35. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  36. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  43. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
  44. azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
  45. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
  46. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
  48. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  49. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  50. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  51. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  52. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
  53. azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
  54. azure/ai/evaluation/_exceptions.py +17 -0
  55. azure/ai/evaluation/_model_configurations.py +18 -1
  56. azure/ai/evaluation/_version.py +1 -1
  57. azure/ai/evaluation/simulator/__init__.py +2 -1
  58. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  59. azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
  60. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  61. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  62. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  63. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  64. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  65. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
  66. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  69. azure/ai/evaluation/simulator/_simulator.py +115 -61
  70. azure/ai/evaluation/simulator/_utils.py +6 -6
  71. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
  72. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
  73. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  74. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  76. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  77. /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
  78. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  79. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py

@@ -7,7 +7,9 @@ import os
  import re
  import tempfile
  from pathlib import Path
- from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union
+ from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+ import uuid
+ import base64

  import pandas as pd
  from promptflow.client import PFClient

@@ -37,12 +39,6 @@ class AzureMLWorkspace(NamedTuple):
      workspace_name: str


- class EvaluateResult(TypedDict):
-     metrics: Dict[str, float]
-     studio_url: Optional[str]
-     rows: List[Dict]
-
-
  def is_none(value) -> bool:
      return value is None or str(value).lower() == "none"

@@ -87,6 +83,34 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
      return azure_pf_client, ws_triad


+ def _store_multimodal_content(messages, tmpdir: str):
+     # verify if images folder exists
+     images_folder_path = os.path.join(tmpdir, "images")
+     os.makedirs(images_folder_path, exist_ok=True)
+
+     # traverse all messages and replace base64 image data with new file name.
+     for message in messages:
+         if isinstance(message.get("content", []), list):
+             for content in message.get("content", []):
+                 if content.get("type") == "image_url":
+                     image_url = content.get("image_url")
+                     if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
+                         # Extract the base64 string
+                         base64image = image_url["url"].replace("data:image/jpg;base64,", "")
+
+                         # Generate a unique filename
+                         image_file_name = f"{str(uuid.uuid4())}.jpg"
+                         image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+                         # Decode the base64 string to binary image data
+                         image_data_binary = base64.b64decode(base64image)
+
+                         # Write the binary image data to the file
+                         image_file_path = os.path.join(images_folder_path, image_file_name)
+                         with open(image_file_path, "wb") as f:
+                             f.write(image_data_binary)
+
+
  def _log_metrics_and_instance_results(
      metrics: Dict[str, Any],
      instance_results: pd.DataFrame,

@@ -116,6 +140,15 @@ def _log_metrics_and_instance_results(
      artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN

      with tempfile.TemporaryDirectory() as tmpdir:
+         # storing multi_modal images if exists
+         col_name = "inputs.conversation"
+         if col_name in instance_results.columns:
+             for item in instance_results[col_name].items():
+                 value = item[1]
+                 if "messages" in value:
+                     _store_multimodal_content(value["messages"], tmpdir)
+
+         # storing artifact result
          tmp_path = os.path.join(tmpdir, artifact_name)

          with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
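For context, a small usage sketch of the new image-offloading helper. It calls the private _store_multimodal_content function added above; the import path is internal to the wheel and may change between betas, and the short byte string stands in for real JPEG data.

import base64
import json
import tempfile

from azure.ai.evaluation._evaluate._utils import _store_multimodal_content

# A conversation turn whose content embeds an image as a base64 data URL.
# Only URLs starting with "data:image/jpg;base64," are rewritten by the helper.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpg;base64," + base64.b64encode(b"\xff\xd8\xff\xe0").decode()},
            },
        ],
    }
]

with tempfile.TemporaryDirectory() as tmpdir:
    _store_multimodal_content(messages, tmpdir)
    # The decoded bytes are written to <tmpdir>/images/<uuid>.jpg and the data URL
    # is replaced in place with that relative path.
    print(json.dumps(messages[0]["content"][1], indent=2))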
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -63,7 +63,7 @@ class BleuScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The BLEU score.
-         :rtype: dict
+         :rtype: Dict[str, float]
          """
          return async_run_allowing_running_loop(
              self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -31,18 +31,23 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
      .. code-block:: python

          {
-             "gpt_coherence": 1.0
+             "coherence": 1.0,
+             "gpt_coherence": 1.0,
          }
+
+     Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
+     To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
      """

-     PROMPTY_FILE = "coherence.prompty"
-     RESULT_KEY = "gpt_coherence"
+     _PROMPTY_FILE = "coherence.prompty"
+     _RESULT_KEY = "coherence"

      @override
-     def __init__(self, model_config: dict):
+     def __init__(self, model_config):
          current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

      @override
      def __call__(

@@ -50,7 +55,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
          *,
          query: Optional[str] = None,
          response: Optional[str] = None,
-         conversation: Optional[dict] = None,
+         conversation=None,
          **kwargs,
      ):
          """Evaluate coherence. Accepts either a query and response for a single evaluation,

@@ -64,8 +69,8 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
          :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
              key "messages". Conversation turns are expected
              to be dictionaries with keys "content" and "role".
-         :paramtype conversation: Optional[Dict]
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
          :return: The relevance score.
-         :rtype: Dict[str, float]
+         :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
          """
          return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
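A hedged usage sketch of the dual-key output described in the note above. The endpoint, deployment, and key values are placeholders, and the scores and reason text in the comment are illustrative; the coherence_reason key is expected because the coherence prompty now returns a reason alongside the score.

from azure.ai.evaluation import CoherenceEvaluator

# Placeholder Azure OpenAI model configuration.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What causes earthquakes?",
    response="Earthquakes happen when tectonic plates shift suddenly, releasing stored energy.",
)
# Expected shape in 1.0.0b5 (values illustrative):
# {"coherence": 4.0, "gpt_coherence": 4.0, "coherence_reason": "..."}
print(result)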
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

@@ -5,7 +5,7 @@ model:
    api: chat
    parameters:
      temperature: 0.0
-     max_tokens: 1
+     max_tokens: 800
      top_p: 1.0
      presence_penalty: 0
      frequency_penalty: 0

@@ -20,38 +20,80 @@ inputs:

  ---
  system:
- You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data include a QUERY and a RESPONSE.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.

  user:
- Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale:
- One star: the answer completely lacks coherence
- Two stars: the answer mostly lacks coherence
- Three stars: the answer is partially coherent
- Four stars: the answer is mostly coherent
- Five stars: the answer has perfect coherency
-
- This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
-
- question: What is your favorite indoor activity and why do you enjoy it?
- answer: I like pizza. The sun is shining.
- stars: 1
-
- question: Can you describe your favorite movie without giving away any spoilers?
- answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain.
- stars: 2
-
- question: What are some benefits of regular exercise?
- answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green.
- stars: 3
-
- question: How do you cope with stress in your daily life?
- answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities.
- stars: 4
-
- question: What can you tell me about climate change and its effects on the environment?
- answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
- stars: 5
-
- question: {{query}}
- answer: {{response}}
- stars:
+ # Definition
+ **Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas.
+
+ # Ratings
+ ## [Coherence: 1] (Incoherent Response)
+ **Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible.
+
+ **Examples:**
+ **Query:** What are the benefits of renewable energy?
+ **Response:** Wind sun green jump apple silence over.
+
+ **Query:** Explain the process of photosynthesis.
+ **Response:** Plants light water flying blue music.
+
+ ## [Coherence: 2] (Poorly Coherent Response)
+ **Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand.
+
+ **Examples:**
+ **Query:** How does vaccination work?
+ **Response:** Vaccines protect disease. Immune system fight. Health better.
+
+ **Query:** Describe how a bill becomes a law.
+ **Response:** Idea proposed. Congress discuss vote. President signs.
+
+ ## [Coherence: 3] (Partially Coherent Response)
+ **Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order.
+
+ **Examples:**
+ **Query:** What causes earthquakes?
+ **Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage.
+
+ **Query:** Explain the importance of the water cycle.
+ **Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water.
+
+ ## [Coherence: 4] (Coherent Response)
+ **Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow.
+
+ **Examples:**
+ **Query:** What is the water cycle and how does it work?
+ **Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally.
+
+ **Query:** Describe the role of mitochondria in cellular function.
+ **Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival.
+
+ ## [Coherence: 5] (Highly Coherent Response)
+ **Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision.
+
+ **Examples:**
+ **Query:** Analyze the economic impacts of climate change on coastal cities.
+ **Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth.
+
+ **Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy.
+ **Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs.
+
+
+ # Data
+ QUERY: {{query}}
+ RESPONSE: {{response}}
+
+
+ # Tasks
+ ## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+ - **Explanation**: a very short explanation of why you think the input Data should get that Score.
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+
+ ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
+ # Output
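The rewritten prompt asks the judge model to wrap its chain of thought, explanation, and score in <S0>, <S1>, and <S2> tags, which the SDK extracts with parse_quality_evaluator_reason_score (see the _base_prompty_eval.py hunk below). As a rough standalone illustration of that parsing step, not the SDK's actual implementation:

import re

# A plausible tagged completion from the judge model (contents are made up).
llm_output = (
    "<S0>Let's think step by step: the response stays on topic and the sentences "
    "connect logically.</S0>"
    "<S1>The answer is organized and easy to follow.</S1>"
    "<S2>4</S2>"
)

score_match = re.search(r"<S2>\s*(\d)\s*</S2>", llm_output)
reason_match = re.search(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)

score = float(score_match.group(1)) if score_match else float("nan")
reason = reason_match.group(1).strip() if reason_match else ""
print(score, reason)  # 4.0 The answer is organized and easy to follow.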
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -11,6 +11,7 @@ from typing_extensions import ParamSpec, TypeAlias

  from azure.ai.evaluation._common.math import list_mean
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._common.utils import remove_optional_singletons

  P = ParamSpec("P")
  T = TypeVar("T")

@@ -32,9 +33,9 @@ AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]

    foo: AggregateResult[float] = {
        "evaluation_per_turn": {
-           "gpt_coherence": [1.0, 2.0, 3.0]
+           "coherence": [1.0, 2.0, 3.0]
        },
-       "gpt_coherence": 2.0
+       "coherence": 2.0
    }
  """

@@ -44,7 +45,7 @@ DoEvalResult: TypeAlias = Dict[str, T]

  .. code-block:: python

    foo: DoEvalResult[float] = {
-       "gpt_coherence": 2.0
+       "coherence": 2.0
    }
  """

@@ -96,7 +97,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
          :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
          :type kwargs: Dict
          :return: The evaluation result
-         :rtype: Dict
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
          """
          return async_run_allowing_running_loop(self._async_evaluator, **kwargs)

@@ -110,7 +111,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
          :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
          :type eval_input: Any
          :return: A single evaluation result
-         :rtype: Dict
+         :rtype: DoEvalResult[T_EvalValue]
          """

      # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -172,16 +173,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
              response_context = response.get("context", None)
              if global_context:
                  context["global_context"] = global_context
-             if query_context and not include_query:
+             if query_context and include_query:
                  context["query_context"] = query_context
-             if response_context and not include_response:
+             if response_context and include_response:
                  context["response_context"] = response_context

              eval_input: DerivedEvalInput = {}
              if include_query:
-                 eval_input["query"] = query
+                 eval_input["query"] = query.get("content", "")
              if include_response:
-                 eval_input["response"] = response
+                 eval_input["response"] = response.get("content", "")
              if include_context:
                  eval_input["context"] = str(context)
              eval_inputs.append(eval_input)
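For reference, a sketch of the conversation payload this converter consumes and roughly what it now derives per turn: after this change only the "content" strings of the query and response turns are forwarded, plus per-turn or global context when the evaluator asks for it. All values below are illustrative.

conversation = {
    "context": "Optional global context for the whole conversation.",
    "messages": [
        {"role": "user", "content": "What is the capital of France?", "context": "Geography homework."},
        {"role": "assistant", "content": "The capital of France is Paris.", "context": "Retrieved from an atlas."},
    ],
}

# Approximate derived eval input for the user/assistant turn pair above:
derived_eval_input = {
    "query": "What is the capital of France?",
    "response": "The capital of France is Paris.",
    # "context": str({"global_context": ..., "query_context": ..., "response_context": ...})
    #   is included only for evaluators that declare a context input.
}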
@@ -219,9 +220,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
          singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
          # Check that both conversation and other inputs aren't set
          if conversation is not None and any(singletons.values()):
+             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
              raise EvaluationException(
-                 message="Invalid input",
-                 internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                 message=msg,
                  blame=ErrorBlame.USER_ERROR,
                  category=ErrorCategory.INVALID_VALUE,
                  target=ErrorTarget.CONVERSATION,

@@ -230,12 +231,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
          if conversation is not None:
              return self._derive_conversation_converter()(conversation)
          # Handle Singletons
-         if all(value is not None for value in singletons.values()):
-             return [singletons]  # TODO loosen requirements to allow for optional singletons?
+         required_singletons = remove_optional_singletons(self, singletons)
+         if all(value is not None for value in required_singletons.values()):
+             return [singletons]
          # Missing input
+         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
          raise EvaluationException(
-             message="Missing input",
-             internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+             message=msg,
              blame=ErrorBlame.USER_ERROR,
              category=ErrorCategory.INVALID_VALUE,
              target=ErrorTarget.CONVERSATION,

@@ -254,7 +256,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
              values (including non-numerics) located in under the "evaluation_per_turn" key,
              which each sub-key being a metric and each sub-value being a the list of that metric's
              per-turn values.
-         :rtype: Dict
+         :rtype: AggregateResult[T_EvalValue]
          """

          aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}

@@ -274,7 +276,6 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
              aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
          # Slap the per-turn results back in.
          aggregated["evaluation_per_turn"] = evaluation_per_turn
-
          return aggregated

      async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:

@@ -283,7 +284,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
          :keyword kwargs: The inputs to evaluate.
          :type kwargs: Dict
          :return: The evaluation result.
-         :rtype: Dict
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
          """
          # Convert inputs into list of evaluable inputs.
          eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)

@@ -315,7 +316,7 @@ class AsyncEvaluatorBase:

      # Don't look at my shame. Nothing to see here....
      # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-     # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+     # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
      # are just not passed into this function instead of ending up in kwargs.
      # Since we want this to be relatively call-agnostic, we just account for every input that any children
      # are known to throw at this, mash them into kwargs, and then pass them into the real call.
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -4,12 +4,13 @@

  import math
  import re
- from typing import Dict
+ from typing import Dict, Union

  from promptflow.core import AsyncPrompty
  from typing_extensions import override

- from ..._common.utils import construct_prompty_model_config, validate_model_config
+ from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
  from . import EvaluatorBase

  try:

@@ -36,8 +37,8 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
      :type ignore_queries: bool
      """

-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+     _LLM_CALL_TIMEOUT = 600
+     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

      def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
          self._result_key = result_key

@@ -46,7 +47,7 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):

          prompty_model_config = construct_prompty_model_config(
              validate_model_config(model_config),
-             self.DEFAULT_OPEN_API_VERSION,
+             self._DEFAULT_OPEN_API_VERSION,
              USER_AGENT,
          )

@@ -56,7 +57,7 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
      # defining a default here.

      @override
-     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
          """Do a relevance evaluation.

          :param eval_input: The input to the evaluator. Expected to contain

@@ -66,11 +67,20 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
          :return: The evaluation result.
          :rtype: Dict
          """
-         llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
+         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

          score = math.nan
          if llm_output:
+             # Parse out score and reason from evaluators known to possess them.
+             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                 score, reason = parse_quality_evaluator_reason_score(llm_output)
+                 return {
+                     self._result_key: float(score),
+                     f"gpt_{self._result_key}": float(score),
+                     f"{self._result_key}_reason": reason,
+                 }
              match = re.search(r"\d", llm_output)
              if match:
                  score = float(match.group())
-         return {self._result_key: float(score)}
+             return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+         return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -5,7 +5,12 @@ from typing import Dict, Optional, Union

  from typing_extensions import override

- from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
+ from azure.ai.evaluation._common.constants import (
+     EvaluationMetrics,
+     _InternalEvaluationMetrics,
+     Tasks,
+     _InternalAnnotationTasks,
+ )
  from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import EvaluationException

@@ -13,8 +18,10 @@ from azure.core.credentials import TokenCredential

  from . import EvaluatorBase

+ T = Union[str, float]

- class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
+
+ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
      """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
      This includes content safety evaluators, protected material evaluators, and others. These evaluators
      are all assumed to be of the "query and response or conversation" input variety.

@@ -48,7 +55,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
          *,
          query: Optional[str] = None,
          response: Optional[str] = None,
-         conversation: Optional[dict] = None,
+         conversation=None,
          **kwargs,
      ):
          """Evaluate either a query and response or a conversation. Must supply either a query AND response,

@@ -61,14 +68,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
          :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
              key "messages", and potentially a global context under the key "context". Conversation turns are expected
              to be dictionaries with keys "content", "role", and possibly "context".
-         :paramtype conversation: Optional[Dict]
-         :return: The evaluation result.
-         :rtype: Dict[str, Union[str, float]]
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
          """
          return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

      @override
-     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
          """Perform the evaluation using the Azure AI RAI service.
          The exact evaluation performed is determined by the evaluation metric supplied
          by the child class initializer.

@@ -88,10 +94,43 @@ class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
                      + " This should have failed earlier."
                  ),
              )
+         input_data = {"query": query, "response": response}
+
+         if "context" in self._singleton_inputs:
+             context = eval_input.get("context", None)
+             if context is None:
+                 raise EvaluationException(
+                     message="Not implemented",
+                     internal_message=(
+                         "Attempted context-based evaluation without supplying context."
+                         + " This should have failed earlier."
+                     ),
+                 )
+             input_data["context"] = context
+
          return await evaluate_with_rai_service(
              metric_name=self._eval_metric,
-             query=query,
-             response=response,
+             data=input_data,
              project_scope=self._azure_ai_project,
              credential=self._credential,
+             annotation_task=self._get_task(),
          )
+
+     def _get_task(self):
+         """Get the annotation task for the current evaluation metric.
+         The annotation task is used by the RAI service script to determine the message format
+         of the API call, and how the output is processed, among other things.
+
+         :return: The annotation task for the evaluator's self._eval_metric value.
+         :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+         """
+         if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+             return Tasks.GROUNDEDNESS
+         if self._eval_metric == EvaluationMetrics.XPIA:
+             return Tasks.XPIA
+         if self._eval_metric == _InternalEvaluationMetrics.ECI:
+             return _InternalAnnotationTasks.ECI
+         if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+             return Tasks.PROTECTED_MATERIAL
+         return Tasks.CONTENT_HARM
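Taken together, the RAI service call now bundles its inputs into a single data dictionary and passes an annotation task next to the metric. A hedged sketch of the payload that _do_eval assembles; the call itself is internal, so it is shown as a comment, and the project and credential values are placeholders.

from azure.ai.evaluation._common.constants import EvaluationMetrics, Tasks

# What _do_eval now builds for a context-aware metric such as groundedness:
input_data = {
    "query": "What does the report say about Q3 revenue?",
    "response": "The report says Q3 revenue grew 8% year over year.",
    "context": "Q3 revenue increased 8% compared to the prior year.",
}

# Roughly the resulting internal call (not a public API):
# await evaluate_with_rai_service(
#     metric_name=EvaluationMetrics.GROUNDEDNESS,
#     data=input_data,
#     project_scope=azure_ai_project,  # {"subscription_id": ..., "resource_group_name": ..., "project_name": ...}
#     credential=credential,           # e.g. azure.identity.DefaultAzureCredential()
#     annotation_task=Tasks.GROUNDEDNESS,
# )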