azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
 @experimental
@@ -49,16 +50,40 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
         }
     """
 
+    id = "eci"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
-        eval_last_turn: bool = False,
     ):
         super().__init__(
             eval_metric=_InternalEvaluationMetrics.ECI,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            eval_last_turn=eval_last_turn,
         )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ): ...
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ): ...
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        return super().__call__(*args, **kwargs)
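
The hunks above drop the `eval_last_turn` flag and give ECIEvaluator typed `__call__` overloads, so it now accepts either a query/response pair or a whole conversation. A minimal sketch of the two call shapes under the new signature; credential, project values, and strings are placeholders, and the import path is the private module from the file list above (this experimental evaluator may not be exported from the package root).

    # Sketch only: placeholder project values; ECIEvaluator is experimental.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    eci_eval = ECIEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)

    # Overload 1: a single query/response pair.
    single_turn = eci_eval(query="What is the capital of France?", response="Paris.")

    # Overload 2: a full conversation; per-turn results are aggregated by the base class.
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
    aggregated = eci_eval(conversation=conversation)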

azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -106,27 +106,34 @@ class _AsyncF1ScoreEvaluator:
 
 class F1ScoreEvaluator:
     """
-    Initialize a f1 score evaluator for calculating F1 score.
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
 
-    **Usage**
+    F1 Scores range from 0 to 1, with 1 being the best possible score.
 
-    .. code-block:: python
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.
 
-        eval_fn = F1ScoreEvaluator()
-        result = eval_fn(
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture \
-                and technological advancements.")
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.
 
-    **Output format**
 
-    .. code-block:: python
+    .. admonition:: Example:
 
-        {
-            "f1_score": 0.42
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncF1ScoreEvaluator()
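
The rewritten docstring spells out the metric: precision is shared words over words in the generation, recall is shared words over words in the ground truth, and F1 is their harmonic mean. A rough sketch of that arithmetic, assuming simple whitespace tokenization (the SDK's own tokenizer and normalization may differ), followed by the evaluator call kept from the removed usage example:

    # Rough sketch of the word-overlap F1 described above; not the SDK's exact tokenization.
    from collections import Counter

    from azure.ai.evaluation import F1ScoreEvaluator


    def f1_sketch(response: str, ground_truth: str) -> float:
        response_tokens = response.lower().split()
        truth_tokens = ground_truth.lower().split()
        shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
        if shared == 0:
            return 0.0
        precision = shared / len(response_tokens)  # shared words / words in the generation
        recall = shared / len(truth_tokens)        # shared words / words in the ground truth
        return 2 * precision * recall / (precision + recall)


    # The packaged evaluator, as in the removed docstring example:
    f1_eval = F1ScoreEvaluator()
    result = f1_eval(
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
    )
    print(result)  # e.g. {"f1_score": 0.42}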
 

azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -3,57 +3,89 @@
 # ---------------------------------------------------------
 
 import os
-from typing import Optional
+from typing import Dict, List, Union
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class FluencyEvaluator(PromptyEvaluatorBase):
+class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-    Initialize a fluency evaluator configured for a specific Azure OpenAI model.
+    Evaluates the fluency of a given response or a multi-turn conversation, including reasoning.
+
+    The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic
+    structures, and appropriate vocabulary usage, resulting in linguistically correct responses.
+
+    Fluency scores range from 1 to 5, with 1 being the least fluent and 5 being the most fluent.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    **Usage**
-
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = FluencyEvaluator(model_config)
-        result = eval_fn(response="The capital of Japan is Tokyo.")
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a FluencyEvaluator.
 
-    **Output format**
+    .. note::
 
-    .. code-block:: python
-
-        {
-            "fluency": 4.0,
-            "gpt_fluency": 4.0,
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE = "fluency.prompty"
     _RESULT_KEY = "fluency"
 
+    id = "azureml://registries/azureml/models/Fluency-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-    @override
+    @overload
     def __call__(
         self,
         *,
-        response: Optional[str] = None,
-        conversation=None,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate fluency in given response
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The fluency score
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate fluency for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The fluency score
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """
@@ -62,12 +94,11 @@ class FluencyEvaluator(PromptyEvaluatorBase):
         the evaluator will aggregate the results of each turn.
 
         :keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter.
-        :paramtype response: str
+        :paramtype response: Optional[str]
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The fluency score.
        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
        """
-
-        return super().__call__(response=response, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
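
With the overloads above, FluencyEvaluator is called either with a single `response` or with a `conversation`, and the catch-all `__call__` simply forwards to the base class. A hedged usage sketch; the model configuration values are placeholders and the output keys follow the note about the `gpt_`-prefixed alias.

    # Sketch only: endpoint, deployment, and key are placeholders.
    from azure.ai.evaluation import FluencyEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "azure_deployment": "<deployment_name>",
        "api_key": "<api_key>",
    }
    fluency = FluencyEvaluator(model_config)

    # Overload 1: score a single response, e.g. {"fluency": 4.0, "gpt_fluency": 4.0, ...}.
    single = fluency(response="The capital of Japan is Tokyo.")

    # Overload 2: score a multi-turn conversation; the base class aggregates per-turn results.
    conversation = {
        "messages": [
            {"role": "user", "content": "Tell me about Tokyo."},
            {"role": "assistant", "content": "Tokyo is the capital of Japan."},
        ]
    }
    aggregated = fluency(conversation=conversation)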

azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -24,31 +24,29 @@ class _AsyncGleuScoreEvaluator:
 
 class GleuScoreEvaluator:
     """
-    Evaluator that computes the BLEU Score between two strings.
+    Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
     The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
     evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
     sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
     use cases such as machine translation, text summarization, and text generation.
 
-    **Usage**
+    GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
+    the ground truth and a value of 0 indicates no overlap.
 
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = GleuScoreEvaluator()
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gleu_score": 0.41
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GleuScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncGleuScoreEvaluator()
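
The GLEU change is documentation plus the new cloud `id`; the call signature is unchanged, so the removed usage example still applies:

    from azure.ai.evaluation import GleuScoreEvaluator

    gleu = GleuScoreEvaluator()
    result = gleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"gleu_score": 0.41}, per the removed output-format example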
 

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -2,12 +2,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import Optional
+from typing import Dict, List, Optional, Union
 
-from typing_extensions import override
+from typing_extensions import overload, override
 from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 try:
@@ -16,36 +17,37 @@ except ImportError:
     USER_AGENT = "None"
 
 
-class GroundednessEvaluator(PromptyEvaluatorBase):
+class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-    Initialize a groundedness evaluator configured for a specific Azure OpenAI model.
+    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    including reasoning.
+
+    The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
+    context, making sure that these claims are substantiated by the context. Even if the responses from LLM are
+    factually correct, they'll be considered ungrounded if they can't be verified against the provided sources
+    (such as your input source or your database). Use the groundedness metric when you need to verify that
+    AI-generated responses align with and are validated by the provided context.
+
+    Groundedness scores range from 1 to 5, with 1 being the least grounded and 5 being the most grounded.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    **Usage**
-
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = GroundednessEvaluator(model_config)
-        result = eval_fn(
-            response="The capital of Japan is Tokyo.",
-            context="Tokyo is Japan's capital, known for its blend of traditional culture \
-                and technological advancements.")
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GroundednessEvaluator.
 
-    **Output format**
+    .. note::
 
-    .. code-block:: python
-
-        {
-            "groundedness": 5,
-            "gpt_groundedness": 5,
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty"
@@ -53,6 +55,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
 
+    id = "azureml://registries/azureml/models/Groundedness-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
@@ -62,14 +67,47 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
         self._model_config = model_config
         # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
 
-    @override
+    @overload
     def __call__(
         self,
         *,
+        response: str,
+        context: str,
         query: Optional[str] = None,
-        response: Optional[str] = None,
-        context: Optional[str] = None,
-        conversation=None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of response, context
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword query: The query to be evaluated. Optional parameter for use with the `response`
+            and `context` parameters. If provided, a different prompt template will be used for evaluation.
+        :paramtype query: Optional[str]
+        :return: The groundedness score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate groundedness for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation,
@@ -89,10 +127,10 @@
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The relevance score.
-        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
 
-        if query:
+        if kwargs.get("query", None):
             current_dir = os.path.dirname(__file__)
             prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
             self._prompty_file = prompty_path
@@ -103,4 +141,4 @@
             )
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
-        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
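
The groundedness overloads mirror the fluency ones, and the catch-all `__call__` now reads `query` from kwargs to decide whether to reload the flow with the "with query" prompty. A hedged sketch of both call shapes; configuration values are placeholders.

    # Sketch only: model configuration values are placeholders.
    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "azure_deployment": "<deployment_name>",
        "api_key": "<api_key>",
    }
    groundedness = GroundednessEvaluator(model_config)

    context = "Tokyo is Japan's capital, known for its blend of traditional culture and technology."

    # Response + context: evaluated with the "groundedness_without_query" prompty.
    without_query = groundedness(
        response="The capital of Japan is Tokyo.",
        context=context,
    )

    # Passing a query triggers the branch above and swaps in the "with query" prompty.
    with_query = groundedness(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context=context,
    )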

azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -34,7 +34,7 @@ class _AsyncMeteorScoreEvaluator:
 
 class MeteorScoreEvaluator:
     """
-    Evaluator that computes the METEOR Score between two strings.
+    Calculates the METEOR score for a given response and ground truth.
 
     The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
     comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -42,6 +42,12 @@ class MeteorScoreEvaluator:
     word stems to more accurately capture meaning and language variations. In addition to machine translation and
     text summarization, paraphrase detection is an optimal use case for the METEOR score.
 
+    Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
+    n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
+    tasks like machine translation, text summarization, and text generation.
+
+    The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
+
     :param alpha: The METEOR score alpha parameter. Default is 0.9.
     :type alpha: float
     :param beta: The METEOR score beta parameter. Default is 3.0.
@@ -49,28 +55,19 @@
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
 
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = MeteorScoreEvaluator(
-            alpha=0.9,
-            beta=3.0,
-            gamma=0.5
-        )
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "meteor_score": 0.62
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
     """
 
+    id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
         self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
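
METEOR likewise keeps its call signature; only the docstring and the cloud `id` change, so the removed usage example, with the documented default parameters, still holds:

    from azure.ai.evaluation import MeteorScoreEvaluator

    # Defaults shown explicitly, matching the documented alpha/beta/gamma values.
    meteor = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
    result = meteor(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"meteor_score": 0.62}, per the removed output-format example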
 

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py

@@ -28,12 +28,10 @@ class ContentSafetyMultimodalEvaluator:
     :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
         resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param parallel: Specifies whether to use parallel execution for evaluators.
-        If True, evaluators execute in parallel; otherwise, they execute sequentially. Defaults to True.
-    :type parallel: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
 
     :return: A function that evaluates multimodal chat messages and generates content safety metrics.
-    :rtype: Callable
 
     **Usage Example**
 
@@ -45,7 +43,7 @@
            "project_name": "<project_name>",
        }
        eval_fn = ContentSafetyMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -92,8 +90,11 @@
 
     """
 
-    def __init__(self, credential, azure_ai_project, parallel: bool = False):
-        self._parallel = parallel
+    id = "content_safety_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self, credential, azure_ai_project, **kwargs):
+        self._parallel = kwargs.pop("_parallel", False)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
             SexualMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
@@ -104,8 +105,9 @@
     def __call__(self, *, conversation, **kwargs):
         """
         Evaluates content-safety metrics for list of messages.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The evaluation score based on the Content Safety Metrics.
         :rtype: Dict[str, Union[float, str]]
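
The constructor now hides parallelism behind `**kwargs` (`_parallel` defaults to False) and the docstring example passes `conversation=` explicitly. A hedged sketch of a single-turn multimodal call; the top-level import, project values, and image URL are placeholders or assumptions, and the message/content shape is inferred from the truncated usage example above rather than confirmed by it.

    # Sketch only: placeholder project values and image URL; message shape is an assumption.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyMultimodalEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    safety_eval = ContentSafetyMultimodalEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    result = safety_eval(
        conversation={
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What is shown in this image?"},
                        {"type": "image_url", "image_url": {"url": "<image_url_or_data_uri>"}},
                    ],
                },
                {"role": "assistant", "content": "The image shows a city skyline."},
            ]
        }
    )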

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py

@@ -7,10 +7,8 @@ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_mu
 from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential
-from azure.ai.evaluation._common._experimental import experimental
 
 
-@experimental
 class ContentSafetyMultimodalEvaluatorBase(ABC):
     """
     Initialize a evaluator for a specified Evaluation Metric. Base class that is not

azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py

@@ -28,7 +28,7 @@ class HateUnfairnessMultimodalEvaluator:
            "project_name": "<project_name>",
        }
        eval_fn = HateUnfairnessMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -66,14 +66,18 @@
 
 
     """
+    id = "hate_unfairness_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, credential, azure_ai_project):
         self._async_evaluator = _AsyncHateUnfairnessMultimodalEvaluator(credential, azure_ai_project)
 
     def __call__(self, *, conversation, **kwargs):
         """
         Evaluates hate unfairness content.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The hate unfairness score.
         :rtype: Dict[str, Union[float, str]]

azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py

@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 @experimental
@@ -22,7 +22,6 @@ class ProtectedMaterialMultimodalEvaluator:
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
 
     :return: A dictionary containing the evaluation result label and reasoning.
-    :rtype: Dict[str, str]
 
     **Usage Example**
 
@@ -34,7 +33,7 @@
            "project_name": "<project_name>",
        }
        eval_fn = ProtectedMaterialMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -71,6 +70,9 @@
 
     """
 
+    id = "protected_material_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(
         self,
         credential,
@@ -82,8 +84,9 @@
         """
         Evaluates protected materials content.
 
-        :keyword messages: The messages to be evaluated. Each message should have "role" and "content" keys.
-        :paramtype messages: ~azure.ai.evaluation.Conversation
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys. It supports single turn only.
+        :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: A dictionary containing a boolean label and reasoning.
         :rtype: Dict[str, str]
         """
@@ -101,8 +104,9 @@ class _AsyncProtectedMaterialMultimodalEvaluator:
     async def __call__(self, *, conversation, **kwargs):
         """
         Evaluates content according to this evaluator's metric.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The evaluation score computation based on the Content Safety metric (self.metric).
         :rtype: Any
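
The docstring fix above matters for callers: the keyword is `conversation` (single turn only), not `messages` as the old text claimed. A short sketch with the corrected keyword; the top-level import, placeholders, and message shape are assumptions, as in the previous multimodal example.

    # Sketch only: placeholders throughout; the point is the `conversation=` keyword.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialMultimodalEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    pm_eval = ProtectedMaterialMultimodalEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    result = pm_eval(
        conversation={
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Transcribe the text in this image."},
                        {"type": "image_url", "image_url": {"url": "<image_url_or_data_uri>"}},
                    ],
                },
                {"role": "assistant", "content": "I can't reproduce that material."},
            ]
        }
    )
    # `result` carries a boolean label and a reason string, per the docstring above.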