azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. azure/ai/evaluation/__init__.py +1 -15
  2. azure/ai/evaluation/_azure/_clients.py +24 -8
  3. azure/ai/evaluation/_azure/_models.py +2 -2
  4. azure/ai/evaluation/_common/utils.py +8 -8
  5. azure/ai/evaluation/_constants.py +21 -0
  6. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  7. azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  8. azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
  9. azure/ai/evaluation/_evaluate/_utils.py +27 -0
  10. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  11. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  13. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  14. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  15. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  16. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  17. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  18. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  21. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  22. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  23. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  24. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  25. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  26. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  27. azure/ai/evaluation/_exceptions.py +0 -1
  28. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  29. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
  30. azure/ai/evaluation/_version.py +2 -1
  31. azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
  32. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  33. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  34. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  35. azure/ai/evaluation/simulator/_simulator.py +21 -13
  36. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
  37. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  46. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
  47. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
  48. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -1,28 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }

-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
     Calculates the GLEU (Google-BLEU) score between a response and the ground truth.

@@ -47,10 +35,32 @@ class GleuScoreEvaluator:
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self):
-        self._async_evaluator = _AsyncGleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+
+        return {
+            "gleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.

@@ -61,9 +71,21 @@ class GleuScoreEvaluator:
         :return: The GLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the GLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
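The net effect of this refactor is that the GLEU scoring logic moves from the private _AsyncGleuScoreEvaluator wrapper into EvaluatorBase._do_eval, while the public keyword-only call signature stays the same. A minimal usage sketch with illustrative sample strings (the "gleu_score" key is taken from the diff above):

    from azure.ai.evaluation import GleuScoreEvaluator

    # No model configuration is required; GLEU is a purely lexical metric.
    gleu = GleuScoreEvaluator()
    result = gleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result["gleu_score"])  # float score keyed as in the diff above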
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -1,38 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
+
 from nltk.translate.meteor_score import meteor_score
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase


-class _AsyncMeteorScoreEvaluator:
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
-
-        ensure_nltk_data_downloaded()
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
-
-        return {
-            "meteor_score": score,
-        }
-
-
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
     Calculates the METEOR score for a given response and ground truth.

@@ -68,10 +46,41 @@ class MeteorScoreEvaluator:
     id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        super().__init__()

-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
+        )
+
+        return {
+            "meteor_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.

@@ -82,9 +91,21 @@ class MeteorScoreEvaluator:
         :return: The METEOR score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
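The METEOR evaluator gets the same treatment: the alpha/beta/gamma weights stay on the constructor and the scoring moves into _do_eval. A short usage sketch with illustrative inputs; the explicit weights shown are simply the defaults from the diff:

    from azure.ai.evaluation import MeteorScoreEvaluator

    # The constructor also calls ensure_nltk_data_downloaded(), so the first
    # instantiation may fetch NLTK resources.
    meteor = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
    result = meteor(
        response="The cat sat on the mat.",
        ground_truth="A cat was sitting on the mat.",
    )
    print(result["meteor_score"])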
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Union

-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase

 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -15,7 +16,7 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator


-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.

@@ -46,9 +47,7 @@ class QAEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self, model_config, **kwargs):
-        self._parallel = kwargs.pop("_parallel", False)
-
-        self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
+        evaluators = [
             GroundednessEvaluator(model_config),
             RelevanceEvaluator(model_config),
             CoherenceEvaluator(model_config),
@@ -56,8 +55,31 @@ class QAEvaluator:
             SimilarityEvaluator(model_config),
             F1ScoreEvaluator(),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
+
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
+        """
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """

-    def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs):
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluates question-answering scenario.

@@ -72,22 +94,5 @@ class QAEvaluator:
         :return: The scores for QA scenario.
         :rtype: Dict[str, Union[str, float]]
         """
-        results: Dict[str, Union[str, float]] = {}
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+
+        return super().__call__(*args, **kwargs)
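QAEvaluator no longer manages its own thread pool or _parallel flag; fan-out over the six child evaluators is delegated to MultiEvaluatorBase, and the public call is unchanged. A hedged sketch, assuming an AzureOpenAIModelConfiguration-style dict; the endpoint, key, and deployment values are placeholders:

    from azure.ai.evaluation import QAEvaluator

    # Placeholder model configuration; substitute a real Azure OpenAI endpoint,
    # API key, and deployment name.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    qa = QAEvaluator(model_config)
    scores = qa(
        query="What is the capital of Japan?",
        response="Tokyo is the capital of Japan.",
        context="Japan's capital city is Tokyo.",
        ground_truth="Tokyo is the capital of Japan.",
    )
    # The result merges the groundedness, relevance, coherence, fluency,
    # similarity, and F1 outputs into a single dict.
    print(scores)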
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,9 +3,11 @@
 # ---------------------------------------------------------
 from enum import Enum

-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing import Dict
+from typing_extensions import overload, override

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+from azure.ai.evaluation._evaluators._common import EvaluatorBase


 class RougeType(Enum):
@@ -32,21 +34,7 @@ class RougeType(Enum):
     """Overlap of L-grams (L consecutive words) between generated and reference text."""


-class _AsyncRougeScoreEvaluator:
-    def __init__(self, rouge_type: RougeType):
-        self._rouge_type = rouge_type
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
-        return {
-            "rouge_precision": metrics.precision,
-            "rouge_recall": metrics.recall,
-            "rouge_f1_score": metrics.fmeasure,
-        }
-
-
-class RougeScoreEvaluator:
+class RougeScoreEvaluator(EvaluatorBase):
     """
     Calculates the ROUGE score for a given response and ground truth.

@@ -76,10 +64,32 @@ class RougeScoreEvaluator:
     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, rouge_type: RougeType):
-        self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)
+        self._rouge_type = rouge_type
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a rouge score evaluation result.

-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+        return {
+            "rouge_precision": metrics.precision,
+            "rouge_recall": metrics.recall,
+            "rouge_f1_score": metrics.fmeasure,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the ROUGE score between the response and the ground truth.

@@ -90,9 +100,20 @@ class RougeScoreEvaluator:
         :return: The ROUGE score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate route score.
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The ROUGE score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
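The ROUGE evaluator keeps its RougeType constructor argument; only the internal async wrapper is replaced by _do_eval. A usage sketch with illustrative strings, assuming RougeType exposes a ROUGE_L member (consistent with the "L-grams" docstring in the hunk above):

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)
    result = rouge(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Keys as returned in the diff above.
    print(result["rouge_precision"], result["rouge_recall"], result["rouge_f1_score"])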
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -2,83 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-import math
 import os
-import re
+from typing import Dict

-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import overload, override

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-from ..._common.utils import construct_prompty_model_config, validate_model_config

-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"
-
-
-class _AsyncSimilarityEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    _PROMPTY_FILE = "similarity.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate similarity.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The similarity score.
-        :rtype: Dict[str, float]
-        """
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        ground_truth = str(ground_truth or "")
-
-        if not (query.strip() and response.strip() and ground_truth.strip()):
-            msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = math.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"similarity": float(score), "gpt_similarity": float(score)}
-
-
-class SimilarityEvaluator:
+class SimilarityEvaluator(PromptyEvaluatorBase):
     """
     Evaluates similarity score for a given query, response, and ground truth.

@@ -113,13 +45,27 @@ class SimilarityEvaluator:
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-    id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "similarity.prompty"
+    _RESULT_KEY = "similarity"
+
+    id = "similarity"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, model_config):
-        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
-
-    def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    # Ignoring a mypy error about having only 1 overload function.
+    # We want to use the overload style for all evals, even single-inputs. This is both to make
+    # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+    # and due to the fact that non-overloaded syntax now causes various parsing issues that
+    # we don't want to deal with.
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate similarity.

@@ -132,9 +78,23 @@ class SimilarityEvaluator:
         :return: The similarity score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
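SimilarityEvaluator becomes a thin PromptyEvaluatorBase subclass: the prompty loading and score handling that previously lived in the removed _AsyncSimilarityEvaluator are now inherited from the base class, and the public call shape is unchanged. A sketch with a placeholder model configuration (values are not real):

    from azure.ai.evaluation import SimilarityEvaluator

    # Placeholder model configuration; substitute a real Azure OpenAI endpoint,
    # API key, and deployment name.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    similarity = SimilarityEvaluator(model_config)
    result = similarity(
        query="What is the capital of Japan?",
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # "similarity" is the primary result key; the class docstring also mentions a
    # legacy gpt_-prefixed key that is slated for deprecation.
    print(result["similarity"])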
azure/ai/evaluation/_exceptions.py

@@ -63,7 +63,6 @@ class ErrorTarget(Enum):
     RAI_CLIENT = "RAIClient"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
-    CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
azure/ai/evaluation/_safety_evaluation/__init__.py (new file)

@@ -0,0 +1,3 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------