azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (78)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +201 -16
  6. azure/ai/evaluation/_constants.py +12 -0
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
  15. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  16. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  37. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  38. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
  42. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
  44. azure/ai/evaluation/_exceptions.py +9 -7
  45. azure/ai/evaluation/_http_utils.py +203 -132
  46. azure/ai/evaluation/_model_configurations.py +37 -9
  47. azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
  48. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  49. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  50. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  51. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  52. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  53. azure/ai/evaluation/_version.py +1 -1
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  55. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  56. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  57. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  58. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  59. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  60. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  61. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  62. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  63. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  64. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  65. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
  66. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  67. azure/ai/evaluation/simulator/_simulator.py +127 -117
  68. azure/ai/evaluation/simulator/_tracing.py +4 -4
  69. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
  70. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  71. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
  72. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  73. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  74. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  75. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  76. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  77. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  78. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py
@@ -1,37 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override

 from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

-try:
-    from ._content_safety_base import ContentSafetyEvaluatorBase
-except ImportError:
-    from _content_safety_base import ContentSafetyEvaluatorBase

-
-class _AsyncSexualEvaluator(ContentSafetyEvaluatorBase):
-    def __init__(self, azure_ai_project: dict, credential=None):
-        super().__init__(
-            metric=EvaluationMetrics.SEXUAL,
-            azure_ai_project=azure_ai_project,
-            credential=credential,
-        )
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        return await super().__call__(query=query, response=response, **kwargs)
-
-
-class SexualEvaluator:
+class SexualEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a sexual evaluator for sexual score.

+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential

     **Usage**

@@ -56,21 +40,16 @@ class SexualEvaluator:
     }
     """

-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncSexualEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates sexual content.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The sexual score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.SEXUAL,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
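In practice, this refactor changes the public constructor: the credential is now the first, required argument, and an optional eval_last_turn flag is exposed. Below is a minimal usage sketch, assuming the top-level import path and the documented AzureAIProject keys (subscription_id, resource_group_name, project_name); the same pattern applies to ViolenceEvaluator and ECIEvaluator further down.

    # Minimal sketch of the new constructor/call pattern; the import paths and
    # project-scope keys are assumptions based on the package docs, and the
    # exact result keys returned by the RAI service are not shown in this diff.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import SexualEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # credential is now positional and required; eval_last_turn defaults to False.
    sexual_eval = SexualEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = sexual_eval(query="What is the capital of France?", response="Paris is the capital of France.")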
azure/ai/evaluation/_evaluators/_content_safety/_violence.py
@@ -1,37 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override

 from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

-try:
-    from ._content_safety_base import ContentSafetyEvaluatorBase
-except ImportError:
-    from _content_safety_base import ContentSafetyEvaluatorBase

-
-class _AsyncViolenceEvaluator(ContentSafetyEvaluatorBase):
-    def __init__(self, azure_ai_project: dict, credential=None):
-        super().__init__(
-            metric=EvaluationMetrics.VIOLENCE,
-            azure_ai_project=azure_ai_project,
-            credential=credential,
-        )
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        return await super().__call__(query=query, response=response, **kwargs)
-
-
-class ViolenceEvaluator:
+class ViolenceEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a violence evaluator for violence score.

+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential

     **Usage**

@@ -56,21 +40,16 @@ class ViolenceEvaluator:
     }
     """

-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._async_evaluator = _AsyncViolenceEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates violence content.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The violence score.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=EvaluationMetrics.VIOLENCE,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
azure/ai/evaluation/_evaluators/_eci/_eci.py
@@ -1,45 +1,13 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import override

 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


-class _AsyncECIEvaluator:
-    def __init__(self, azure_ai_project: dict, credential=None):
-        self._azure_ai_project = azure_ai_project
-        self._credential = credential
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate inputs
-        # Raises value error if failed, so execution alone signifies success.
-        if not (query and query.strip() and query != "None") or not (
-            response and response.strip() and response != "None"
-        ):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.ECI_EVALUATOR,
-            )
-
-        # Run score computation based on supplied metric.
-        result = await evaluate_with_rai_service(
-            metric_name=_InternalEvaluationMetrics.ECI,
-            query=query,
-            response=response,
-            project_scope=self._azure_ai_project,
-            credential=self._credential,
-        )
-        return result
-
-
-class ECIEvaluator:
+class ECIEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize an ECI evaluator to evaluate ECI based on the following guidelines:
     Detects whether ECI is present without a disclaimer in the AI system’s response.
@@ -49,11 +17,11 @@ class ECIEvaluator:
     "AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
     Outputs True or False with AI-generated reasoning.

+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]

@@ -79,21 +47,16 @@ class ECIEvaluator:
     }
     """

-    def __init__(self, azure_ai_project: dict, credential=None) -> None:
-        self._async_evaluator = _AsyncECIEvaluator(azure_ai_project, credential)
-
-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluates ECI content.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The ECI result.
-        :rtype: dict
-        """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+    ):
+        super().__init__(
+            eval_metric=_InternalEvaluationMetrics.ECI,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            eval_last_turn=eval_last_turn,
+        )
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py
@@ -15,6 +15,16 @@ class _AsyncF1ScoreEvaluator:
         pass

     async def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        """
+        Evaluate F1 score.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
         # Validate inputs
         if not (response and response.strip() and response != "None") or not (
             ground_truth and ground_truth.strip() and ground_truth != "None"
@@ -34,7 +44,7 @@ class _AsyncF1ScoreEvaluator:
         return {"f1_score": f1_result}

     @classmethod
-    def _compute_f1_score(cls, response: str, ground_truth: str) -> str:
+    def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
        import re
        import string

@@ -76,11 +86,9 @@ class _AsyncF1ScoreEvaluator:

            return white_space_fix(remove_articles(remove_punctuation(lower(text))))

-        prediction_tokens = normalize_text(response)
-        reference_tokens = normalize_text(ground_truth)
        tokenizer = QASplitTokenizer()
-        prediction_tokens = tokenizer(prediction_tokens)
-        reference_tokens = tokenizer(reference_tokens)
+        prediction_tokens = tokenizer(normalize_text(response))
+        reference_tokens = tokenizer(normalize_text(ground_truth))

        common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
        num_common_tokens = sum(common_tokens.values())
@@ -131,7 +139,7 @@ class F1ScoreEvaluator:
        :keyword ground_truth: The ground truth to be evaluated.
        :paramtype ground_truth: str
        :return: The F1 score.
-       :rtype: dict
+       :rtype: Dict[str, float]
        """

        return async_run_allowing_running_loop(
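The hunks above narrow _compute_f1_score's return annotation to float and fold the normalization and tokenization steps into two lines; the metric itself remains plain token-overlap F1. A self-contained sketch of that computation follows (whitespace splitting stands in for QASplitTokenizer, and the text-normalization step is omitted):

    # Illustrative token-overlap F1, mirroring the logic in _compute_f1_score;
    # normalization (lowercasing, article/punctuation removal) is skipped here.
    from collections import Counter

    def token_f1(response: str, ground_truth: str) -> float:
        prediction_tokens = response.split()
        reference_tokens = ground_truth.split()
        common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
        num_common = sum(common_tokens.values())
        if num_common == 0:
            return 0.0
        precision = num_common / len(prediction_tokens)
        recall = num_common / len(reference_tokens)
        return 2 * precision * recall / (precision + recall)

    # token_f1("Paris is the capital", "The capital is Paris") -> 0.75 (case-sensitive here)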
azure/ai/evaluation/_evaluators/_fluency/_fluency.py
@@ -3,75 +3,14 @@
 # ---------------------------------------------------------

 import os
-import re
+from typing import Optional

-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import override

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncFluencyEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "fluency.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-
-        if not (query.strip() and response.strip()):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_fluency": float(score)}
-
-
-class FluencyEvaluator:
+class FluencyEvaluator(PromptyEvaluatorBase):
     """
     Initialize a fluency evaluator configured for a specific Azure OpenAI model.

@@ -97,21 +36,38 @@ class FluencyEvaluator:
     }
     """

-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncFluencyEvaluator(model_config)
+    PROMPTY_FILE = "fluency.prompty"
+    RESULT_KEY = "gpt_fluency"

-    def __call__(self, *, query: str, response: str, **kwargs):
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
         """
-        Evaluate fluency.
+        Evaluate fluency. Accepts either a query and response for a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.

         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
         :return: The fluency score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
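With this change FluencyEvaluator can be called either with a query/response pair or with a whole conversation. A usage sketch follows; the model_config keys are assumed to match the package's Azure OpenAI model configuration, and the conversation shape follows the docstring above.

    # Sketch of the two calling modes; model_config keys are assumed to match
    # the package's Azure OpenAI configuration (endpoint, key, deployment).
    from azure.ai.evaluation import FluencyEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    fluency = FluencyEvaluator(model_config)

    # Single turn: query/response pair.
    single_result = fluency(query="What is the capital of France?", response="Paris.")

    # Multi-turn: per-turn scores are aggregated by the evaluator.
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
    aggregated_result = fluency(conversation=conversation)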
azure/ai/evaluation/_evaluators/_fluency/fluency.prompty
@@ -3,11 +3,6 @@ name: Fluency
 description: Evaluates fluency score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
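Dropping the configuration block means the prompty no longer reads the Azure OpenAI connection from environment variables at load time; the connection now comes from the model_config passed to the evaluator (see the _fluency.py hunks above). If you previously set those variables, one option is to feed them into model_config yourself; a sketch, reusing the same variable names the deleted block referenced:

    # Sketch only: builds model_config from the env vars the old prompty read.
    import os

    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_DEPLOYMENT"],
    }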
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -1,77 +1,15 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncGroundednessEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "groundedness.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+from typing import Optional

-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
+from typing_extensions import override

-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, "groundedness.prompty")
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, response: str, context: str, **kwargs):
-        # Validate input parameters
-        response = str(response or "")
-        context = str(context or "")
-
-        if not response.strip() or not context.strip():
-            msg = "Both 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_groundedness": float(score)}
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


-class GroundednessEvaluator:
+class GroundednessEvaluator(PromptyEvaluatorBase):
     """
     Initialize a groundedness evaluator configured for a specific Azure OpenAI model.

@@ -98,21 +36,37 @@ class GroundednessEvaluator:
     }
     """

-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncGroundednessEvaluator(model_config)
+    PROMPTY_FILE = "groundedness.prompty"
+    RESULT_KEY = "gpt_groundedness"

-    def __call__(self, *, response: str, context: str, **kwargs):
-        """
-        Evaluate groundedness of the response in the context.
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate groundedless. Accepts either a response and context a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.

         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword context: The context in which the response is evaluated.
-        :paramtype context: str
-        :return: The groundedness score.
-        :rtype: dict
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
+        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, response=response, context=context, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(response=response, context=context, conversation=conversation, **kwargs)
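For the conversation mode described in the docstring above, context can be supplied globally and/or per turn. A hedged sketch of that input shape (the placeholder strings are illustrative only):

    # Sketch of the conversation shape from the docstring: a "messages" list of
    # role/content turns, with optional global and per-turn "context" entries.
    from azure.ai.evaluation import GroundednessEvaluator

    groundedness = GroundednessEvaluator(model_config)  # model_config as in the fluency example

    conversation = {
        "context": "Paris is the capital of France.",  # optional global context
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {
                "role": "assistant",
                "content": "Paris.",
                "context": "Paris is the capital of France.",  # optional per-turn context
            },
        ],
    }
    result = groundedness(conversation=conversation)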
azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty
@@ -3,11 +3,6 @@ name: Groundedness
 description: Evaluates groundedness score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1