azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (78)
  1. azure/ai/evaluation/__init__.py +9 -5
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +201 -16
  6. azure/ai/evaluation/_constants.py +12 -0
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +161 -89
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +33 -79
  15. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  16. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +331 -0
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +76 -0
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +97 -0
  20. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +18 -41
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +18 -39
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +18 -39
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +18 -39
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +18 -55
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +30 -74
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +34 -80
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +18 -65
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +35 -83
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  37. azure/ai/evaluation/_evaluators/{_chat → _retrieval}/__init__.py +2 -2
  38. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/_retrieval.py +25 -28
  39. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/retrieval.prompty +0 -5
  40. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +23 -17
  42. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  43. azure/ai/evaluation/_evaluators/_xpia/xpia.py +15 -90
  44. azure/ai/evaluation/_exceptions.py +9 -7
  45. azure/ai/evaluation/_http_utils.py +203 -132
  46. azure/ai/evaluation/_model_configurations.py +37 -9
  47. azure/ai/evaluation/{_evaluators/_chat/retrieval → _vendor}/__init__.py +0 -6
  48. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  49. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  50. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  51. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  52. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  53. azure/ai/evaluation/_version.py +1 -1
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  55. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  56. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  57. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  58. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  59. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  60. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  61. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  62. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  63. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  64. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  65. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -6
  66. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  67. azure/ai/evaluation/simulator/_simulator.py +127 -117
  68. azure/ai/evaluation/simulator/_tracing.py +4 -4
  69. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +129 -43
  70. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  71. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +106 -0
  72. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  73. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  74. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  75. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  76. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  77. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  78. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_utils.py
@@ -6,15 +6,22 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-from typing import Dict
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union

 import pandas as pd
-
-from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
+from promptflow.client import PFClient
+from promptflow.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject

 LOGGER = logging.getLogger(__name__)

@@ -23,14 +30,26 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )

-AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
+
+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str


-def is_none(value):
+class EvaluateResult(TypedDict):
+    metrics: Dict[str, float]
+    studio_url: Optional[str]
+    rows: List[Dict]
+
+
+def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"


-def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -47,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
+    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)


 def load_jsonl(path):
@@ -55,7 +74,7 @@ def load_jsonl(path):
         return [json.loads(line) for line in f.readlines()]


-def _azure_pf_client_and_triad(trace_destination):
+def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
     from promptflow.azure._cli._utils import _get_azure_pf_client

     ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -69,14 +88,14 @@ def _azure_pf_client_and_triad(trace_destination):


 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-) -> str:
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Run,
+    evaluation_name: Optional[str],
+) -> Optional[str]:
     if trace_destination is None:
-        LOGGER.error("Unable to log traces as trace destination was not defined.")
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None

     azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -94,7 +113,6 @@ def _log_metrics_and_instance_results(
         ml_client=azure_pf_client.ml_client,
         promptflow_run=run,
     ) as ev_run:
-
         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN

         with tempfile.TemporaryDirectory() as tmpdir:
@@ -112,7 +130,8 @@ def _log_metrics_and_instance_results(
         if run is None:
             ev_run.write_properties_to_run_history(
                 properties={
-                    "_azureml.evaluation_run": "azure-ai-generative-parent",
+                    EvaluationRunProperties.RUN_TYPE: "eval_run",
+                    EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
                     "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                     "isEvaluatorRun": "true",
                 }
@@ -138,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url


-def _trace_destination_from_project_scope(project_scope: dict) -> str:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -151,9 +170,9 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str:
     return trace_destination


-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     p = Path(path)
-    if os.path.isdir(path):
+    if p.is_dir():
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -161,7 +180,7 @@ def _write_output(path, data_dict):


 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
@@ -211,7 +230,7 @@ def _apply_column_mapping(
     return result_df


-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
     return hasattr(evaluator, "__aggregate__")


@@ -234,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
         return default_value


-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
     import asyncio
     import platform

     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
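
For reference (not part of the diff), the workspace-triad parsing above is easiest to see with a concrete value. In the sketch below, the first line of AZURE_WORKSPACE_REGEX_FORMAT is an assumption (only its second line appears as context in this hunk) and the trace destination is a made-up example; the group numbering (1, 3, 5) matches the function body shown above.

import re
from typing import NamedTuple

# Mirrors the AzureMLWorkspace NamedTuple introduced in this diff: same fields,
# now with typed, named access instead of an untyped namedtuple.
class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str

# First string is an assumption (not shown in the hunk); second string is the context line above.
AZURE_WORKSPACE_REGEX_FORMAT = (
    "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
)

trace_destination = (  # made-up example value
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
    "/resourceGroups/my-rg/providers/Microsoft.MachineLearningServices/workspaces/my-project"
)

match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_destination)
assert match is not None and len(match.groups()) == 5
triad = AzureMLWorkspace(match.group(1), match.group(3), match.group(5))
print(triad.workspace_name)  # "my-project"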
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -1,77 +1,15 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncCoherenceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "coherence.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
+from typing import Optional

-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-
-        if not (query.strip() and response.strip()):
-            msg = "Both 'query' and 'response' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.INVALID_VALUE,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.COHERENCE_EVALUATOR,
-            )
+from typing_extensions import override

-        # Run the evaluation flow
-        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())

-        return {"gpt_coherence": float(score)}
-
-
-class CoherenceEvaluator:
+class CoherenceEvaluator(PromptyEvaluatorBase):
     """
     Initialize a coherence evaluator configured for a specific Azure OpenAI model.

@@ -97,21 +35,37 @@ class CoherenceEvaluator:
         }
     """

-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncCoherenceEvaluator(model_config)
+    PROMPTY_FILE = "coherence.prompty"
+    RESULT_KEY = "gpt_coherence"

-    def __call__(self, *, query: str, response: str, **kwargs):
-        """
-        Evaluate coherence.
+    @override
+    def __init__(self, model_config: dict):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation: Optional[dict] = None,
+        **kwargs,
+    ):
+        """Evaluate coherence. Accepts either a query and response for a single evaluation,
+        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+        turns, the evaluator will aggregate the results of each turn.

-        :keyword query: The query to be evaluated.
-        :paramtype query: str
         :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :return: The coherence score.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected
+            to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[Dict]
+        :return: The relevance score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
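
A hedged usage sketch (not part of the diff) of the refactored evaluator. The two call shapes and the aggregation behavior come from the hunks in this diff; the model_config keys follow the package's Azure OpenAI model configuration as an assumption, and the endpoint, key, and deployment values are placeholders.

from azure.ai.evaluation import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<resource-name>.openai.azure.com",  # placeholder
    "api_key": "<api-key>",                                        # placeholder
    "azure_deployment": "<deployment-name>",                       # placeholder
}

coherence = CoherenceEvaluator(model_config)

# Single-turn: pass query and response directly; the result key is RESULT_KEY ("gpt_coherence").
single = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)

# Multi-turn: pass a conversation dict instead. Per the base-class aggregation,
# per-turn numeric scores are averaged and the raw per-turn values are returned
# under "evaluation_per_turn".
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris."},
        {"role": "user", "content": "And of Italy?"},
        {"role": "assistant", "content": "Rome."},
    ]
}
aggregated = coherence(conversation=conversation)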
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty
@@ -3,11 +3,6 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
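
With the configuration block removed, the prompty no longer reads its Azure OpenAI settings from ${env:...} placeholders; they are supplied when the prompty is loaded. The sketch below mirrors the pattern visible in the removed _AsyncCoherenceEvaluator code earlier in this diff; whether PromptyEvaluatorBase builds the dict exactly this way is not shown in this section, and the endpoint, key, and deployment values are placeholders.

from promptflow.core import AsyncPrompty

model_config = {
    "azure_endpoint": "https://<resource-name>.openai.azure.com",  # placeholder
    "api_key": "<api-key>",                                        # placeholder
    "azure_deployment": "<deployment-name>",                       # placeholder
}

prompty_model_config = {
    "configuration": model_config,  # replaces the removed configuration: block
    # Workaround kept from the removed code: close connections to avoid
    # "RuntimeError: Event loop is closed" from httpx's AsyncClient.
    "parameters": {"extra_headers": {"Connection": "close"}},
}

flow = AsyncPrompty.load(source="coherence.prompty", model=prompty_model_config)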
azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -0,0 +1,13 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._base_eval import EvaluatorBase
+from ._base_prompty_eval import PromptyEvaluatorBase
+from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+__all__ = [
+    "EvaluatorBase",
+    "PromptyEvaluatorBase",
+    "RaiServiceEvaluatorBase",
+]
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -0,0 +1,331 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+.. code-block:: python
+
+    foo: AggregateResult[float] = {
+        "evaluation_per_turn": {
+            "gpt_coherence": [1.0, 2.0, 3.0]
+        },
+        "gpt_coherence": 2.0
+    }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+.. code-block:: python
+
+    foo: DoEvalResult[float] = {
+        "gpt_coherence": 2.0
+    }
+"""
+
+
+# TODO exception target pass down?
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
+    """Base class for all evaluators that are capable of accepting either a group of single values,
+    or conversation as input. All such evaluators need to implement two functions of their own:
+    - _convert_conversation_to_eval_input
+    - _do_eval
+
+    Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
+    documentation, although ideally the actual child implementation of __call__ should just amount to
+    'super().__init__()'.
+
+
+    :param not_singleton_inputs: A list of strings that represent the names of
+        inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
+        is ["conversation", "kwargs"].
+    :type not_singleton_inputs: List[str]
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
+    :type eval_last_turn: bool
+    """
+
+    # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    # Make sure to call super().__init__() in the child class's __init__ method.
+    # pylint: disable=dangerous-default-value
+    def __init__(
+        self,
+        *,
+        not_singleton_inputs: List[str] = ["conversation", "kwargs"],
+        eval_last_turn: bool = False,
+    ):
+        self._not_singleton_inputs = not_singleton_inputs
+        self._eval_last_turn = eval_last_turn
+        self._singleton_inputs = self._derive_singleton_inputs()
+        self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+
+    # This needs to be overridden just to change the function header into something more informative,
+    # and to be able to add a more specific docstring. The actual function contents should just be
+    # super().__call__(<inputs>)
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
+        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
+        The actual behavior of this function shouldn't change beyond adding more inputs to the
+        async_run_allowing_running_loop call.
+
+        :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
+        :type kwargs: Dict
+        :return: The evaluation result
+        :rtype: Dict
+        """
+        return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
+
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
+        """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
+        In the default case, all required inputs are assumed to be within eval_input, as user-friendly
+        typing is handled above this function in favor of polymorphic simplicity. This function must be
+        asynchronous.
+
+        :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
+        :type eval_input: Any
+        :return: A single evaluation result
+        :rtype: Dict
+        """
+
+    # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
+
+    def _derive_singleton_inputs(self) -> List[str]:
+        """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
+        when the evaluator is being used in a non-conversation context.
+        By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
+        Thankfully this works the way you'd hope, with the call_signature being based on the child
+        function's signature, not the parent's.
+
+        :return: A list of strings representing the names of singleton inputs.
+        :rtype: List[str]
+        """
+
+        call_signature = inspect.signature(self.__call__)
+        singletons = []
+        for param in call_signature.parameters:
+            if param not in self._not_singleton_inputs:
+                singletons.append(param)
+        return singletons
+
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+        """Produce the function that will be used to convert conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+        include_context = "context" in self._singleton_inputs
+        include_query = "query" in self._singleton_inputs
+        include_response = "response" in self._singleton_inputs
+
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            global_context = conversation.get("context", None)
+            # Extract queries, responses from conversation
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    queries.append(each_turn)
+                elif role == "assistant":
+                    responses.append(each_turn)
+            # TODO complain if len(queries) != len(responses)?
+            eval_inputs = []
+            for query, response in zip(queries, responses):
+                context = {}
+                if include_context:
+                    query_context = query.get("context", None)
+                    response_context = response.get("context", None)
+                    if global_context:
+                        context["global_context"] = global_context
+                    if query_context and not include_query:
+                        context["query_context"] = query_context
+                    if response_context and not include_response:
+                        context["response_context"] = response_context
+
+                eval_input: DerivedEvalInput = {}
+                if include_query:
+                    eval_input["query"] = query
+                if include_response:
+                    eval_input["response"] = response
+                if include_context:
+                    eval_input["context"] = str(context)
+                eval_inputs.append(eval_input)
+            return eval_inputs
+
+        return converter
+
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+        """Convert an arbitrary input into a list of inputs for evaluators.
+        It is assumed that evaluators generally make use of their inputs in one of two ways.
+        Either they receive a collection of keyname inputs that are all single values
+        (like a query and response), or they receive conversation that iss a list of dictionary
+        values.
+
+        The self._singleton_inputs list assigned during initialization is used to find and extract
+        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
+        is a valid input.
+
+        If both conversations and singletons are allowed, the function will raise an exception if both
+        are inputted.
+
+        This function must be overridden by child classes IF they need to both a conversation and
+        other inputs to be passed in.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+        :rtype: List
+        """
+
+        # Collect inputs
+        conversation = kwargs.get("conversation", None)
+        singletons = {}
+        if len(self._singleton_inputs) > 0:
+            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+        # Check that both conversation and other inputs aren't set
+        if conversation is not None and any(singletons.values()):
+            raise EvaluationException(
+                message="Invalid input",
+                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Handle Conversation
+        if conversation is not None:
+            return self._derive_conversation_converter()(conversation)
+        # Handle Singletons
+        if all(value is not None for value in singletons.values()):
+            return [singletons]  # TODO loosen requirements to allow for optional singletons?
+        # Missing input
+        raise EvaluationException(
+            message="Missing input",
+            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
+        """Aggregate the evaluation results of each conversation turn into a single result.
+
+        Exact implementation might need to vary slightly depending on the results produced.
+        Default behavior is to average the all number-based outputs.
+
+        :param per_turn_results: List of evaluation results for each turn in the conversation.
+        :type per_turn_results: List[Dict]
+        :return: A dictionary containing aggregated results, with numeric metrics having their
+            means as top-level values in the dictionary, and all original
+            values (including non-numerics) located in under the "evaluation_per_turn" key,
+            which each sub-key being a metric and each sub-value being a the list of that metric's
+            per-turn values.
+        :rtype: Dict
+        """
+
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+        # Go over each turn, and rotate the results into a
+        # metric: List[values] format for the evals_per_turn dictionary.
+        for turn in per_turn_results:
+            for metric, value in turn.items():
+                if metric not in evaluation_per_turn:
+                    evaluation_per_turn[metric] = []
+                evaluation_per_turn[metric].append(value)
+
+        # Find and average all numeric values
+        for metric, values in evaluation_per_turn.items():
+            if all(isinstance(value, (int, float)) for value in values):
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+        # Slap the per-turn results back in.
+        aggregated["evaluation_per_turn"] = evaluation_per_turn
+
+        return aggregated
+
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        per_turn_results = []
+        # Evaluate all inputs.
+        for eval_input in eval_input_list:
+            per_turn_results.append(await self._do_eval(eval_input))
+        # Return results as-is if only one result was produced.
+
+        if len(per_turn_results) == 1:
+            return per_turn_results[0]
+        if len(per_turn_results) == 0:
+            return {}  # TODO raise something?
+        # Otherwise, aggregate results.
+        return self._aggregate_results(per_turn_results=per_turn_results)
+
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
+        return self._async_evaluator
+
+
+class AsyncEvaluatorBase:
+    """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
+    to ensure that no one ever needs to extend or otherwise modify this class directly.
+    """
+
+    def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
+        self._real_call = real_call
+
+    # Don't look at my shame. Nothing to see here....
+    # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+    # are just not passed into this function instead of ending up in kwargs.
+    # Since we want this to be relatively call-agnostic, we just account for every input that any children
+    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
+    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+        if conversation is not None:
+            kwargs["conversation"] = conversation
+        if query is not None:
+            kwargs["query"] = query
+        if response is not None:
+            kwargs["response"] = response
+        if context is not None:
+            kwargs["context"] = context
+        return await self._real_call(**kwargs)
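
To make the new base class concrete, here is a hedged sketch (not part of the diff) of a minimal child evaluator. The evaluator name and metric are hypothetical; the contract it follows (override __call__ only to reshape the signature, implement an async _do_eval, and get conversation-to-turn conversion plus mean aggregation for free) is the one documented in _base_eval.py above.

from typing import Optional

from typing_extensions import override

from azure.ai.evaluation._evaluators._common import EvaluatorBase


class WordCountEvaluator(EvaluatorBase[float]):
    """Hypothetical evaluator: scores each response by its word count."""

    def __init__(self):
        super().__init__()

    @override
    def __call__(
        self,
        *,
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
        **kwargs,
    ):
        # Per the base-class docstring, this override only reshapes the signature;
        # "query" and "response" become the derived singleton inputs.
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

    @override
    async def _do_eval(self, eval_input):
        # eval_input is either the singleton dict {"query": ..., "response": ...} or a
        # DerivedEvalInput built from one conversation turn, where "response" is a
        # message dict carrying a "content" key.
        response = eval_input["response"]
        text = response["content"] if isinstance(response, dict) else response
        return {"word_count": float(len(text.split()))}


evaluator = WordCountEvaluator()

# Singleton inputs -> a single DoEvalResult.
print(evaluator(query="hi", response="hello there"))  # {'word_count': 2.0}

# Conversation input -> numeric metrics averaged across turns, with the raw
# per-turn values kept under "evaluation_per_turn".
conversation = {
    "messages": [
        {"role": "user", "content": "hi"},
        {"role": "assistant", "content": "hello there"},
        {"role": "user", "content": "bye"},
        {"role": "assistant", "content": "goodbye and take care"},
    ]
}
print(evaluator(conversation=conversation))
# {'word_count': 3.0, 'evaluation_per_turn': {'word_count': [2.0, 4.0]}}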