azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (64)
  1. azure/ai/evaluation/__init__.py +1 -1
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +182 -12
  6. azure/ai/evaluation/_constants.py +10 -2
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
  24. azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
  31. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
  32. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  33. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
  34. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
  35. azure/ai/evaluation/_exceptions.py +9 -6
  36. azure/ai/evaluation/_http_utils.py +203 -132
  37. azure/ai/evaluation/_model_configurations.py +5 -5
  38. azure/ai/evaluation/_vendor/__init__.py +3 -0
  39. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  40. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  41. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  42. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  43. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  46. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  47. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  48. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  49. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  50. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  53. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  54. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  55. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  56. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
  57. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  58. azure/ai/evaluation/simulator/_simulator.py +112 -113
  59. azure/ai/evaluation/simulator/_tracing.py +4 -4
  60. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
  61. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  62. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
  63. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -6,7 +6,7 @@ import functools
  import inspect
  import json
  import logging
- from typing import Callable, Dict, TypeVar
+ from typing import Callable, Dict, Literal, Optional, Union, cast

  import pandas as pd
  from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,31 +16,30 @@ from promptflow.client import PFClient
  from promptflow.core import Prompty as prompty_core
  from typing_extensions import ParamSpec

+ from azure.ai.evaluation._model_configurations import AzureAIProject
+
  from ..._user_agent import USER_AGENT
- from .._utils import _trace_destination_from_project_scope
+ from .._utils import EvaluateResult, _trace_destination_from_project_scope

  LOGGER = logging.getLogger(__name__)

  P = ParamSpec("P")
- R = TypeVar("R")


- def _get_evaluator_type(evaluator: Dict[str, Callable]):
+ def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
  """
  Get evaluator type for telemetry.

  :param evaluator: The evaluator object
  :type evaluator: Dict[str, Callable]
  :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
- :rtype: str
+ :rtype: Literal["content-safety", "built-in", "custom"]
  """
- built_in = False
- content_safety = False
-
  module = inspect.getmodule(evaluator)
- built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
- if built_in:
- content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
+ module_name = module.__name__ if module else ""
+
+ built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+ content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

  if content_safety:
  return "content-safety"
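As context for the hunk above: the telemetry classifier now reduces to two module-name prefix checks. A minimal standalone sketch of that pattern (toy code for illustration, not the packaged module) behaves like this:

    import inspect
    from typing import Callable, Literal


    def classify_evaluator(evaluator: Callable) -> Literal["content-safety", "built-in", "custom"]:
        # Same prefix checks as _get_evaluator_type in the hunk above.
        module = inspect.getmodule(evaluator)
        module_name = module.__name__ if module else ""
        if module_name.startswith("azure.ai.evaluation._evaluators._content_safety"):
            return "content-safety"
        if module_name.startswith("azure.ai.evaluation._evaluators."):
            return "built-in"
        return "custom"


    def answer_length(*, response: str) -> dict:
        return {"length": len(response)}


    print(classify_evaluator(answer_length))  # "custom": defined outside the built-in evaluator modules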
@@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):


  # cspell:ignore isna
- def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
+ def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
  """Decorator to log evaluate activity

  :param func: The function to be decorated
  :type func: Callable
  :returns: The decorated function
- :rtype: Callable[P, R]
+ :rtype: Callable[P, EvaluateResult]
  """

  @functools.wraps(func)
- def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
  from promptflow._sdk._telemetry import ActivityType, log_activity
  from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

- evaluators = kwargs.get("evaluators", [])
- azure_ai_project = kwargs.get("azure_ai_project", None)
+ evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+ azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))

  pf_client = PFClient(
  config=(
@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
  track_in_cloud = bool(pf_client._config.get_trace_destination()) # pylint: disable=protected-access
  evaluate_target = bool(kwargs.get("target", None))
  evaluator_config = bool(kwargs.get("evaluator_config", None))
- custom_dimensions = {
+ custom_dimensions: Dict[str, Union[str, bool]] = {
  "track_in_cloud": track_in_cloud,
  "evaluate_target": evaluate_target,
  "evaluator_config": evaluator_config,
azure/ai/evaluation/_evaluate/_utils.py

@@ -6,15 +6,22 @@ import logging
  import os
  import re
  import tempfile
- from collections import namedtuple
  from pathlib import Path
- from typing import Dict
+ from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union

  import pandas as pd
-
- from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
+ from promptflow.client import PFClient
+ from promptflow.entities import Run
+
+ from azure.ai.evaluation._constants import (
+ DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+ DefaultOpenEncoding,
+ EvaluationRunProperties,
+ Prefixes,
+ )
  from azure.ai.evaluation._evaluate._eval_run import EvalRun
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._model_configurations import AzureAIProject

  LOGGER = logging.getLogger(__name__)

@@ -23,14 +30,26 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
  "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
  )

- AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
+
+ class AzureMLWorkspace(NamedTuple):
+ subscription_id: str
+ resource_group_name: str
+ workspace_name: str


- def is_none(value):
+ class EvaluateResult(TypedDict):
+ metrics: Dict[str, float]
+ studio_url: Optional[str]
+ rows: List[Dict]
+
+
+ def is_none(value) -> bool:
  return value is None or str(value).lower() == "none"


- def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint: disable=name-too-long
+ def extract_workspace_triad_from_trace_provider( # pylint: disable=name-too-long
+ trace_provider: str,
+ ) -> AzureMLWorkspace:
  match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
  if not match or len(match.groups()) != 5:
  raise EvaluationException(
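The namedtuple triad becomes a typed NamedTuple and the new EvaluateResult TypedDict gives the evaluate output a declared shape. A self-contained sketch of the two shapes, using placeholder values (the row keys below are illustrative, not taken from this diff):

    from typing import Dict, List, NamedTuple, Optional, TypedDict


    class AzureMLWorkspace(NamedTuple):
        subscription_id: str
        resource_group_name: str
        workspace_name: str


    class EvaluateResult(TypedDict):
        metrics: Dict[str, float]
        studio_url: Optional[str]
        rows: List[Dict]


    ws = AzureMLWorkspace("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")
    result: EvaluateResult = {
        "metrics": {"gpt_coherence": 4.0},
        "studio_url": None,
        "rows": [{"inputs.query": "hi", "outputs.gpt_coherence": 4}],
    }
    print(ws.workspace_name, result["metrics"])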
@@ -47,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint:
  subscription_id = match.group(1)
  resource_group_name = match.group(3)
  workspace_name = match.group(5)
- return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
+ return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)


  def load_jsonl(path):
@@ -55,7 +74,7 @@ def load_jsonl(path):
  return [json.loads(line) for line in f.readlines()]


- def _azure_pf_client_and_triad(trace_destination):
+ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
  from promptflow.azure._cli._utils import _get_azure_pf_client

  ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -69,14 +88,14 @@


  def _log_metrics_and_instance_results(
- metrics,
- instance_results,
- trace_destination,
- run,
- evaluation_name,
- ) -> str:
+ metrics: Dict[str, Any],
+ instance_results: pd.DataFrame,
+ trace_destination: Optional[str],
+ run: Run,
+ evaluation_name: Optional[str],
+ ) -> Optional[str]:
  if trace_destination is None:
- LOGGER.error("Unable to log traces as trace destination was not defined.")
+ LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
  return None

  azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -94,7 +113,6 @@
  ml_client=azure_pf_client.ml_client,
  promptflow_run=run,
  ) as ev_run:
-
  artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN

  with tempfile.TemporaryDirectory() as tmpdir:
@@ -112,7 +130,8 @@
  if run is None:
  ev_run.write_properties_to_run_history(
  properties={
- "_azureml.evaluation_run": "azure-ai-generative-parent",
+ EvaluationRunProperties.RUN_TYPE: "eval_run",
+ EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
  "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
  "isEvaluatorRun": "true",
  }
@@ -138,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
  return studio_url


- def _trace_destination_from_project_scope(project_scope: dict) -> str:
+ def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
  subscription_id = project_scope["subscription_id"]
  resource_group_name = project_scope["resource_group_name"]
  workspace_name = project_scope["project_name"]
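_trace_destination_from_project_scope now takes a typed AzureAIProject mapping. A hedged sketch of the keys it reads and the azureml:// destination it is typically expanded into (the exact format string is not part of this hunk, so treat it as an assumption):

    # Assumed shape: keys match the lookups shown in the hunk above.
    project_scope = {
        "subscription_id": "00000000-0000-0000-0000-000000000000",
        "resource_group_name": "my-rg",
        "project_name": "my-ai-project",
    }

    # Assumed format: a standard Azure ML workspace ARM path, consistent with
    # AZURE_WORKSPACE_REGEX_FORMAT earlier in this file.
    trace_destination = (
        f"azureml://subscriptions/{project_scope['subscription_id']}"
        f"/resourceGroups/{project_scope['resource_group_name']}"
        f"/providers/Microsoft.MachineLearningServices"
        f"/workspaces/{project_scope['project_name']}"
    )
    print(trace_destination)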
@@ -151,9 +170,9 @@
  return trace_destination


- def _write_output(path, data_dict):
+ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
  p = Path(path)
- if os.path.isdir(path):
+ if p.is_dir():
  p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

  with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -161,7 +180,7 @@


  def _apply_column_mapping(
- source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+ source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
  ) -> pd.DataFrame:
  """
  Apply column mapping to source_df based on mapping_config.
@@ -211,7 +230,7 @@
  return result_df


- def _has_aggregator(evaluator):
+ def _has_aggregator(evaluator: object) -> bool:
  return hasattr(evaluator, "__aggregate__")


@@ -234,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
  return default_value


- def set_event_loop_policy():
+ def set_event_loop_policy() -> None:
  import asyncio
  import platform

  if platform.system().lower() == "windows":
  # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
- asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
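The event-loop workaround above only matters on Windows, where the default ProactorEventLoop can raise "Event loop is closed" during cleanup. A hedged usage sketch of the same guard applied before running an async entry point:

    import asyncio
    import platform


    async def main() -> None:
        await asyncio.sleep(0)  # stand-in for async simulator or evaluator work


    if platform.system().lower() == "windows":
        # Mirrors set_event_loop_policy() from the hunk above.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]

    asyncio.run(main())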
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -3,6 +3,7 @@
  # ---------------------------------------------------------
  import os
  from typing import Optional
+
  from typing_extensions import override

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -50,7 +51,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
  query: Optional[str] = None,
  response: Optional[str] = None,
  conversation: Optional[dict] = None,
- **kwargs
+ **kwargs,
  ):
  """Evaluate coherence. Accepts either a query and response for a single evaluation,
  or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
@@ -65,6 +66,6 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
  to be dictionaries with keys "content" and "role".
  :paramtype conversation: Optional[Dict]
  :return: The relevance score.
- :rtype: dict
+ :rtype: Dict[str, float]
  """
  return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
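For reference, a hedged usage sketch of the call signature above; the model_config values are placeholders and the keys are assumed to follow the package's Azure OpenAI model configuration:

    from azure.ai.evaluation import CoherenceEvaluator

    # Placeholder model configuration; keys assumed to follow AzureOpenAIModelConfiguration.
    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<your-deployment>",
        "api_key": "<your-api-key>",
    }

    coherence = CoherenceEvaluator(model_config)
    result = coherence(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )
    print(result)  # e.g. {"gpt_coherence": 5.0}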
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -2,19 +2,55 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- from typing import List, Dict, Callable, Any
  import inspect
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final

- from abc import ABC
-
- import numpy as np
  from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from typing_extensions import ParamSpec, TypeAlias
+
+ from azure.ai.evaluation._common.math import list_mean
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+ P = ParamSpec("P")
+ T = TypeVar("T")
+ T_EvalValue = TypeVar("T_EvalValue")
+
+
+ class DerivedEvalInput(TypedDict, total=False):
+ """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+ query: Dict[str, Any]
+ response: Dict[str, Any]
+ context: str
+
+
+ AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+ """TypeAlias that models the return value of EvaluatorBase._aggregate_results

- from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ .. code-block:: python
+
+ foo: AggregateResult[float] = {
+ "evaluation_per_turn": {
+ "gpt_coherence": [1.0, 2.0, 3.0]
+ },
+ "gpt_coherence": 2.0
+ }
+ """
+
+ DoEvalResult: TypeAlias = Dict[str, T]
+ """TypeAlias that models the return value of EvaluatorBase._do_eval
+
+ .. code-block:: python
+
+ foo: DoEvalResult[float] = {
+ "gpt_coherence": 2.0
+ }
+ """


  # TODO exception target pass down?
- class EvaluatorBase(ABC):
+ class EvaluatorBase(ABC, Generic[T_EvalValue]):
  """Base class for all evaluators that are capable of accepting either a group of single values,
  or conversation as input. All such evaluators need to implement two functions of their own:
  - _convert_conversation_to_eval_input
@@ -51,7 +87,7 @@ class EvaluatorBase(ABC):
  # This needs to be overridden just to change the function header into something more informative,
  # and to be able to add a more specific docstring. The actual function contents should just be
  # super().__call__(<inputs>)
- def __call__(self, **kwargs) -> Dict:
+ def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
  one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
  The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -64,9 +100,8 @@ class EvaluatorBase(ABC):
  """
  return async_run_allowing_running_loop(self._async_evaluator, **kwargs)

- # Probably the only thing that can't be simplified. Each evaluator, or at least each family
- # of evaluators, will need to implement their own version of this function.
- async def _do_eval(self, eval_input: Any) -> Dict:
+ @abstractmethod
+ async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
  """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
  In the default case, all required inputs are assumed to be within eval_input, as user-friendly
  typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -76,12 +111,7 @@ class EvaluatorBase(ABC):
  :type eval_input: Any
  :return: A single evaluation result
  :rtype: Dict
-
  """
- raise EvaluationException(
- message="Not implemented",
- internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
- )

  # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
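The change above turns _do_eval into a true abstractmethod and makes the base class generic over the per-metric value type. A toy illustration of that pattern (not the library's code):

    from abc import ABC, abstractmethod
    from typing import Any, Dict, Generic, TypeVar

    T_EvalValue = TypeVar("T_EvalValue")


    class ToyEvaluatorBase(ABC, Generic[T_EvalValue]):
        @abstractmethod
        async def _do_eval(self, eval_input: Any) -> Dict[str, T_EvalValue]:
            """Subclasses must implement this; the base no longer raises at runtime."""


    class ToyLengthEvaluator(ToyEvaluatorBase[float]):
        async def _do_eval(self, eval_input: Any) -> Dict[str, float]:
            return {"length": float(len(str(eval_input)))}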
@@ -103,7 +133,7 @@ class EvaluatorBase(ABC):
  singletons.append(param)
  return singletons

- def _derive_conversation_converter(self) -> Callable:
+ def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
  """Produce the function that will be used to convert conversations to a list of evaluable inputs.
  This uses the inputs derived from the _derive_singleton_inputs function to determine which
  aspects of a conversation ought to be extracted.
@@ -115,12 +145,12 @@ class EvaluatorBase(ABC):
  include_query = "query" in self._singleton_inputs
  include_response = "response" in self._singleton_inputs

- def converter(conversation: Dict) -> List:
- messages = conversation["messages"]
+ def converter(conversation: Dict) -> List[DerivedEvalInput]:
+ messages = cast(List[Dict[str, Any]], conversation["messages"])
  global_context = conversation.get("context", None)
  # Extract queries, responses from conversation
- queries = []
- responses = []
+ queries: List[Dict[str, Any]] = []
+ responses: List[Dict[str, Any]] = []

  # Convert conversation slice into queries and responses.
  # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -147,7 +177,7 @@ class EvaluatorBase(ABC):
  if response_context and not include_response:
  context["response_context"] = response_context

- eval_input = {}
+ eval_input: DerivedEvalInput = {}
  if include_query:
  eval_input["query"] = query
  if include_response:
@@ -159,7 +189,7 @@ class EvaluatorBase(ABC):

  return converter

- def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
  """Convert an arbitrary input into a list of inputs for evaluators.
  It is assumed that evaluators generally make use of their inputs in one of two ways.
  Either they receive a collection of keyname inputs that are all single values
@@ -211,7 +241,7 @@ class EvaluatorBase(ABC):
  target=ErrorTarget.CONVERSATION,
  )

- def _aggregate_results(self, per_turn_results: List[Dict]) -> Dict:
+ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
  """Aggregate the evaluation results of each conversation turn into a single result.

  Exact implementation might need to vary slightly depending on the results produced.
@@ -227,8 +257,8 @@ class EvaluatorBase(ABC):
  :rtype: Dict
  """

- aggregated = {}
- evaluation_per_turn = {}
+ aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+ evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}

  # Go over each turn, and rotate the results into a
  # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,13 +271,13 @@ class EvaluatorBase(ABC):
  # Find and average all numeric values
  for metric, values in evaluation_per_turn.items():
  if all(isinstance(value, (int, float)) for value in values):
- aggregated[metric] = np.mean(values)
+ aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
  # Slap the per-turn results back in.
  aggregated["evaluation_per_turn"] = evaluation_per_turn

  return aggregated

- async def _real_call(self, **kwargs):
+ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
  """The asynchronous call where real end-to-end evaluation logic is performed.

  :keyword kwargs: The inputs to evaluate.
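list_mean comes from the new azure/ai/evaluation/_common/math.py (added in this release but not shown hunk-by-hunk here); a minimal numpy-free stand-in with the same intent would be the following, treated as an assumption about that module:

    from typing import List, Union


    def list_mean(lst: List[Union[int, float]]) -> float:
        # Assumed behavior: plain arithmetic mean, replacing np.mean above.
        return sum(lst) / len(lst)


    assert list_mean([1.0, 2.0, 3.0]) == 2.0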
@@ -270,9 +300,8 @@ class EvaluatorBase(ABC):
  # Otherwise, aggregate results.
  return self._aggregate_results(per_turn_results=per_turn_results)

- # ~~~ METHODS THAT SHOULD NEVER BE OVERRIDDEN BY CHILDREN~~~
-
- def _to_async(self):
+ @final
+ def _to_async(self) -> "AsyncEvaluatorBase":
  return self._async_evaluator

azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -2,26 +2,23 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

+ import math
  import re
  from typing import Dict

- from typing_extensions import override
-
-
- import numpy as np
-
  from promptflow.core import AsyncPrompty
+ from typing_extensions import override

- from ..._common.utils import construct_prompty_model_config
+ from ..._common.utils import construct_prompty_model_config, validate_model_config
+ from . import EvaluatorBase

  try:
  from ..._user_agent import USER_AGENT
  except ImportError:
- USER_AGENT = None
- from . import EvaluatorBase
+ USER_AGENT = "None"


- class PromptyEvaluatorBase(EvaluatorBase):
+ class PromptyEvaluatorBase(EvaluatorBase[float]):
  """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
  make use of a prompty file, and return their results as a dictionary, with a single key-value pair
  linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -42,13 +39,13 @@ class PromptyEvaluatorBase(EvaluatorBase):
  LLM_CALL_TIMEOUT = 600
  DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

- def __init__(self, *, result_key: str, prompty_file: str, model_config: Dict, eval_last_turn: bool = False):
+ def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
  self._result_key = result_key
  self._prompty_file = prompty_file
  super().__init__(eval_last_turn=eval_last_turn)

  prompty_model_config = construct_prompty_model_config(
- model_config,
+ validate_model_config(model_config),
  self.DEFAULT_OPEN_API_VERSION,
  USER_AGENT,
  )
@@ -59,7 +56,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
  # defining a default here.

  @override
- async def _do_eval(self, eval_input: Dict) -> Dict:
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
  """Do a relevance evaluation.

  :param eval_input: The input to the evaluator. Expected to contain
@@ -71,7 +68,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
  """
  llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)

- score = np.nan
+ score = math.nan
  if llm_output:
  match = re.search(r"\d", llm_output)
  if match:
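The score parse above now falls back to math.nan instead of np.nan, removing the numpy dependency from this base class. A small sketch of the same parse step; the float() conversion after the match is the likely continuation but is not shown in this hunk:

    import math
    import re

    for llm_output in ("4", "coherence: 3", ""):
        score = math.nan
        if llm_output:
            match = re.search(r"\d", llm_output)
            if match:
                score = float(match.group())  # assumption: mirrors the lines just past this hunk
        print(score)  # 4.0, 3.0, nan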
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -1,48 +1,46 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from typing import Dict, Optional, Union

- from typing import Dict, Optional
  from typing_extensions import override

- from azure.identity import DefaultAzureCredential
- from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
  from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import EvaluationException
+ from azure.core.credentials import TokenCredential
+
  from . import EvaluatorBase


- class RaiServiceEvaluatorBase(EvaluatorBase):
+ class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
  """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
  This includes content safety evaluators, protected material evaluators, and others. These evaluators
  are all assumed to be of the "query and response or conversation" input variety.

- param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
- to specify which evaluation to perform.
- type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
- param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+ :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+ to specify which evaluation to perform.
+ :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+ :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
  aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
  aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
  when this occurs. Default is False, resulting full conversation evaluation and aggregation.
- type eval_last_turn: bool
+ :type eval_last_turn: bool
  """

  @override
  def __init__(
  self,
- eval_metric: EvaluationMetrics,
+ eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
  azure_ai_project: dict,
- credential: Optional[dict] = None,
+ credential: TokenCredential,
  eval_last_turn: bool = False,
  ):
  super().__init__(eval_last_turn=eval_last_turn)
  self._eval_metric = eval_metric
- self._azure_ai_project = azure_ai_project
- if credential is None:
- # Use DefaultCredential if no credential is provided
- self._credential = DefaultAzureCredential()
- else:
- self._credential = credential
+ self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+ self._credential = credential

  @override
  def __call__(
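With the DefaultAzureCredential fallback removed, callers of the RAI-service evaluators now pass a TokenCredential explicitly. A hedged usage sketch with one of the derived evaluators from this package (the exact subclass signature is not shown in this diff):

    from azure.ai.evaluation import ViolenceEvaluator
    from azure.identity import DefaultAzureCredential

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    violence = ViolenceEvaluator(
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),  # any azure.core TokenCredential works
    )
    result = violence(query="What is the capital of France?", response="Paris.")
    print(result)  # e.g. a dict with a severity label, score, and reason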
@@ -51,7 +49,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
  query: Optional[str] = None,
  response: Optional[str] = None,
  conversation: Optional[dict] = None,
- **kwargs
+ **kwargs,
  ):
  """Evaluate either a query and response or a conversation. Must supply either a query AND response,
  or a conversation, but not both.
@@ -65,12 +63,12 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
  to be dictionaries with keys "content", "role", and possibly "context".
  :paramtype conversation: Optional[Dict]
  :return: The evaluation result.
- :rtype: Dict
+ :rtype: Dict[str, Union[str, float]]
  """
  return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

  @override
- async def _do_eval(self, eval_input: Dict):
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
  """Perform the evaluation using the Azure AI RAI service.
  The exact evaluation performed is determined by the evaluation metric supplied
  by the child class initializer.