azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
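The hunks below show only the files removed between the two versions. To reproduce the full comparison locally, here is a minimal sketch (not part of the package) that diffs the text members of the two wheels with the Python standard library; the wheel filenames are assumptions based on the versions above, and you would download them first, for example with `pip download azure-ai-evaluation==1.0.0 --no-deps` and `pip download azure-ai-evaluation==1.0.0b1 --no-deps`:

import difflib
import zipfile

# Assumed local filenames for the two downloaded wheels.
OLD_WHEEL = "azure_ai_evaluation-1.0.0-py3-none-any.whl"
NEW_WHEEL = "azure_ai_evaluation-1.0.0b1-py3-none-any.whl"

def wheel_texts(path):
    """Return {member name: list of text lines} for UTF-8 members of a wheel."""
    members = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                members[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                pass  # skip binary members
    return members

old, new = wheel_texts(OLD_WHEEL), wheel_texts(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    for line in difflib.unified_diff(
        old.get(name, []), new.get(name, []),
        fromfile=f"1.0.0/{name}", tofile=f"1.0.0b1/{name}",
    ):
        print(line, end="")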
--- azure/ai/evaluation/_common/math.py
+++ /dev/null
@@ -1,89 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- import math
- from typing import List, Callable, Any
-
- from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-
-
- def list_sum(lst: List[float]) -> float:
-     """Given a list of floats, return the sum of the values.
-
-     :param lst: A list of floats.
-     :type lst: List[float]
-     :return: The sum of the values in the list.
-     :rtype: float
-     """
-
-     return sum(lst)
-
-
- def list_mean(lst: List[float]) -> float:
-     """Given a list of floats, calculate the mean of the values.
-
-     :param lst: A list of floats.
-     :type lst: List[float]
-     :return: The mean of the values in the list.
-     :rtype: float
-     """
-
-     return list_sum(lst) / len(lst)
-
-
- def list_mean_nan_safe(lst: List[float]) -> float:
-     """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
-
-     :param lst: A list of floats.
-     :type lst: List[float]
-     :return: The mean of the values in the list.
-     :rtype: float
-     """
-
-     msg = "All score values are NaN. The mean cannot be calculated."
-     if all(math.isnan(l) for l in lst):
-         raise EvaluationException(
-             message=msg,
-             internal_message=msg,
-             blame=ErrorBlame.USER_ERROR,
-             category=ErrorCategory.INVALID_VALUE,
-             target=ErrorTarget.CONVERSATION,
-         )
-     return list_mean([l for l in lst if not is_none_or_nan(l)])
-
-
- def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
-     """Given a list of floats, remove all nan values, then apply the inputted transform function
-     to the remaining values, and return the resulting list of outputted values.
-
-     :param lst: A list of floats.
-     :type lst: List[float]
-     :param transform_fn: A function that produces something when applied to a float.
-     :type transform_fn: Callable[[float], Any]
-     :return: A list of the transformed values.
-     :rtype: List[Any]
-     """
-
-     msg = "All score values are NaN. The mean cannot be calculated."
-     if all(math.isnan(l) for l in lst):
-         raise EvaluationException(
-             message=msg,
-             internal_message=msg,
-             blame=ErrorBlame.USER_ERROR,
-             category=ErrorCategory.INVALID_VALUE,
-             target=ErrorTarget.CONVERSATION,
-         )
-     return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
-
-
- def is_none_or_nan(val: float) -> bool:
-     """math.isnan raises an error if None is inputted. This is a more robust wrapper.
-
-     :param val: The value to check.
-     :type val: float
-     :return: Whether the value is None or NaN.
-     :rtype: bool
-     """
-
-     return val is None or math.isnan(val)
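For reference, the removed module is a small set of NaN-aware aggregation helpers. A minimal sketch of how they behave (the import resolves against the 1.0.0 wheel, where the module still exists; the sample scores are made up):

import math

# Import path matches the module removed in this diff (present in 1.0.0, gone in 1.0.0b1).
from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe

scores = [4.0, math.nan, 5.0, 3.0]

# NaN entries are dropped before averaging; an all-NaN list raises EvaluationException.
print(list_mean_nan_safe(scores))                            # 4.0

# The transform is applied to the non-NaN values only.
print(apply_transform_nan_safe(scores, lambda s: s >= 4.0))  # [True, True, False]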
--- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
+++ /dev/null
@@ -1,99 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- # pylint: disable=protected-access
-
- import inspect
- import logging
- import math
- import os
- from collections import OrderedDict
- from concurrent.futures import Future
- from typing import Any, Callable, Dict, Optional, Union
-
- import pandas as pd
- from promptflow.client import PFClient
- from promptflow.entities import Run
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
- LOGGER = logging.getLogger(__name__)
-
-
- class ProxyRun:
-     def __init__(self, run: Future, **kwargs) -> None: # pylint: disable=unused-argument
-         self.run = run
-
-
- class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
-     def __init__( # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
-         self, pf_client: PFClient
-     ) -> None:
-         self._pf_client = pf_client
-         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
-
-     def run(
-         self,
-         flow: Union[str, os.PathLike, Callable],
-         data: Union[str, os.PathLike],
-         column_mapping: Optional[Dict[str, str]] = None,
-         **kwargs
-     ) -> ProxyRun:
-         flow_to_run = flow
-         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
-             flow_to_run = flow._to_async() # pylint: disable=protected-access
-
-         batch_use_async = self._should_batch_use_async(flow_to_run)
-         eval_future = self._thread_pool.submit(
-             self._pf_client.run,
-             flow_to_run,
-             data=data,
-             column_mapping=column_mapping,
-             batch_use_async=batch_use_async,
-             **kwargs
-         )
-         return ProxyRun(run=eval_future)
-
-     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
-         run: Run = proxy_run.run.result()
-         result_df = self._pf_client.get_details(run, all_results=all_results)
-         result_df.replace("(Failed)", math.nan, inplace=True)
-         return result_df
-
-     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
-         run: Run = proxy_run.run.result()
-         return self._pf_client.get_metrics(run)
-
-     def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
-         run = proxy_run.run.result()
-
-         # pylint: disable=protected-access
-         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
-         failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
-
-         # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
-         if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
-             status = "Completed with Errors"
-         else:
-             status = run.status
-
-         # Return the ordered dictionary with the updated status
-         return OrderedDict(
-             [
-                 ("status", status),
-                 ("duration", str(run._end_time - run._created_on)),
-                 ("completed_lines", completed_lines),
-                 ("failed_lines", failed_lines),
-                 ("log_path", str(run._output_path)),
-             ]
-         )
-
-     @staticmethod
-     def _should_batch_use_async(flow):
-         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
-             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
-                 return True
-             if inspect.iscoroutinefunction(flow):
-                 return True
-             return False
-         return False
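A minimal sketch of how this ProxyClient was intended to be driven, assuming a promptflow PFClient and a local data.jsonl with a "response" column; the toy evaluator and file name are illustrative, not part of the package:

from promptflow.client import PFClient

# Import path matches the module removed in this diff (present in 1.0.0).
from azure.ai.evaluation._evaluate._batch_run.proxy_client import ProxyClient

def non_empty_evaluator(*, response: str):
    # Toy evaluator: 1.0 when the response is non-empty, else 0.0.
    return {"non_empty": 1.0 if response else 0.0}

client = ProxyClient(PFClient())
proxy_run = client.run(
    flow=non_empty_evaluator,
    data="data.jsonl",  # assumed JSONL file with a "response" column
    column_mapping={"response": "${data.response}"},
)

print(client.get_run_summary(proxy_run))  # status, duration, completed/failed lines
print(client.get_details(proxy_run))      # per-line results as a pandas DataFrame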
--- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
+++ /dev/null
@@ -1,46 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
- import os
- import types
- from typing import Optional, Type
-
- from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
- from azure.ai.evaluation._constants import PF_DISABLE_TRACING
-
-
- class TargetRunContext:
-     """Context manager for target batch run.
-
-     :param upload_snapshot: Whether to upload target snapshot.
-     :type upload_snapshot: bool
-     """
-
-     def __init__(self, upload_snapshot: bool) -> None:
-         self._upload_snapshot = upload_snapshot
-         self._original_cwd = os.getcwd()
-
-     def __enter__(self) -> None:
-         # Preserve current working directory, as PF may change it without restoring it afterward
-         self._original_cwd = os.getcwd()
-
-         # Address "[WinError 32] The process cannot access the file" error,
-         # caused by conflicts when the venv and target function are in the same directory.
-         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
-         if not self._upload_snapshot:
-             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
-
-         os.environ[PF_DISABLE_TRACING] = "true"
-
-     def __exit__(
-         self,
-         exc_type: Optional[Type[BaseException]],
-         exc_value: Optional[BaseException],
-         exc_tb: Optional[types.TracebackType],
-     ) -> None:
-         os.chdir(self._original_cwd)
-
-         if not self._upload_snapshot:
-             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
-
-         os.environ.pop(PF_DISABLE_TRACING, None)
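A minimal sketch of the intended usage of the removed context manager; the target call inside the block is a hypothetical stand-in for the batch run it wraps:

# Import path matches the module removed in this diff (present in 1.0.0).
from azure.ai.evaluation._evaluate._batch_run.target_run_context import TargetRunContext

def run_target_batch():
    # Hypothetical stand-in for the pf_client.run(...) call this context wraps.
    pass

with TargetRunContext(upload_snapshot=False):
    # Inside the block PF_FLOW_ENTRY_IN_TMP and PF_DISABLE_TRACING are set;
    # on exit both are removed and the working directory is restored,
    # even if promptflow changed it during the run.
    run_target_batch()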
--- azure/ai/evaluation/_evaluators/_common/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- from ._base_eval import EvaluatorBase
- from ._base_prompty_eval import PromptyEvaluatorBase
- from ._base_rai_svc_eval import RaiServiceEvaluatorBase
-
- __all__ = [
-     "EvaluatorBase",
-     "PromptyEvaluatorBase",
-     "RaiServiceEvaluatorBase",
- ]
--- azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ /dev/null
@@ -1,344 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- import inspect
- from abc import ABC, abstractmethod
- from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
-
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from typing_extensions import ParamSpec, TypeAlias, get_overloads
-
- from azure.ai.evaluation._common.math import list_mean
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._common.utils import remove_optional_singletons
-
- P = ParamSpec("P")
- T = TypeVar("T")
- T_EvalValue = TypeVar("T_EvalValue")
-
-
- class DerivedEvalInput(TypedDict, total=False):
-     """The eval input generated by EvaluatorBase._derive_conversation_starter."""
-
-     query: Dict[str, Any]
-     response: Dict[str, Any]
-     context: str
-
-
- AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
- """TypeAlias that models the return value of EvaluatorBase._aggregate_results
-
- .. code-block:: python
-
-     foo: AggregateResult[float] = {
-         "evaluation_per_turn": {
-             "coherence": [1.0, 2.0, 3.0]
-         },
-         "coherence": 2.0
-     }
- """
-
- DoEvalResult: TypeAlias = Dict[str, T]
- """TypeAlias that models the return value of EvaluatorBase._do_eval
-
- .. code-block:: python
-
-     foo: DoEvalResult[float] = {
-         "coherence": 2.0
-     }
- """
-
-
- # TODO exception target pass down?
- class EvaluatorBase(ABC, Generic[T_EvalValue]):
-     """Base class for all evaluators that are capable of accepting either a group of single values,
-     or conversation as input. All such evaluators need to implement two functions of their own:
-         - _convert_conversation_to_eval_input
-         - _do_eval
-
-     Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
-     documentation, although ideally the actual child implementation of __call__ should just amount to
-     'super().__init__()'.
-
-
-     :param not_singleton_inputs: A list of strings that represent the names of
-         inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
-         is ["conversation", "kwargs"].
-     :type not_singleton_inputs: List[str]
-     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
-     :type eval_last_turn: bool
-     """
-
-     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
-
-     # Make sure to call super().__init__() in the child class's __init__ method.
-     # pylint: disable=dangerous-default-value
-     def __init__(
-         self,
-         *,
-         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
-         eval_last_turn: bool = False,
-     ):
-         self._not_singleton_inputs = not_singleton_inputs
-         self._eval_last_turn = eval_last_turn
-         self._singleton_inputs = self._derive_singleton_inputs()
-         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
-
-     # This needs to be overridden just to change the function header into something more informative,
-     # and to be able to add a more specific docstring. The actual function contents should just be
-     # super().__call__(<inputs>)
-     def __call__( # pylint: disable=docstring-missing-param
-         self,
-         *args,
-         **kwargs,
-     ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
-         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
-         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
-         The actual behavior of this function shouldn't change beyond adding more inputs to the
-         async_run_allowing_running_loop call.
-
-         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
-         :type kwargs: Dict
-         :return: The evaluation result
-         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
-         """
-         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
-
-     @abstractmethod
-     async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
-         """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
-         In the default case, all required inputs are assumed to be within eval_input, as user-friendly
-         typing is handled above this function in favor of polymorphic simplicity. This function must be
-         asynchronous.
-
-         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
-         :type eval_input: Any
-         :return: A single evaluation result
-         :rtype: DoEvalResult[T_EvalValue]
-         """
-
-     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
-
-     def _derive_singleton_inputs(self) -> List[str]:
-         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
-         when the evaluator is being used in a non-conversation context.
-         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
-         Thankfully this works the way you'd hope, with the call_signature being based on the child
-         function's signature, not the parent's.
-
-         :return: A list of strings representing the names of singleton inputs.
-         :rtype: List[str]
-         """
-
-         overloads = get_overloads(self.__call__)
-         if not overloads:
-             call_signatures = [inspect.signature(self.__call__)]
-         else:
-             call_signatures = [inspect.signature(overload) for overload in overloads]
-         call_signature = inspect.signature(self.__call__)
-         singletons = []
-         for call_signature in call_signatures:
-             params = call_signature.parameters
-             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
-                 continue
-             # exclude self since it is not a singleton input
-             singletons.extend([p for p in params if p != "self"])
-         return singletons
-
-     def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
-         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
-         This uses the inputs derived from the _derive_singleton_inputs function to determine which
-         aspects of a conversation ought to be extracted.
-
-         :return: The function that will be used to convert conversations to evaluable inputs.
-         :rtype: Callable
-         """
-         include_context = "context" in self._singleton_inputs
-         include_query = "query" in self._singleton_inputs
-         include_response = "response" in self._singleton_inputs
-
-         def converter(conversation: Dict) -> List[DerivedEvalInput]:
-             messages = cast(List[Dict[str, Any]], conversation["messages"])
-             global_context = conversation.get("context", None)
-             # Extract queries, responses from conversation
-             queries: List[Dict[str, Any]] = []
-             responses: List[Dict[str, Any]] = []
-
-             # Convert conversation slice into queries and responses.
-             # Assume that 'user' role is asking queries and 'assistant' role is responding.
-             if self._eval_last_turn and len(messages) > 1:
-                 messages = messages[-2:]
-
-             for each_turn in messages:
-                 role = each_turn["role"]
-                 if role == "user":
-                     queries.append(each_turn)
-                 elif role == "assistant":
-                     responses.append(each_turn)
-             # TODO complain if len(queries) != len(responses)?
-             eval_inputs = []
-             for query, response in zip(queries, responses):
-                 context = {}
-                 if include_context:
-                     query_context = query.get("context", None)
-                     response_context = response.get("context", None)
-                     if global_context:
-                         context["global_context"] = global_context
-                     if query_context and include_query:
-                         context["query_context"] = query_context
-                     if response_context and include_response:
-                         context["response_context"] = response_context
-
-                 eval_input: DerivedEvalInput = {}
-                 if include_query:
-                     eval_input["query"] = query.get("content", "")
-                 if include_response:
-                     eval_input["response"] = response.get("content", "")
-                 if include_context:
-                     eval_input["context"] = str(context)
-                 eval_inputs.append(eval_input)
-             return eval_inputs
-
-         return converter
-
-     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
-         """Convert an arbitrary input into a list of inputs for evaluators.
-         It is assumed that evaluators generally make use of their inputs in one of two ways.
-         Either they receive a collection of keyname inputs that are all single values
-         (like a query and response), or they receive conversation that iss a list of dictionary
-         values.
-
-         The self._singleton_inputs list assigned during initialization is used to find and extract
-         singleton keywords, and self._allow_converssation_input is used to determine if a conversation
-         is a valid input.
-
-         If both conversations and singletons are allowed, the function will raise an exception if both
-         are inputted.
-
-         This function must be overridden by child classes IF they need to both a conversation and
-         other inputs to be passed in.
-
-         :keyword kwargs: The inputs to convert.
-         :type kwargs: Dict
-         :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
-         :rtype: List
-         """
-
-         # Collect inputs
-         conversation = kwargs.get("conversation", None)
-         singletons = {}
-         if len(self._singleton_inputs) > 0:
-             singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
-         # Check that both conversation and other inputs aren't set
-         if conversation is not None and any(singletons.values()):
-             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
-             raise EvaluationException(
-                 message=msg,
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.INVALID_VALUE,
-                 target=ErrorTarget.CONVERSATION,
-             )
-         # Handle Conversation
-         if conversation is not None:
-             return self._derive_conversation_converter()(conversation)
-         # Handle Singletons
-         required_singletons = remove_optional_singletons(self, singletons)
-         if all(value is not None for value in required_singletons.values()):
-             return [singletons]
-         # Missing input
-         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
-         raise EvaluationException(
-             message=msg,
-             blame=ErrorBlame.USER_ERROR,
-             category=ErrorCategory.INVALID_VALUE,
-             target=ErrorTarget.CONVERSATION,
-         )
-
-     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
-         """Aggregate the evaluation results of each conversation turn into a single result.
-
-         Exact implementation might need to vary slightly depending on the results produced.
-         Default behavior is to average the all number-based outputs.
-
-         :param per_turn_results: List of evaluation results for each turn in the conversation.
-         :type per_turn_results: List[Dict]
-         :return: A dictionary containing aggregated results, with numeric metrics having their
-             means as top-level values in the dictionary, and all original
-             values (including non-numerics) located in under the "evaluation_per_turn" key,
-             which each sub-key being a metric and each sub-value being a the list of that metric's
-             per-turn values.
-         :rtype: AggregateResult[T_EvalValue]
-         """
-
-         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
-         evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
-
-         # Go over each turn, and rotate the results into a
-         # metric: List[values] format for the evals_per_turn dictionary.
-         for turn in per_turn_results:
-             for metric, value in turn.items():
-                 if metric not in evaluation_per_turn:
-                     evaluation_per_turn[metric] = []
-                 evaluation_per_turn[metric].append(value)
-
-         # Find and average all numeric values
-         for metric, values in evaluation_per_turn.items():
-             if all(isinstance(value, (int, float)) for value in values):
-                 aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
-         # Slap the per-turn results back in.
-         aggregated["evaluation_per_turn"] = evaluation_per_turn
-         return aggregated
-
-     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
-         """The asynchronous call where real end-to-end evaluation logic is performed.
-
-         :keyword kwargs: The inputs to evaluate.
-         :type kwargs: Dict
-         :return: The evaluation result.
-         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
-         """
-         # Convert inputs into list of evaluable inputs.
-         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
-         per_turn_results = []
-         # Evaluate all inputs.
-         for eval_input in eval_input_list:
-             per_turn_results.append(await self._do_eval(eval_input))
-         # Return results as-is if only one result was produced.
-
-         if len(per_turn_results) == 1:
-             return per_turn_results[0]
-         if len(per_turn_results) == 0:
-             return {} # TODO raise something?
-         # Otherwise, aggregate results.
-         return self._aggregate_results(per_turn_results=per_turn_results)
-
-     @final
-     def _to_async(self) -> "AsyncEvaluatorBase":
-         return self._async_evaluator
-
-
- class AsyncEvaluatorBase:
-     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
-     to ensure that no one ever needs to extend or otherwise modify this class directly.
-     """
-
-     def __init__(self, real_call): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
-         self._real_call = real_call
-
-     # Don't look at my shame. Nothing to see here....
-     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-     # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
-     # are just not passed into this function instead of ending up in kwargs.
-     # Since we want this to be relatively call-agnostic, we just account for every input that any children
-     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-     async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
-         if conversation is not None:
-             kwargs["conversation"] = conversation
-         if query is not None:
-             kwargs["query"] = query
-         if response is not None:
-             kwargs["response"] = response
-         if context is not None:
-             kwargs["context"] = context
-         return await self._real_call(**kwargs)
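A minimal sketch of the subclassing contract described in the docstrings above: implement an async _do_eval and declare typed __call__ overloads so _derive_singleton_inputs can discover the single-value inputs. The evaluator name and scoring rule here are invented for illustration:

from typing import Any, Dict
from typing_extensions import overload, override

# Import path matches the package removed in this diff (present in 1.0.0).
from azure.ai.evaluation._evaluators._common import EvaluatorBase

class ResponseLengthEvaluator(EvaluatorBase[float]):
    """Toy evaluator: scores a response by its character length."""

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        response = eval_input.get("response") or ""
        return {"response_length": float(len(response))}

    # The overloads advertise the two accepted input shapes; the singleton overload
    # is what _derive_singleton_inputs inspects to learn that "response" is expected.
    @overload
    def __call__(self, *, response: str) -> Dict[str, float]: ...
    @overload
    def __call__(self, *, conversation: Dict) -> Dict[str, Any]: ...

    @override
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)

# Single-turn: ResponseLengthEvaluator()(response="hello") -> {"response_length": 5.0}
# Conversation input aggregates per-turn scores under "evaluation_per_turn".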
--- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ /dev/null
@@ -1,88 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- import math
- import re
- from typing import Dict, TypeVar, Union
-
- from promptflow.core import AsyncPrompty
- from typing_extensions import override
-
- from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
- from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
- from . import EvaluatorBase
-
- try:
-     from ..._user_agent import USER_AGENT
- except ImportError:
-     USER_AGENT = "None"
-
- T = TypeVar("T")
-
-
- class PromptyEvaluatorBase(EvaluatorBase[T]):
-     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
-     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
-     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
-     per-turn results are stored in a list under the key "evaluation_per_turn").
-
-     :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
-         a dictionary in the format {result_key: float}.
-     :type result_key: str
-     :param prompty_file: The path to the prompty file to use for evaluation.
-     :type prompty_file: str
-     :param model_config: The model configuration to use for evaluation.
-     :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
-     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
-         Useful since some evaluators of this format are response-only.
-     :type ignore_queries: bool
-     """
-
-     _LLM_CALL_TIMEOUT = 600
-     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-     def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
-         self._result_key = result_key
-         self._prompty_file = prompty_file
-         super().__init__(eval_last_turn=eval_last_turn)
-
-         prompty_model_config = construct_prompty_model_config(
-             validate_model_config(model_config),
-             self._DEFAULT_OPEN_API_VERSION,
-             USER_AGENT,
-         )
-
-         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
-
-     # __call__ not overridden here because child classes have such varied signatures that there's no point
-     # defining a default here.
-
-     @override
-     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
-         """Do a relevance evaluation.
-
-         :param eval_input: The input to the evaluator. Expected to contain
-         whatever inputs are needed for the _flow method, including context
-         and other fields depending on the child class.
-         :type eval_input: Dict
-         :return: The evaluation result.
-         :rtype: Dict
-         """
-         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
-         score = math.nan
-         if llm_output:
-             # Parse out score and reason from evaluators known to possess them.
-             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
-                 score, reason = parse_quality_evaluator_reason_score(llm_output)
-                 return {
-                     self._result_key: float(score),
-                     f"gpt_{self._result_key}": float(score),
-                     f"{self._result_key}_reason": reason,
-                 }
-             match = re.search(r"\d", llm_output)
-             if match:
-                 score = float(match.group())
-             return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
-         return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
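A minimal sketch of a prompty-backed subclass built on the base class above; the evaluator name, the friendliness.prompty file, and the 1-5 scale are assumptions for illustration, and model_config is expected to be a valid Azure OpenAI or OpenAI model configuration dict:

import os
from typing_extensions import override

# Import path matches the package removed in this diff (present in 1.0.0).
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

class FriendlinessEvaluator(PromptyEvaluatorBase[float]):
    """Toy evaluator: asks a hypothetical friendliness.prompty to grade a response 1-5."""

    _PROMPTY_FILE = "friendliness.prompty"  # hypothetical prompty stored next to this module
    _RESULT_KEY = "friendliness"

    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self._PROMPTY_FILE)
        super().__init__(
            result_key=self._RESULT_KEY,
            prompty_file=prompty_path,
            model_config=model_config,
        )

    @override
    def __call__(self, *, query: str, response: str):
        """Score how friendly the response to the query is, on the prompty's scale."""
        # The base _do_eval parses the first digit out of the LLM output and returns
        # {"friendliness": score, "gpt_friendliness": score}.
        return super().__call__(query=query, response=response)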