azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (277)
  1. azure/ai/evaluation/__init__.py +85 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +147 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +87 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  155. azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  156. azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  157. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  158. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  159. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  160. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  161. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  162. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  163. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  165. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
  264. azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -3,20 +3,52 @@
 # ---------------------------------------------------------
 
 import inspect
+import logging
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
-
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+import json
+import copy
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    Tuple,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
+
+from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
+
+import copy
 
 P = ParamSpec("P")
 T = TypeVar("T")
 T_EvalValue = TypeVar("T_EvalValue")
 
+logger = logging.getLogger(__name__)
+
 
 class DerivedEvalInput(TypedDict, total=False):
     """The eval input generated by EvaluatorBase._derive_conversation_starter."""
@@ -24,6 +56,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str
 
 
 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -68,8 +101,24 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
     """
 
+    _NOT_APPLICABLE_RESULT = "not applicable"
+    _PASS_RESULT = "pass"
+    _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"
+
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
     # Make sure to call super().__init__() in the child class's __init__ method.
@@ -77,13 +126,23 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     def __init__(
         self,
         *,
+        threshold: float = 3.0,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
        eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
+        _higher_is_better: Optional[bool] = True,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        self._higher_is_better = _higher_is_better
+        self._threshold = threshold
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -120,15 +179,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of strings representing the names of singleton inputs.
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -136,17 +195,70 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-        call_signature = inspect.signature(self.__call__)
-        singletons = []
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-            singletons.extend([p for p in params if p != "self"])
-        return singletons
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
-    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
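
As an illustration (not code from the package), the overload matching added above behaves like this: given an evaluator whose __call__ overloads accept (query, response) or (response, ground_truth), and a call that provides response and ground_truth, the second overload's parameter list is selected, because it is the largest overload whose parameters are all present.

    # Stand-alone sketch of the matching rule, with hypothetical overload signatures.
    overload_inputs = [["query", "response"], ["response", "ground_truth"]]
    provided = {"response": "4", "ground_truth": "four"}

    provided_keys = {k for k, v in provided.items() if v is not None}
    # First pass: overloads whose parameters are all provided; pick the largest such set.
    candidates = [inputs for inputs in overload_inputs if set(inputs).issubset(provided_keys)]
    best = max(candidates, key=len) if candidates else None
    assert best == ["response", "ground_truth"]
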
@@ -154,9 +266,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-        include_context = "context" in self._singleton_inputs
-        include_query = "query" in self._singleton_inputs
-        include_response = "response" in self._singleton_inputs
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -197,21 +311,78 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                     eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs
 
         return converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_converssation_input is used to determine if a conversation
-        is a valid input.
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -229,7 +400,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -241,11 +415,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-        # Handle Singletons
-        required_singletons = remove_optional_singletons(self, singletons)
-        if all(value is not None for value in required_singletons.values()):
-            return [singletons]
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -255,6 +437,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             target=ErrorTarget.CONVERSATION,
         )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
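
For reference, a conversation payload of the following shape (the values are made up) is what the new _is_multi_modal_conversation check treats as multi-modal, because a message's list-valued content contains an image_url item with a url:

    conversation = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
                ],
            },
            {"role": "assistant", "content": [{"type": "text", "text": "A cat sitting on a sofa."}]},
        ]
    }
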
@@ -285,11 +481,109 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+
+        # Work on a deep copy to avoid modifying the original object
+        response_copy = copy.deepcopy(response)
+
+        if isinstance(response_copy, list):
+            for message in response_copy:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(copy.deepcopy(content_item))
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
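
To make the expected message shape concrete, here is a hypothetical agent response that the new tool-parsing helpers would handle: the assistant message carries a "tool_call" content item, and a later "tool" message carries the matching "tool_result" keyed by tool_call_id, which gets attached to the parsed call.

    response = [
        {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call_id": "call_1",
                    "name": "get_weather",
                    "arguments": {"city": "Seattle"},
                }
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [{"type": "tool_result", "tool_result": "Rainy, 11 C"}],
        },
    ]
    # _parse_tools_from_response(response) returns the tool_call item with "tool_result" attached;
    # _extract_tool_names_and_params_from_response(response) returns [("get_weather", {"city": "Seattle"})].
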
@@ -299,11 +593,48 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
-        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        try:
+            eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        except Exception as e:
+            logger.error(f"Error converting kwargs to eval_input_list: {e}")
+            raise e
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-            per_turn_results.append(await self._do_eval(eval_input))
+            result = await self._do_eval(eval_input)
+            # logic to determine threshold pass/fail
+            try:
+                for key in list(result.keys()):
+                    if key.endswith("_score") and "rouge" not in key:
+                        score_value = result[key]
+                        base_key = key[:-6]  # Remove "_score" suffix
+                        result_key = f"{base_key}_result"
+                        threshold_key = f"{base_key}_threshold"
+                        threshold_value = (
+                            self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                        )
+                        if not isinstance(threshold_value, (int, float)):
+                            raise EvaluationException(
+                                "Threshold value must be a number.",
+                                internal_message=str(threshold_value),
+                                target=ErrorTarget.EVALUATE,
+                                category=ErrorCategory.INVALID_VALUE,
+                            )
+
+                        result[threshold_key] = threshold_value
+                        if self._higher_is_better:
+                            if float(score_value) >= threshold_value:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                        else:
+                            if float(score_value) <= threshold_value:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                            else:
+                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+            except Exception as e:
+                logger.warning(f"Error calculating binary result: {e}")
+            per_turn_results.append(result)
         # Return results as-is if only one result was produced.
 
         if len(per_turn_results) == 1:
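
The threshold logic above reduces to the following standalone sketch (the metric name and values are illustrative; EVALUATION_PASS_FAIL_MAPPING maps True/False to "pass"/"fail"):

    EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}

    result = {"relevance_score": 4.0}
    threshold, higher_is_better = 3, True

    for key in list(result.keys()):
        if key.endswith("_score") and "rouge" not in key:
            base_key = key[:-6]  # strip the "_score" suffix
            result[f"{base_key}_threshold"] = threshold
            passed = result[key] >= threshold if higher_is_better else result[key] <= threshold
            result[f"{base_key}_result"] = EVALUATION_PASS_FAIL_MAPPING[passed]

    # result -> {"relevance_score": 4.0, "relevance_threshold": 3, "relevance_result": "pass"}
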
@@ -313,10 +644,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -332,13 +704,42 @@ class AsyncEvaluatorBase:
    # are just not passed into this function instead of ending up in kwargs.
    # Since we want this to be relatively call-agnostic, we just account for every input that any children
    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self,
+        *,
+        query=None,
+        response=None,
+        context=None,
+        conversation=None,
+        ground_truth=None,
+        tool_calls=None,
+        tool_definitions=None,
+        messages=None,
+        retrieval_ground_truth=None,
+        retrieved_documents=None,
+        **kwargs,
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
             kwargs["query"] = query
         if response is not None:
             kwargs["response"] = response
+        if tool_definitions is not None:
+            kwargs["tool_definitions"] = tool_definitions
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
+        if tool_calls is not None:
+            kwargs["tool_calls"] = tool_calls
+        if tool_definitions is not None:
+            kwargs["tool_definitions"] = tool_definitions
+        if messages is not None:
+            kwargs["messages"] = messages
+        if retrieval_ground_truth is not None:
+            kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+        if retrieved_documents is not None:
+            kwargs["retrieved_documents"] = retrieved_documents
+
         return await self._real_call(**kwargs)
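
A brief, hedged usage sketch of the aggregation hooks introduced earlier in this file (these are experimental, underscore-prefixed methods, and the evaluator instance reuses the hypothetical class from the sketch above): per-turn scores of a conversation can be combined with something other than the default mean.

    from azure.ai.evaluation._constants import _AggregationType

    evaluator = LengthScoreEvaluator(threshold=3.0)  # hypothetical evaluator from the earlier sketch
    evaluator._set_conversation_aggregator(min)      # report the worst turn instead of the mean
    # evaluator._set_conversation_aggregation_type(_AggregationType.MAX)  # or a built-in type
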
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py (new file)

@@ -0,0 +1,63 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from concurrent.futures import as_completed
+from typing import TypeVar, Dict, List
+
+from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class MultiEvaluatorBase(EvaluatorBase[T]):
+    """
+    Base class for evaluators that contain and run multiple other evaluators to produce a
+    suite of metrics.
+
+    Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+    :param evaluators: The list of evaluators to run when this evaluator is called.
+    :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: An evaluator that runs multiple other evaluators and combines their results.
+    """
+
+    def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+        self._threshold = kwargs.pop("threshold", 3)
+        self._higher_is_better = kwargs.pop("_higher_is_better", False)
+        super().__init__(threshold=self._threshold, _higher_is_better=self._higher_is_better)
+        self._parallel = kwargs.pop("_parallel", True)
+        self._evaluators = evaluators
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Run each evaluator, possibly in parallel, and combine the results into
+        a single large dictionary containing each evaluation. Inputs are passed
+        directly to each evaluator without additional processing.
+
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        results: Dict[str, T] = {}
+        if self._parallel:
+            with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
+                futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                for future in as_completed(futures):
+                    results.update(future.result())
+        else:
+            for evaluator in self._evaluators:
+                result = evaluator(**eval_input)
+                # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                # that's going on to shove evaluators around like this.
+                results.update(result)  # type: ignore[arg-type]
+
+        return results
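
To show how the new base class is meant to be composed, here is a hypothetical suite evaluator. The wrapper class is illustrative only; ViolenceEvaluator and SexualEvaluator are real evaluators in this package, but the exact constructor arguments shown are an assumption.

    from azure.ai.evaluation import ViolenceEvaluator, SexualEvaluator
    from azure.ai.evaluation._evaluators._common._base_multi_eval import MultiEvaluatorBase

    class SafetyPairEvaluator(MultiEvaluatorBase[str]):
        """Hypothetical suite that runs two content-safety evaluators and merges their outputs."""

        def __init__(self, credential, azure_ai_project, **kwargs):
            evaluators = [
                ViolenceEvaluator(credential, azure_ai_project),
                SexualEvaluator(credential, azure_ai_project),
            ]
            super().__init__(evaluators=evaluators, **kwargs)

        def __call__(self, *, query: str, response: str, **kwargs):
            # Each child evaluator runs (in parallel by default) and the result dicts are merged.
            return super().__call__(query=query, response=response, **kwargs)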