azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +85 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +147 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +87 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  155. azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  156. azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  157. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  158. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  159. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  160. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  161. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  162. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  163. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  165. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
  264. azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -1,38 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
+
 from nltk.translate.meteor_score import meteor_score
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


-class _AsyncMeteorScoreEvaluator:
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
-
-        ensure_nltk_data_downloaded()
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
-
-        return {
-            "meteor_score": score,
-        }
-
-
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
     Calculates the METEOR score for a given response and ground truth.

@@ -54,6 +33,8 @@ class MeteorScoreEvaluator:
     :type beta: float
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
+    :param threshold: The threshold for the METEOR score evaluator. Default is 0.5.
+    :type threshold: float

     .. admonition:: Example:

@@ -63,15 +44,75 @@ class MeteorScoreEvaluator:
             :language: python
             :dedent: 8
             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_meteor_score_evaluator]
+            :end-before: [END threshold_meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a MeteorScoreEvaluator.
     """

-    id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/meteor_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
+    @override
+    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, *, threshold: float = 0.5):
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
+        )
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "meteor_score": score,
+            "meteor_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "meteor_threshold": self._threshold,
+        }

-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.

@@ -82,9 +123,21 @@ class MeteorScoreEvaluator:
         :return: The METEOR score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
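Side note on the meteor rewrite above: the promptflow-based async wrapper is gone, and pass/fail handling now flows through EvaluatorBase and _do_eval. A minimal usage sketch, based only on the signature and result keys visible in this diff (the sample strings are illustrative):

    from azure.ai.evaluation import MeteorScoreEvaluator

    # threshold is the new keyword-only argument (default 0.5); alpha/beta/gamma keep their old defaults.
    meteor = MeteorScoreEvaluator(alpha=0.8, threshold=0.6)

    result = meteor(
        ground_truth="The capital of France is Paris.",
        response="Paris is the capital of France.",
    )

    # _do_eval adds a pass/fail label and echoes the threshold alongside the raw score.
    print(result["meteor_score"])      # float METEOR score
    print(result["meteor_result"])     # label from EVALUATION_PASS_FAIL_MAPPING
    print(result["meteor_threshold"])  # 0.6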
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

@@ -25,9 +25,9 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

     :param credential: The credential required for connecting to the Azure AI project.
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
-        resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]

     .. admonition:: Example:

@@ -37,21 +37,39 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """

-    id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/protected_material"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )

     @overload
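The constructor change above means azure_ai_project can now be a project endpoint string. A short sketch under that assumption (the endpoint and inputs are placeholders; per the __init__ shown above, evaluate_query defaults to True unless overridden via kwargs):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    evaluator = ProtectedMaterialEvaluator(
        credential=DefaultAzureCredential(),
        # Either an AzureAIProject dict or, as of this release, the project endpoint string.
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )

    result = evaluator(
        query="Share the full lyrics of a copyrighted song.",
        response="I can't reproduce copyrighted lyrics, but here is a short summary instead.",
    )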
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Union

-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase

 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -15,13 +16,25 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator


-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.

     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -35,6 +48,25 @@ class QAEvaluator:
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.

+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
     .. note::

         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
@@ -42,22 +74,66 @@ class QAEvaluator:
        however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
     """

-    id = "qa"
+    id = "azureai://built-in/evaluators/qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-    def __init__(self, model_config, **kwargs):
-        self._parallel = kwargs.pop("_parallel", False)
-
-        self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-            SimilarityEvaluator(model_config),
-            F1ScoreEvaluator(),
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs,
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
+        evaluators = [
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
+
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
+        """
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """

-    def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs):
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluates question-answering scenario.

@@ -72,22 +148,5 @@ class QAEvaluator:
         :return: The scores for QA scenario.
         :rtype: Dict[str, Union[str, float]]
         """
-        results: Dict[str, Union[str, float]] = {}
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+
+        return super().__call__(*args, **kwargs)
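For the QAEvaluator rewrite above, the per-metric thresholds are plain keyword arguments forwarded to the sub-evaluators. A sketch assuming an AzureOpenAIModelConfiguration with placeholder endpoint and deployment values:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )

    # Each threshold is passed through to the corresponding sub-evaluator shown in __init__ above.
    qa = QAEvaluator(
        model_config,
        groundedness_threshold=4,
        relevance_threshold=4,
        f1_score_threshold=0.6,
    )

    scores = qa(
        query="Where is the Eiffel Tower?",
        response="The Eiffel Tower is in Paris.",
        context="The Eiffel Tower is a wrought-iron tower in Paris, France.",
        ground_truth="The Eiffel Tower is located in Paris.",
    )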
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -1,15 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import logging
+import math
 import os
 from typing import Dict, Union, List

 from typing_extensions import overload, override

+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import reformat_conversation_history, reformat_agent_response
+
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

+logger = logging.getLogger(__name__)
+

 class RelevanceEvaluator(PromptyEvaluatorBase):
     """
@@ -27,6 +33,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

     .. admonition:: Example:

@@ -37,6 +50,25 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.

+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
     .. note::

         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -48,14 +80,22 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     _PROMPTY_FILE = "relevance.prompty"
     _RESULT_KEY = "relevance"

-    id = "azureml://registries/azureml/models/Relevance-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/relevance"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -112,3 +152,59 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        if not isinstance(eval_input["query"], str):
+            eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        if not isinstance(eval_input["response"], str):
+            eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output")
+        score = math.nan
+
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
+            reason = llm_output.get("explanation", "")
+            # Parse out score and reason from evaluators known to possess them.
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
+            }
+
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
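To close out the relevance changes, a minimal calling sketch; the required inputs follow the _do_eval shown above (query and response), the model configuration values are placeholders, and the extra diagnostic keys come from the new return dictionary:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )

    # threshold is keyword-only; credential is optional and only needed for Azure AD auth.
    relevance = RelevanceEvaluator(model_config, threshold=4)

    result = relevance(
        query="What does the METEOR metric measure?",
        response="METEOR scores candidate text against references using unigram matching with stemming and synonyms.",
    )

    # Alongside result["relevance"], the new output includes e.g. relevance_result, relevance_threshold,
    # relevance_reason, relevance_prompt_tokens, relevance_total_tokens, and relevance_model.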