azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from concurrent.futures import as_completed
+from typing import Union
 
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 
 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -14,55 +16,103 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator
 
 
-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :return: A function that evaluates and generates metrics for "question-answering" scenario.
-    :rtype: Callable
-
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = QAEvaluator(model_config)
-        result = qa_eval(
-            query="Tokyo is the capital of which country?",
-            response="Japan",
-            context="Tokyo is the capital of Japan.",
-            ground_truth="Japan"
-        )
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gpt_groundedness": 3.5,
-            "gpt_relevance": 4.0,
-            "gpt_coherence": 1.5,
-            "gpt_fluency": 4.0,
-            "gpt_similarity": 3.0,
-            "f1_score": 0.42
-        }
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
+    :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a QAEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old keys with the `gpt_` prefix are still be present in the output;
+        however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
     """
 
-    def __init__(self, model_config: dict, parallel: bool = True):
-        self._parallel = parallel
-
-        self._evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-            SimilarityEvaluator(model_config),
-            F1ScoreEvaluator(),
+    id = "azureai://built-in/evaluators/qa"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs,
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
+        evaluators = [
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
 
-    def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
         """
         Evaluates question-answering scenario.
 
@@ -74,27 +124,29 @@ class QAEvaluator:
         :paramtype context: str
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
-        :keyword parallel: Whether to evaluate in parallel. Defaults to True.
-        :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype: dict
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
-        results = {}
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+        return super().__call__(*args, **kwargs)
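
The new signature above replaces the old `parallel` flag with keyword-only, per-metric thresholds, and delegates execution to `MultiEvaluatorBase`. A minimal sketch of how the 1.13.x `QAEvaluator` might be driven, assuming a placeholder `AzureOpenAIModelConfiguration` (the endpoint, key, and deployment below are illustrative, not real values):

from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

# Hypothetical connection details; substitute your own Azure OpenAI resource.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

# Thresholds are keyword-only in the signature shown in the diff.
qa_eval = QAEvaluator(model_config, groundedness_threshold=3, f1_score_threshold=0.5)

result = qa_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
    context="Tokyo is the capital of Japan.",
    ground_truth="Japan",
)
# Per the docstring note, the output carries both the legacy "gpt_"-prefixed keys
# and the new unprefixed keys (e.g. "gpt_relevance" alongside "relevance").
print(result)
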
@@ -1,126 +1,210 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import logging
+import math
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from typing import Dict, Union, List
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
+from typing_extensions import overload, override
 
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import reformat_conversation_history, reformat_agent_response
 
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+logger = logging.getLogger(__name__)
 
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
 
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
+class RelevanceEvaluator(PromptyEvaluatorBase):
+    """
+    Evaluates relevance score for a given query and response or a multi-turn conversation, including reasoning.
 
+    The relevance measure assesses the ability of answers to capture the key points of the context.
+    High relevance scores signify the AI system's understanding of the input and its capability to produce coherent
+    and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might
+    be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Use the relevance
+    metric when evaluating the AI system's performance in understanding the input and generating contextually
+    appropriate responses.
 
-class RelevanceEvaluator:
-    """
-    Initialize a relevance evaluator configured for a specific Azure OpenAI model.
+    Relevance scores range from 1 to 5, with 1 being the worst and 5 being the best.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
 
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = RelevanceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.",
-            context="Tokyo is Japan's capital, known for its blend of traditional culture \
-            and technological advancements.")
-
-    **Output format**
-
-    .. code-block:: python
+    # Constants must be defined within eval's directory to be save/loadable
+    _PROMPTY_FILE = "relevance.prompty"
+    _RESULT_KEY = "relevance"
 
-        {
-            "gpt_relevance": 3.0
-        }
-    """
+    id = "azureai://built-in/evaluators/relevance"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncRelevanceEvaluator(model_config)
+    @override
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
 
-    def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        """
-        Evaluate relevance.
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of query, response, context
 
         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :keyword context: The context to be evaluated.
-        :paramtype context: str
         :return: The relevance score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
 
-    def _to_async(self):
-        return self._async_evaluator
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate relevance for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate relevance. Accepts either a query and response for a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
+        """
+        return super().__call__(*args, **kwargs)
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        if not isinstance(eval_input["query"], str):
+            eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        if not isinstance(eval_input["response"], str):
+            eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output")
+        score = math.nan
+
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
+            reason = llm_output.get("explanation", "")
+            # Parse out score and reason from evaluators known to possess them.
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
+            }
+
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
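
The rewritten evaluator drops the `context` keyword from the single-turn call, adds a conversation mode, and reports threshold and reasoning fields alongside the score. A hedged sketch of both call shapes, again with placeholder Azure OpenAI settings (endpoint, key, and deployment are illustrative only):

from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

# Hypothetical connection details; substitute your own Azure OpenAI resource.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

relevance_eval = RelevanceEvaluator(model_config, threshold=3)

# Single-turn call, matching the first __call__ overload (query/response only).
single = relevance_eval(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
# Per _do_eval above, the result should include "relevance", "gpt_relevance",
# "relevance_result", "relevance_threshold", and "relevance_reason".

# Multi-turn call, matching the conversation overload: a dict with a "messages" list.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
aggregated = relevance_eval(conversation=conversation)
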