azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
@@ -0,0 +1,179 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from typing import List, Union, Dict
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+
+
+ @experimental
+ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+     """
+     Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+     including reasoning.
+
+     The groundedness measure calls Azure AI Evaluation service to assess how well the AI-generated answer is grounded
+     in the source context. Even if the responses from LLM are factually correct, they'll be considered ungrounded if
+     they can't be verified against the provided sources (such as your input source or your database).
+
+     Service-based groundedness scores are boolean values, where True indicates that the response is grounded.
+
+     :param credential: The credential for connecting to Azure AI project. Required
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
+     :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
+     :type threshold: int
+     :param kwargs: Additional arguments to pass to the evaluator.
+     :type kwargs: Any
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START groundedness_pro_evaluator]
+             :end-before: [END groundedness_pro_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START groundedness_pro_evaluator]
+             :end-before: [END groundedness_pro_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. admonition:: Example with threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_groundedness_pro_evaluator]
+             :end-before: [END threshold_groundedness_pro_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with a specified threshold and call GroundednessProEvaluator with a query, response, and context.
+
+     .. note::
+
+         If this evaluator is supplied to the `evaluate` function, the aggregated metric
+         for the groundedness pro label will be "groundedness_pro_passing_rate".
+     """
+
+     id = "azureai://built-in/evaluators/groundedness_pro"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+     _OPTIONAL_PARAMS = ["query"]
+
+     @override
+     def __init__(
+         self,
+         credential,
+         azure_ai_project,
+         *,
+         threshold: int = 5,
+         **kwargs,
+     ):
+         self.threshold = threshold
+         self._higher_is_better = True
+         self._output_prefix = "groundedness_pro"
+         super().__init__(
+             eval_metric=EvaluationMetrics.GROUNDEDNESS,
+             azure_ai_project=azure_ai_project,
+             credential=credential,
+             threshold=self.threshold,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         response: str,
+         context: str,
+         query: str,
+     ) -> Dict[str, Union[str, bool]]:
+         """Evaluate groundedness for a given query/response/context
+
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword context: The context to be evaluated.
+         :paramtype context: str
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :return: The relevance score.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+         """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has
+         more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+         available in the output under the "evaluation_per_turn" key.
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The relevance score.
+         :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a
+         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn, with the per-turn results available
+         in the output under the "evaluation_per_turn" key.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword context: The context to be evaluated.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The relevance score.
+         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict):
+         """This evaluator has some unique post-processing that requires data that
+         the rai_service script is not currently built to handle. So we post-post-process
+         the result here to message it into the right form.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         result = await super()._do_eval(eval_input)
+         real_result = {}
+         real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+         real_result[self._output_prefix + "_label"] = (
+             result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.threshold
+         )
+         if self._higher_is_better:
+             real_result[self._output_prefix + "_score"] = max(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 0)
+         else:
+             real_result[self._output_prefix + "_score"] = min(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 1)
+
+         return real_result
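For orientation, a minimal usage sketch of the new GroundednessProEvaluator added above. This is editor-added and not part of the diff: the project endpoint URL, credential setup, and sample strings are placeholder assumptions, while the constructor and call signatures and the output keys follow the code shown in the hunk.

    # Sketch only; the endpoint URL and credential setup below are placeholders.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    groundedness_pro = GroundednessProEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
        threshold=5,  # scores at or above this threshold map to groundedness_pro_label=True
    )

    result = groundedness_pro(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )
    # Per the _do_eval mapping above, expect groundedness_pro_label (bool),
    # groundedness_pro_reason, and groundedness_pro_score in the result.
    print(result)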
azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -3,108 +3,124 @@
  # ---------------------------------------------------------

  import os
- import re
+ from typing import Dict

- import numpy as np
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from promptflow.core import AsyncPrompty
+ from typing_extensions import overload, override

- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

- from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config

- try:
-     from ..._user_agent import USER_AGENT
- except ImportError:
-     USER_AGENT = None
-
-
- class _AsyncSimilarityEvaluator:
-     # Constants must be defined within eval's directory to be save/loadable
-     PROMPTY_FILE = "similarity.prompty"
-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-     def __init__(self, model_config: dict):
-         ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-         # https://github.com/encode/httpx/discussions/2959
-         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-         ensure_user_agent_in_aoai_model_config(
-             model_config,
-             prompty_model_config,
-             USER_AGENT,
-         )
-
-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-         # Validate input parameters
-         query = str(query or "")
-         response = str(response or "")
-         ground_truth = str(ground_truth or "")
-
-         if not (query.strip() and response.strip() and ground_truth.strip()):
-             msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 error_category=ErrorCategory.MISSING_FIELD,
-                 error_blame=ErrorBlame.USER_ERROR,
-                 error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-             )
-
-         # Run the evaluation flow
-         llm_output = await self._flow(
-             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-         )
-
-         score = np.nan
-         if llm_output:
-             match = re.search(r"\d", llm_output)
-             if match:
-                 score = float(match.group())
+ class SimilarityEvaluator(PromptyEvaluatorBase):
+     """
+     Evaluates similarity score for a given query, response, and ground truth.

-         return {"gpt_similarity": float(score)}
+     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+     the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+     the semantic meaning and context of the sentences.

+     Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+     tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+     text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.

- class SimilarityEvaluator:
-     """
-     Initialize a similarity evaluator configured for a specific Azure OpenAI model.
+     Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.

      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the similarity evaluator. Default is 3.
+     :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START similarity_evaluator]
+             :end-before: [END similarity_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START similarity_evaluator]
+             :end-before: [END similarity_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_similarity_evaluator]
+             :end-before: [END threshold_similarity_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with a threshold and call a SimilarityEvaluator.
+
+     .. note::
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     """

-     **Usage**
-
-     .. code-block:: python
+     # Constants must be defined within eval's directory to be save/loadable

-         eval_fn = SimilarityEvaluator(model_config)
-         result = eval_fn(
-             query="What is the capital of Japan?",
-             response="The capital of Japan is Tokyo.",
-             ground_truth="Tokyo is Japan's capital.")
+     _PROMPTY_FILE = "similarity.prompty"
+     _RESULT_KEY = "similarity"

-     **Output format**
+     id = "azureai://built-in/evaluators/similarity"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-     .. code-block:: python
+     @override
+     def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=self._higher_is_better,
+             **kwargs,
+         )

-         {
-             "gpt_similarity": 3.0
-         }
-     """
+     # Ignoring a mypy error about having only 1 overload function.
+     # We want to use the overload style for all evals, even single-inputs. This is both to make
+     # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+     # and due to the fact that non-overloaded syntax now causes various parsing issues that
+     # we don't want to deal with.
+     @overload  # type: ignore
+     def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
+         """
+         Evaluate similarity.

-     def __init__(self, model_config: dict):
-         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be evaluated.
+         :paramtype ground_truth: str
+         :return: The similarity score.
+         :rtype: Dict[str, float]
+         """

-     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
          """
          Evaluate similarity.

@@ -115,11 +131,6 @@ class SimilarityEvaluator:
          :keyword ground_truth: The ground truth to be evaluated.
          :paramtype ground_truth: str
          :return: The similarity score.
-         :rtype: dict
+         :rtype: Dict[str, float]
          """
-         return async_run_allowing_running_loop(
-             self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-         )
-
-     def _to_async(self):
-         return self._async_evaluator
+         return super().__call__(*args, **kwargs)
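A short, editor-added usage sketch for the reworked SimilarityEvaluator (not part of the diff). The endpoint, key, and deployment values are placeholder assumptions; the query/response/ground_truth example values are taken from the removed docstring, and the expected output keys follow the note in the new docstring above.

    # Sketch only; model_config values below are placeholders.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        api_key="<api-key>",
        azure_deployment="<deployment-name>",
    )

    similarity = SimilarityEvaluator(model_config, threshold=3)
    result = similarity(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Per the docstring note above, expect both the new "similarity" key and the
    # legacy "gpt_similarity" key in the output.
    print(result)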
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty
@@ -3,11 +3,6 @@ name: Similarity
  description: Evaluates similarity score for QA scenario
  model:
    api: chat
-   configuration:
-     type: azure_openai
-     azure_deployment: ${env:AZURE_DEPLOYMENT}
-     api_key: ${env:AZURE_OPENAI_API_KEY}
-     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
      max_tokens: 1
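With the hard-coded azure_openai configuration block removed from similarity.prompty, the model settings are now supplied at runtime through the evaluator's model_config. As an editor-added sketch (an assumption, not prescribed by the package), the same values could still be read from the environment variables the removed block used and passed in explicitly:

    # Sketch: rebuild the configuration the prompty used to read from env vars,
    # then pass it as model_config (see the SimilarityEvaluator example above).
    import os

    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_DEPLOYMENT"],
    }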
azure/ai/evaluation/_evaluators/_task_adherence/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._task_adherence import TaskAdherenceEvaluator
+
+ __all__ = ["TaskAdherenceEvaluator"]
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -0,0 +1,226 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import math
+ import logging
+ from typing import Dict, Union, List, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from ..._common.utils import (
+     reformat_conversation_history,
+     reformat_agent_response,
+ )
+ from azure.ai.evaluation._model_configurations import Message
+ from azure.ai.evaluation._common._experimental import experimental
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """The Task Adherence evaluator assesses whether an AI assistant's actions fully align with the user's intent
+     and fully achieve the intended goal across three dimensions:
+
+     - Goal adherence: Did the assistant achieve the user's objective within scope and constraints?
+     - Rule adherence: Did the assistant respect safety, privacy, authorization, and presentation contracts?
+     - Procedural adherence: Did the assistant follow required workflows, tool use, sequencing, and verification?
+
+     The evaluator returns a boolean flag indicating whether there was any material failure in any dimension.
+     A material failure is an issue that makes the output unusable, creates verifiable risk, violates an explicit
+     constraint, or is a critical issue as defined in the evaluation dimensions.
+
+     The evaluation includes step-by-step reasoning and a flagged boolean result.
+
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START task_adherence_evaluator]
+             :end-before: [END task_adherence_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START task_adherence_evaluator]
+             :end-before: [END task_adherence_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     """
+
+     _PROMPTY_FILE = "task_adherence.prompty"
+     _RESULT_KEY = "task_adherence"
+     _OPTIONAL_PARAMS = []
+
+     _DEFAULT_TASK_ADHERENCE_SCORE = 0
+
+     id = "azureai://built-in/evaluators/task_adherence"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold  # to be removed in favor of _threshold
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             credential=credential,
+             _higher_is_better=True,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: Union[str, List[dict]],
+         response: Union[str, List[dict]],
+         tool_definitions: Optional[Union[dict, List[dict]]] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate task adherence for a given query and response.
+         The query and response must be lists of messages in conversation format.
+
+
+         Example with list of messages:
+         evaluator = TaskAdherenceEvaluator(model_config)
+         query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+         response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+
+         result = evaluator(query=query, response=response)
+
+         :keyword query: The query being evaluated, must be a list of messages including system and user messages.
+         :paramtype query: Union[str, List[dict]]
+         :keyword response: The response being evaluated, must be a list of messages (full agent response including tool calls and results)
+         :paramtype response: Union[str, List[dict]]
+         :return: A dictionary with the task adherence evaluation results including flagged (bool) and reasoning (str).
+         :rtype: Dict[str, Union[str, float, bool]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]:  # type: ignore[override]
+         """Do Task Adherence evaluation.
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary,
+         # which is a different schema than _base_prompty_eval.py
+         if "query" not in eval_input or "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
+                 internal_message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
+             )
+
+         # Reformat conversation history and extract system message
+         query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+         system_message = ""
+         user_query = ""
+
+         # Parse query messages to extract system message and user query
+         if isinstance(query_messages, list):
+             for msg in query_messages:
+                 if isinstance(msg, dict) and msg.get("role") == "system":
+                     system_message = msg.get("content", "")
+                 elif isinstance(msg, dict) and msg.get("role") == "user":
+                     user_query = msg.get("content", "")
+         elif isinstance(query_messages, str):
+             user_query = query_messages
+
+         # Reformat response and separate assistant messages from tool calls
+         response_messages = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+         assistant_response = ""
+         tool_calls = ""
+
+         # Parse response messages to extract assistant response and tool calls
+         if isinstance(response_messages, list):
+             assistant_parts = []
+             tool_parts = []
+             for msg in response_messages:
+                 if isinstance(msg, dict):
+                     role = msg.get("role", "")
+                     if role == "assistant":
+                         content = msg.get("content", "")
+                         if isinstance(content, list):
+                             for item in content:
+                                 if isinstance(item, dict):
+                                     if item.get("type") == "text":
+                                         assistant_parts.append(item.get("text", ""))
+                                     elif item.get("type") == "tool_call":
+                                         tool_parts.append(str(item.get("tool_call", "")))
+                         else:
+                             assistant_parts.append(str(content))
+                     elif role == "tool":
+                         tool_parts.append(str(msg))
+             assistant_response = "\n".join(assistant_parts)
+             tool_calls = "\n".join(tool_parts)
+         elif isinstance(response_messages, str):
+             assistant_response = response_messages
+
+         # Prepare inputs for prompty
+         prompty_input = {
+             "system_message": system_message,
+             "query": user_query,
+             "response": assistant_response,
+             "tool_calls": tool_calls,
+         }
+
+         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
+         llm_output = prompty_output_dict["llm_output"]
+
+         if isinstance(llm_output, dict):
+             flagged = llm_output.get("flagged", False)
+             reasoning = llm_output.get("reasoning", "")
+             # Convert flagged to numeric score for backward compatibility (1 = pass, 0 = fail)
+             score = 0.0 if flagged else 1.0
+             score_result = "fail" if flagged else "pass"
+
+             return {
+                 f"{self._result_key}": score,
+                 f"{self._result_key}_result": score_result,
+                 f"{self._result_key}_reason": reasoning,
+                 f"{self._result_key}_details": llm_output.get("details", ""),
+                 f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                 f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                 f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                 f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                 f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+             }
+
+         if logger:
+             logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+
+         return {self._result_key: 0}
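A condensed, editor-added usage sketch for the new TaskAdherenceEvaluator (not part of the diff). The message payloads below are shortened versions of the docstring example, and the model_config values are placeholder assumptions; the import path follows the __init__.py hunk above, and the expected output keys follow the _do_eval implementation.

    # Sketch only; model_config values are placeholders and messages are abbreviated.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration
    from azure.ai.evaluation._evaluators._task_adherence import TaskAdherenceEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        api_key="<api-key>",
        azure_deployment="<deployment-name>",
    )

    evaluator = TaskAdherenceEvaluator(model_config)

    query = [
        {"role": "system", "content": "You are a friendly and helpful customer service agent."},
        {"role": "user", "content": [{"type": "text", "text": "What is the status of order 123?"}]},
    ]
    response = [
        {"role": "assistant", "content": [{"type": "text", "text": "Order 123 has shipped and arrives March 15, 2025."}]},
    ]

    result = evaluator(query=query, response=response)
    # Per _do_eval above, expect task_adherence (1.0 pass / 0.0 fail), task_adherence_result,
    # task_adherence_reason, plus token-usage and model metadata keys.
    print(result)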