azure-ai-evaluation 1.13.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (422)
  1. azure_ai_evaluation-1.13.4/CHANGELOG.md +527 -0
  2. azure_ai_evaluation-1.13.4/MANIFEST.in +8 -0
  3. azure_ai_evaluation-1.13.4/NOTICE.txt +70 -0
  4. azure_ai_evaluation-1.13.4/PKG-INFO +945 -0
  5. azure_ai_evaluation-1.13.4/README.md +358 -0
  6. azure_ai_evaluation-1.13.4/TROUBLESHOOTING.md +95 -0
  7. azure_ai_evaluation-1.13.4/azure/__init__.py +5 -0
  8. azure_ai_evaluation-1.13.4/azure/ai/__init__.py +5 -0
  9. azure_ai_evaluation-1.13.4/azure/ai/evaluation/__init__.py +153 -0
  10. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/__init__.py +10 -0
  11. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  12. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/label_grader.py +68 -0
  13. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/python_grader.py +86 -0
  14. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  15. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  16. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  17. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_azure/__init__.py +3 -0
  18. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_azure/_clients.py +204 -0
  19. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_azure/_envs.py +207 -0
  20. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_azure/_models.py +227 -0
  21. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_azure/_token_manager.py +129 -0
  22. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/__init__.py +24 -0
  23. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/_experimental.py +172 -0
  24. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/constants.py +194 -0
  25. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  26. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/math.py +89 -0
  27. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  28. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_client.py +166 -0
  29. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  30. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  31. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  32. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  33. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_types.py +21 -0
  34. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  35. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  36. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  37. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  38. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  39. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/_version.py +9 -0
  40. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  41. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  42. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  43. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  44. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  45. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  46. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  47. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  48. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  49. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  50. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  51. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  52. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  53. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  54. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/py.typed +1 -0
  55. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  56. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  57. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  58. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  59. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  60. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  61. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  62. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  63. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  64. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  65. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  66. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  67. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  68. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  69. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  70. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  71. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/rai_service.py +1141 -0
  72. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  73. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  74. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  75. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  76. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  77. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  78. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  79. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  80. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  81. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  82. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  83. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  84. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  85. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  86. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  87. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  88. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  89. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  90. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  91. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  92. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  93. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  94. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_common/utils.py +923 -0
  95. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_constants.py +219 -0
  96. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_converters/__init__.py +3 -0
  97. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_converters/_ai_services.py +899 -0
  98. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_converters/_models.py +467 -0
  99. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_converters/_sk_services.py +495 -0
  100. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_eval_mapping.py +87 -0
  101. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/__init__.py +3 -0
  102. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  103. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  104. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  105. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +194 -0
  106. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +102 -0
  107. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  108. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  109. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_eval_run.py +557 -0
  110. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_evaluate.py +2519 -0
  111. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  112. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +94 -0
  113. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluate/_utils.py +493 -0
  114. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluator_definition.py +76 -0
  115. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/__init__.py +3 -0
  116. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  117. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +124 -0
  118. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  119. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  120. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  121. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +143 -0
  122. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  123. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  124. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  125. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  126. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  127. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  128. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  129. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  130. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +164 -0
  131. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +159 -0
  132. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +144 -0
  133. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +155 -0
  134. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +156 -0
  135. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  136. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  137. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  138. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_eci/_eci.py +92 -0
  139. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  140. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +184 -0
  141. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  142. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +140 -0
  143. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  144. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  145. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +124 -0
  146. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  147. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +354 -0
  148. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  149. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  150. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  151. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  152. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  153. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  154. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +143 -0
  155. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  156. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +131 -0
  157. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  158. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_qa/_qa.py +152 -0
  159. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  160. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +210 -0
  161. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +198 -0
  162. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  163. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  164. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  165. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  166. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  167. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  168. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  169. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +238 -0
  170. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  171. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  172. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  173. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +136 -0
  174. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  175. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  176. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  177. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  178. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  179. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  180. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  181. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  182. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  183. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  184. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  185. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  186. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  187. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  188. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  189. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  190. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  191. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  192. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  193. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  194. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  195. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  196. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  197. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  198. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  199. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  200. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  201. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_evaluators/_xpia/xpia.py +141 -0
  202. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_exceptions.py +151 -0
  203. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_http_utils.py +468 -0
  204. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/__init__.py +3 -0
  205. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  206. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  207. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  208. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  209. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  210. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  211. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  212. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  213. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  214. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  215. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  216. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  217. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  218. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  219. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  220. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  221. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  222. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  223. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  224. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  225. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  226. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  227. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  228. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  229. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  230. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  231. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  232. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  233. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  234. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  235. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  236. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  237. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  238. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  239. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  240. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_model_configurations.py +149 -0
  241. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  242. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  243. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  244. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_user_agent.py +37 -0
  245. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/__init__.py +3 -0
  246. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  247. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  248. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  249. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  250. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  251. azure_ai_evaluation-1.13.4/azure/ai/evaluation/_version.py +6 -0
  252. azure_ai_evaluation-1.13.4/azure/ai/evaluation/py.typed +0 -0
  253. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/__init__.py +22 -0
  254. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  255. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  256. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  257. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  258. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  259. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  260. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  261. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  262. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_default_converter.py +21 -0
  263. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  264. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  265. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  266. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_red_team.py +1717 -0
  267. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  268. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  269. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  270. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  271. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  272. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  273. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  274. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  275. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  276. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  277. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  278. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  279. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  280. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  281. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  282. azure_ai_evaluation-1.13.4/azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  283. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/__init__.py +16 -0
  284. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_adversarial_scenario.py +52 -0
  285. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_adversarial_simulator.py +578 -0
  286. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_constants.py +28 -0
  287. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_conversation/__init__.py +443 -0
  288. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_conversation/_conversation.py +182 -0
  289. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  290. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  291. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  292. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_direct_attack_simulator.py +231 -0
  293. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  294. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +18 -0
  295. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  296. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +248 -0
  297. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/__init__.py +24 -0
  298. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  299. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +197 -0
  300. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +299 -0
  301. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +266 -0
  302. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +221 -0
  303. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_model_tools/models.py +617 -0
  304. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  305. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  306. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  307. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_simulator.py +740 -0
  308. azure_ai_evaluation-1.13.4/azure/ai/evaluation/simulator/_utils.py +132 -0
  309. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/PKG-INFO +945 -0
  310. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/SOURCES.txt +420 -0
  311. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/dependency_links.txt +1 -0
  312. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/not-zip-safe +1 -0
  313. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/requires.txt +30 -0
  314. azure_ai_evaluation-1.13.4/azure_ai_evaluation.egg-info/top_level.txt +1 -0
  315. azure_ai_evaluation-1.13.4/migration_guide.md +243 -0
  316. azure_ai_evaluation-1.13.4/pyproject.toml +20 -0
  317. azure_ai_evaluation-1.13.4/samples/README.md +57 -0
  318. azure_ai_evaluation-1.13.4/samples/agent_evaluators/agent_evaluation.ipynb +455 -0
  319. azure_ai_evaluation-1.13.4/samples/agent_evaluators/instructions.md +40 -0
  320. azure_ai_evaluation-1.13.4/samples/agent_evaluators/intent_resolution.ipynb +452 -0
  321. azure_ai_evaluation-1.13.4/samples/agent_evaluators/response_completeness.ipynb +236 -0
  322. azure_ai_evaluation-1.13.4/samples/agent_evaluators/sample_synthetic_conversations.jsonl +90 -0
  323. azure_ai_evaluation-1.13.4/samples/agent_evaluators/task_adherence.ipynb +245 -0
  324. azure_ai_evaluation-1.13.4/samples/agent_evaluators/task_navigation_efficiency.ipynb +578 -0
  325. azure_ai_evaluation-1.13.4/samples/agent_evaluators/tool_call_accuracy.ipynb +368 -0
  326. azure_ai_evaluation-1.13.4/samples/agent_evaluators/user_functions.py +275 -0
  327. azure_ai_evaluation-1.13.4/samples/aoai_score_model_grader_sample.py +311 -0
  328. azure_ai_evaluation-1.13.4/samples/data/custom_objectives_with_context_example.json +51 -0
  329. azure_ai_evaluation-1.13.4/samples/data/evaluate_test_data.jsonl +3 -0
  330. azure_ai_evaluation-1.13.4/samples/evaluation_samples_common.py +128 -0
  331. azure_ai_evaluation-1.13.4/samples/evaluation_samples_evaluate.py +885 -0
  332. azure_ai_evaluation-1.13.4/samples/evaluation_samples_evaluate_fdp.py +857 -0
  333. azure_ai_evaluation-1.13.4/samples/evaluation_samples_safety_evaluation.py +332 -0
  334. azure_ai_evaluation-1.13.4/samples/evaluation_samples_simulate.py +249 -0
  335. azure_ai_evaluation-1.13.4/samples/evaluation_samples_threshold.py +403 -0
  336. azure_ai_evaluation-1.13.4/samples/red_team_agent_tool_sample.py +169 -0
  337. azure_ai_evaluation-1.13.4/samples/red_team_samples.py +603 -0
  338. azure_ai_evaluation-1.13.4/samples/red_team_skip_upload.py +101 -0
  339. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py +261 -0
  340. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py +268 -0
  341. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py +257 -0
  342. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/chat_compeletion_audio.py +64 -0
  343. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/image.jpg +0 -0
  344. azure_ai_evaluation-1.13.4/samples/score_model_multimodal/input_audio.wav +0 -0
  345. azure_ai_evaluation-1.13.4/samples/semantic_kernel_red_team_agent_sample.py +90 -0
  346. azure_ai_evaluation-1.13.4/setup.cfg +4 -0
  347. azure_ai_evaluation-1.13.4/setup.py +100 -0
  348. azure_ai_evaluation-1.13.4/tests/__init__.py +0 -0
  349. azure_ai_evaluation-1.13.4/tests/__openai_patcher.py +118 -0
  350. azure_ai_evaluation-1.13.4/tests/conftest.py +688 -0
  351. azure_ai_evaluation-1.13.4/tests/converters/ai_agent_converter/serialization_helper.py +196 -0
  352. azure_ai_evaluation-1.13.4/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +292 -0
  353. azure_ai_evaluation-1.13.4/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +67 -0
  354. azure_ai_evaluation-1.13.4/tests/converters/ai_agent_converter/test_sk_agent_converter_internals.py +128 -0
  355. azure_ai_evaluation-1.13.4/tests/converters/ai_agent_converter/test_sk_turn_idxs_from_conversation.py +112 -0
  356. azure_ai_evaluation-1.13.4/tests/e2etests/__init__.py +0 -0
  357. azure_ai_evaluation-1.13.4/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +23 -0
  358. azure_ai_evaluation-1.13.4/tests/e2etests/target_fn.py +37 -0
  359. azure_ai_evaluation-1.13.4/tests/e2etests/test_adv_simulator.py +1030 -0
  360. azure_ai_evaluation-1.13.4/tests/e2etests/test_aoai_graders.py +412 -0
  361. azure_ai_evaluation-1.13.4/tests/e2etests/test_builtin_evaluators.py +1356 -0
  362. azure_ai_evaluation-1.13.4/tests/e2etests/test_evaluate.py +562 -0
  363. azure_ai_evaluation-1.13.4/tests/e2etests/test_lite_management_client.py +87 -0
  364. azure_ai_evaluation-1.13.4/tests/e2etests/test_mass_evaluate.py +658 -0
  365. azure_ai_evaluation-1.13.4/tests/e2etests/test_metrics_upload.py +248 -0
  366. azure_ai_evaluation-1.13.4/tests/e2etests/test_prompty_async.py +201 -0
  367. azure_ai_evaluation-1.13.4/tests/e2etests/test_red_team.py +380 -0
  368. azure_ai_evaluation-1.13.4/tests/e2etests/test_remote_evaluation.py +99 -0
  369. azure_ai_evaluation-1.13.4/tests/e2etests/test_sim_and_eval.py +620 -0
  370. azure_ai_evaluation-1.13.4/tests/unittests/test_agent_evaluators.py +105 -0
  371. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_alignment_missing_rows.py +90 -0
  372. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_data_source.py +510 -0
  373. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_evaluation_pagination.py +252 -0
  374. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_integration_features.py +159 -0
  375. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_nested_integration.py +289 -0
  376. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_python_grader.py +54 -0
  377. azure_ai_evaluation-1.13.4/tests/unittests/test_aoai_score_model_grader.py +970 -0
  378. azure_ai_evaluation-1.13.4/tests/unittests/test_batch_run_context.py +81 -0
  379. azure_ai_evaluation-1.13.4/tests/unittests/test_built_in_evaluator.py +245 -0
  380. azure_ai_evaluation-1.13.4/tests/unittests/test_completeness_evaluator.py +144 -0
  381. azure_ai_evaluation-1.13.4/tests/unittests/test_content_safety_defect_rate.py +25 -0
  382. azure_ai_evaluation-1.13.4/tests/unittests/test_content_safety_rai_script.py +472 -0
  383. azure_ai_evaluation-1.13.4/tests/unittests/test_document_retrieval_evaluator.py +277 -0
  384. azure_ai_evaluation-1.13.4/tests/unittests/test_eval_run.py +795 -0
  385. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluate.py +1627 -0
  386. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluate_aoai.py +109 -0
  387. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluate_mismatch.py +488 -0
  388. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluate_performance.py +77 -0
  389. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluator_scoring_patterns.py +245 -0
  390. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluators/slow_eval.py +34 -0
  391. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluators/test_conversation_thresholds.py +137 -0
  392. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluators/test_inputs_evaluators.py +88 -0
  393. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluators/test_service_evaluator_thresholds.py +198 -0
  394. azure_ai_evaluation-1.13.4/tests/unittests/test_evaluators/test_threshold_behavior.py +249 -0
  395. azure_ai_evaluation-1.13.4/tests/unittests/test_jailbreak_simulator.py +123 -0
  396. azure_ai_evaluation-1.13.4/tests/unittests/test_lazy_imports.py +135 -0
  397. azure_ai_evaluation-1.13.4/tests/unittests/test_non_adv_simulator.py +362 -0
  398. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/__init__.py +16 -0
  399. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_attack_objective_generator.py +177 -0
  400. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_attack_strategy.py +70 -0
  401. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_callback_chat_target.py +165 -0
  402. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_constants.py +41 -0
  403. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_formatting_utils.py +231 -0
  404. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py +157 -0
  405. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_rai_service_target.py +390 -0
  406. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_rai_service_true_false_scorer.py +114 -0
  407. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_red_team.py +1818 -0
  408. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_red_team_language_support.py +156 -0
  409. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_red_team_result.py +250 -0
  410. azure_ai_evaluation-1.13.4/tests/unittests/test_redteam/test_strategy_utils.py +239 -0
  411. azure_ai_evaluation-1.13.4/tests/unittests/test_remote_evaluation_features.py +71 -0
  412. azure_ai_evaluation-1.13.4/tests/unittests/test_safety_evaluation.py +397 -0
  413. azure_ai_evaluation-1.13.4/tests/unittests/test_save_eval.py +67 -0
  414. azure_ai_evaluation-1.13.4/tests/unittests/test_simulator.py +123 -0
  415. azure_ai_evaluation-1.13.4/tests/unittests/test_synthetic_callback_conv_bot.py +111 -0
  416. azure_ai_evaluation-1.13.4/tests/unittests/test_synthetic_conversation_bot.py +123 -0
  417. azure_ai_evaluation-1.13.4/tests/unittests/test_task_completion_evaluator.py +377 -0
  418. azure_ai_evaluation-1.13.4/tests/unittests/test_task_navigation_efficiency_evaluators.py +186 -0
  419. azure_ai_evaluation-1.13.4/tests/unittests/test_tool_call_accuracy_evaluator.py +690 -0
  420. azure_ai_evaluation-1.13.4/tests/unittests/test_tool_input_accuracy_evaluator.py +654 -0
  421. azure_ai_evaluation-1.13.4/tests/unittests/test_tool_selection_evaluator.py +286 -0
  422. azure_ai_evaluation-1.13.4/tests/unittests/test_utils.py +940 -0
@@ -0,0 +1,527 @@
1
+ # Release History
2
+
3
+ ## 1.13.4 (2025-11-10)
4
+
5
+ ### Bugs Fixed
6
+
7
+ - Handle input data for evaluation result when evaluators.
8
+
9
+ ## 1.13.3 (2025-11-08)
10
+
11
+ ### Other Changes
12
+
13
+ - Added `scenario` property to red team evaluation request to align scores with red team concepts of attack success.
14
+
15
+ ## 1.13.2 (2025-11-07)
16
+
17
+ ### Bugs Fixed
18
+
19
+ - Added App Insights redaction for agent safety run telemetry so adversarial prompts are not stored in collected logs.
20
+
21
+ ## 1.13.1 (2025-11-05)
22
+
23
+ ### Features Added
24
+
25
+ - Improved RedTeam coverage across risk sub-categories to ensure comprehensive security testing
26
+ - Made RedTeam's `AttackStrategy.Tense` seed prompts dynamic to allow use of this strategy with additional risk categories
27
+ - Refactored error handling and result semantics in the RedTeam evaluation system to improve clarity and align with Attack Success Rate (ASR) conventions (passed=False means attack success)
28
+
29
+ ### Bugs Fixed
30
+
31
+ - Fixed RedTeam evaluation error related to context handling for context-dependent risk categories
32
+ - Fixed RedTeam prompt application for model targets during Indirect Jailbreak XPIA (cross-domain prompt injection attack)
33
+
34
+ ## 1.13.0 (2025-10-30)
35
+
36
+ ### Features Added
37
+
38
+ - Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
39
+ - Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
40
+ - Updated all evaluators' output to be of the following schema:
41
+ - `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
42
+ - `{evaluator_name}_result`: pass/fail based on threshold,
43
+ - `{evaluator_name}_reason`, `{evaluator_name}_threshold`
44
+ - `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
45
+ - `{evaluator_name}_model`: model used for evaluation
46
+ - `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
47
+
48
+ This change standardizes the output format across all evaluators and follows OTel convention.
49
+
50
+ ### Bugs Fixed
51
+
52
+ - `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
53
+
54
+ ## 1.11.2 (2025-10-09)
55
+
56
+ ### Bugs Fixed
57
+
58
+ - `**kwargs` in an evaluator signature now receives input columns that are not otherwise named in the evaluator's signature
59
+
60
+ ## 1.12.0 (2025-10-02)
61
+
62
+ ### Features Added
63
+ - AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
64
+ - Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
65
+
66
+ ### Bugs Fixed
67
+ - Support for multi-level nesting in OpenAI grader (experimental)
68
+
69
+ ## 1.11.1 (2025-09-19)
70
+
71
+ ### Bugs Fixed
72
+ - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
73
+
74
+ ## 1.11.0 (2025-09-03)
75
+
76
+ ### Features Added
77
+ - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
78
+ - Added support for user-supplied TokenCredentials with LLM based evaluators.
79
+ - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
80
+ - Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
81
+ - Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
82
+
83
+ ### Bugs Fixed
84
+ - Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
85
+
86
+ ### Other Changes
87
+ - Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
88
+ - Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
89
+
90
+ ## 1.10.0 (2025-07-31)
91
+
92
+ ### Breaking Changes
93
+
94
+ - Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
95
+
96
+ ### Features Added
97
+
98
+ - Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
99
+ - Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
100
+ tolerance for harmful responses).
101
+ - Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
102
+
103
+
104
+ ### Bugs Fixed
105
+
106
+ - Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
107
+ - Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
108
+ - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
109
+
110
+
111
+ ### Other Changes
112
+
113
+ - The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
114
+ - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
115
+ This is due to be removed in a future release.
116
+
117
+
118
+ ## 1.9.0 (2025-07-02)
119
+
120
+ ### Features Added
121
+
122
+ - Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
123
+ - Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
124
+
125
+
126
+ ### Bugs Fixed
127
+
128
+ - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
129
+
130
+ - Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
131
+ - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
132
+ - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
133
+ - `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
134
+
135
+ ## 1.8.0 (2025-05-29)
136
+
137
+ ### Features Added
138
+
139
+ - Introduces `AttackStrategy.MultiTurn` and `AttackStrategy.Crescendo` to `RedTeam`. These strategies attack the target of a `RedTeam` scan over the course of multi-turn conversations.
140
+
141
+ ### Bugs Fixed
142
+ - AdversarialSimulator in `ADVERSARIAL_CONVERSATION` mode was broken. It is now fixed.
143
+
144
+ ## 1.7.0 (2025-05-12)
145
+
146
+ ### Bugs Fixed
147
+ - azure-ai-evaluation failed with module not found [#40992](https://github.com/Azure/azure-sdk-for-python/issues/40992)
148
+
149
+ ## 1.6.0 (2025-05-07)
150
+
151
+ ### Features Added
152
+ - New `<evaluator>.binary_aggregate` field added to evaluation result metrics. This field contains the aggregated binary evaluation results for each evaluator, providing a summary of the evaluation outcomes.
153
+ - Added support for Azure Open AI evaluation via 4 new 'grader' classes, which serve as wrappers around Azure Open AI grader configurations. These new grader objects can be supplied to the main `evaluate` method as if they were normal callable evaluators. The new classes are:
154
+ - AzureOpenAIGrader (general class for experienced users)
155
+ - AzureOpenAILabelGrader
156
+ - AzureOpenAIStringCheckGrader
157
+ - AzureOpenAITextSimilarityGrader
158
+
159
+ ### Breaking Changes
160
+ - In the experimental RedTeam's scan method, the `data_only` param has been replaced with `skip_evals` and if you do not want data to be uploaded, use the `skip_upload` flag.
161
+
162
+ ### Bugs Fixed
163
+ - Fixed error in `evaluate` where data fields could not contain numeric characters. Previously, a data file with schema:
164
+ ```
165
+ "query1": "some query", "response": "some response"
166
+ ```
167
+ throws error when passed into `evaluator_config` as `{"evaluator_name": {"column_mapping": {"query": "${data.query1}", "response": "${data.response}"}},}`.
168
+ Now, users may import data containing fields with numeric characters.
169
+
170
+
171
+ ## 1.5.0 (2025-04-04)
172
+
173
+ ### Features Added
174
+
175
+ - New `RedTeam` agent functionality to assess the safety and resilience of AI systems against adversarial prompt attacks
176
+
177
+ ## 1.4.0 (2025-03-27)
178
+
179
+ ### Features Added
180
+ - Enhanced binary evaluation results with customizable thresholds
181
+ - Added threshold support for QA and ContentSafety evaluators
182
+ - Evaluation results now include both the score and threshold values
183
+ - Configurable threshold parameter allows custom binary classification boundaries
184
+ - Default thresholds provided for backward compatibility
185
+ - Quality evaluators use "higher is better" scoring (score ≥ threshold is positive)
186
+ - Content safety evaluators use "lower is better" scoring (score ≤ threshold is positive)
187
+ - New Built-in evaluator called CodeVulnerabilityEvaluator is added.
188
+ - It provides capabilities to identify the following code vulnerabilities.
189
+ - path-injection
190
+ - sql-injection
191
+ - code-injection
192
+ - stack-trace-exposure
193
+ - incomplete-url-substring-sanitization
194
+ - flask-debug
195
+ - clear-text-logging-sensitive-data
196
+ - incomplete-hostname-regexp
197
+ - server-side-unvalidated-url-redirection
198
+ - weak-cryptographic-algorithm
199
+ - full-ssrf
200
+ - bind-socket-all-network-interfaces
201
+ - client-side-unvalidated-url-redirection
202
+ - likely-bugs
203
+ - reflected-xss
204
+ - clear-text-storage-sensitive-data
205
+ - tarslip
206
+ - hardcoded-credentials
207
+ - insecure-randomness
208
+ - It also supports multiple coding languages such as (Python, Java, C++, C#, Go, Javascript, SQL)
209
+
210
+ - New Built-in evaluator called UngroundedAttributesEvaluator is added.
211
+ - It evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
212
+ - where query represents the user query and response represents the AI system response given the provided context.
213
+
214
+ - Ungrounded Attributes first checks whether a response is ungrounded, and then checks whether it contains information about the protected class
215
+ - or emotional state of a person.
216
+
217
+ - It identifies the following attributes:
218
+
219
+ - emotional_state
220
+ - protected_class
221
+ - groundedness
222
+ - New Built-in evaluators for Agent Evaluation (Preview)
223
+ - IntentResolutionEvaluator - Evaluates the intent resolution of an agent's response to a user query.
224
+ - ResponseCompletenessEvaluator - Evaluates the response completeness of an agent's response to a user query.
225
+ - TaskAdherenceEvaluator - Evaluates the task adherence of an agent's response to a user query.
226
+ - ToolCallAccuracyEvaluator - Evaluates the accuracy of tool calls made by an agent in response to a user query.
227
+
228
+ ### Bugs Fixed
229
+ - Fixed error in `GroundednessProEvaluator` when handling non-numeric values like "n/a" returned from the service.
230
+ - Uploading local evaluation results from `evaluate` with the same run name will no longer result in each online run sharing (and bashing) result files.
231
+
232
+ ## 1.3.0 (2025-02-28)
233
+
234
+ ### Breaking Changes
235
+ - Multimodal-specific evaluators `ContentSafetyMultimodalEvaluator`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` have been removed. Please use `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator` instead.
236
+ - Metric name in ProtectedMaterialEvaluator's output is changed from `protected_material.fictional_characters_label` to `protected_material.fictional_characters_defect_rate`. It's now consistent with other evaluators' metric names (ending with `_defect_rate`).
237
+
238
+ ## 1.2.0 (2025-01-27)
239
+
240
+ ### Features Added
241
+ - CSV files are now supported as data file inputs with `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
242
+
243
+ ### Breaking Changes
244
+ - `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in next release.
245
+
246
+ ### Bugs Fixed
247
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
248
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
249
+ - Fixed the non adversarial simulator to run in task-free mode
250
+ - Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
251
+ main score when aggregating per-turn evaluations from a conversation into an overall
252
+ evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
253
+ - Fixed bug in non adversarial simulator sample where `tasks` was undefined
254
+
255
+ ### Other Changes
256
+ - Changed minimum required python version to use this package from 3.8 to 3.9
257
+ - Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
258
+ - Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
259
+ environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
260
+
261
+ ## 1.1.0 (2024-12-12)
262
+
263
+ ### Features Added
264
+ - Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
265
+
266
+ ```python
267
+ evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
268
+ conversation = {
269
+ "messages": [
270
+ {
271
+ "role": "system",
272
+ "content": [
273
+ {"type": "text", "text": "You are an AI assistant that understands images."}
274
+ ],
275
+ },
276
+ {
277
+ "role": "user",
278
+ "content": [
279
+ {"type": "text", "text": "Can you describe this image?"},
280
+ {
281
+ "type": "image_url",
282
+ "image_url": {
283
+ "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
284
+ },
285
+ },
286
+ ],
287
+ },
288
+ {
289
+ "role": "assistant",
290
+ "content": [
291
+ {
292
+ "type": "text",
293
+ "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
294
+ }
295
+ ],
296
+ },
297
+ ]
298
+ }
299
+ print("Calling Content Safety Evaluator for multi-modal")
300
+ score = evaluator(conversation=conversation)
301
+ ```
302
+
303
+ - Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
304
+
305
+ ### Bugs Fixed
306
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
307
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
308
+
309
+ ## 1.0.1 (2024-11-15)
310
+
311
+ ### Bugs Fixed
312
+ - Removing `azure-ai-inference` as dependency.
313
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
314
+
315
+ ## 1.0.0 (2024-11-13)
316
+
317
+ ### Breaking Changes
318
+ - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
319
+ - Parameters `query_response_generating_prompty_kwargs` and `user_simulator_prompty_kwargs` have been renamed to `query_response_generating_prompty_options` and `user_simulator_prompty_options` in the Simulator's `__call__` method.
320
+
321
+ ### Bugs Fixed
322
+ - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
323
+ - Outputs of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
324
+ - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
325
+ - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
326
+ - Fixed `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
327
+ otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
328
+ would be 2, not 1.5.
329
+
330
+ ### Other Changes
331
+ - Refined error messages for service-based evaluators and simulators.
332
+ - Tracing has been disabled due to Cosmos DB initialization issue.
333
+ - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
334
+ - Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, a user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
335
+ - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
336
+ ```python
337
+ adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
338
+ outputs = asyncio.run(
339
+ adversarial_simulator(
340
+ scenario=scenario,
341
+ target=callback,
342
+ randomize_order=True
343
+ )
344
+ )
345
+ ```
346
+
347
+ ## 1.0.0b5 (2024-10-28)
348
+
349
+ ### Features Added
350
+ - Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
351
+ - Groundedness detection in Non Adversarial Simulator via query/context pairs
352
+ ```python
353
+ import importlib.resources as pkg_resources
354
+ package = "azure.ai.evaluation.simulator._data_sources"
355
+ resource_name = "grounding.json"
356
+ custom_simulator = Simulator(model_config=model_config)
357
+ conversation_turns = []
358
+ with pkg_resources.path(package, resource_name) as grounding_file:
359
+ with open(grounding_file, "r") as file:
360
+ data = json.load(file)
361
+ for item in data:
362
+ conversation_turns.append([item])
363
+ outputs = asyncio.run(custom_simulator(
364
+ target=callback,
365
+ conversation_turns=conversation_turns,
366
+ max_conversation_turns=1,
367
+ ))
368
+ ```
369
+ - Adding evaluator for multimodal use cases
370
+
371
+ ### Breaking Changes
372
+ - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
373
+ - `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
374
+ - `RelevanceEvaluator` no longer takes `context` as an input. It now only takes `query` and `response` in single-turn evaluation.
375
+ - `FluencyEvaluator` no longer takes `query` as an input. It now only takes `response` in single-turn evaluation.
376
+ - The `AdversarialScenario` enum does not include `ADVERSARIAL_INDIRECT_JAILBREAK`; invoking IndirectJailbreak or XPIA should be done with `IndirectAttackSimulator`.
377
+ - Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now have `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
378
+ ```json
379
+ {"question": <user_message>, "answer": <assistant_message>}
380
+ ```
381
+ `to_eval_qr_json_lines` now has:
382
+ ```json
383
+ {"query": <user_message>, "response": <assistant_message>}
384
+ ```
385
+
386
+ ### Bugs Fixed
387
+ - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
388
+ - Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when venv folder and target function file are in the same directory.
389
+ - Fix evaluate API failure when `trace.destination` is set to `none`
390
+ - Non adversarial simulator now accepts context from the callback
391
+
392
+ ### Other Changes
393
+ - Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
394
+ - `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template will be used for the evaluation.
395
+ - To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
396
+ - `CoherenceEvaluator`
397
+ - `RelevanceEvaluator`
398
+ - `FluencyEvaluator`
399
+ - `GroundednessEvaluator`
400
+ - `SimilarityEvaluator`
401
+ - `RetrievalEvaluator`
402
+ - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
403
+
404
+ | Evaluator | New `max_token` for Generation |
405
+ | --- | --- |
406
+ | `CoherenceEvaluator` | 800 |
407
+ | `RelevanceEvaluator` | 800 |
408
+ | `FluencyEvaluator` | 800 |
409
+ | `GroundednessEvaluator` | 800 |
410
+ | `RetrievalEvaluator` | 1600 |
411
+ - Improved the error message for storage access permission issues to provide clearer guidance for users.
412
+
413
+ ## 1.0.0b4 (2024-10-16)
414
+
415
+ ### Breaking Changes
416
+
417
+ - Removed `numpy` dependency. All NaN values returned by the SDK have been changed from `numpy.nan` to `math.nan`.
418
+ - `credential` is now required to be passed in for all content safety evaluators and `ProtectedMaterialsEvaluator`. `DefaultAzureCredential` will no longer be chosen if a credential is not passed.
419
+ - Changed package extra name from "pf-azure" to "remote".
420
+
421
+ ### Bugs Fixed
422
+ - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch token in the exponential retry logic to retrieve RAI Service response.
423
+ - Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
424
+
425
+ ### Other Changes
426
+ - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
427
+ - Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
428
+
429
+ ## 1.0.0b3 (2024-10-01)
430
+
431
+ ### Features Added
432
+
433
+ - Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
434
+ - The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
435
+ - `ViolenceEvaluator`
436
+ - `SexualEvaluator`
437
+ - `SelfHarmEvaluator`
438
+ - `HateUnfairnessEvaluator`
439
+ - `ProtectedMaterialEvaluator`
440
+ - `IndirectAttackEvaluator`
441
+ - `CoherenceEvaluator`
442
+ - `RelevanceEvaluator`
443
+ - `FluencyEvaluator`
444
+ - `GroundednessEvaluator`
445
+ - Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator` as a standalone conversation-only evaluator.
446
+
447
+ ### Breaking Changes
448
+
449
+ - Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
450
+ - The `evaluator_config` parameter of `evaluate` now maps an evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
451
+ `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
452
+
453
+ Before:
454
+ ```python
455
+ evaluate(
456
+ ...,
457
+ evaluator_config={
458
+ "hate_unfairness": {
459
+ "query": "${data.question}",
460
+ "response": "${data.answer}",
461
+ }
462
+ },
463
+ ...
464
+ )
465
+ ```
466
+
467
+ After:
468
+ ```python
469
+ evaluate(
470
+ ...,
471
+ evaluator_config={
472
+ "hate_unfairness": {
473
+ "column_mapping": {
474
+ "query": "${data.question}",
475
+ "response": "${data.answer}",
476
+ }
477
+ }
478
+ },
479
+ ...
480
+ )
481
+ ```
482
+
483
+ - Simulator now requires a model configuration to call the prompty instead of an Azure AI project scope. This enables the usage of simulator with Entra ID based auth.
484
+ Before:
485
+ ```python
486
+ azure_ai_project = {
487
+ "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
488
+ "resource_group_name": os.environ.get("RESOURCE_GROUP"),
489
+ "project_name": os.environ.get("PROJECT_NAME"),
490
+ }
491
+ sim = Simulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
492
+ ```
493
+ After:
494
+ ```python
495
+ model_config = {
496
+ "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
497
+ "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
498
+ }
499
+ sim = Simulator(model_config=model_config)
500
+ ```
501
+ If `api_key` is not included in the `model_config`, the prompty runtime in `promptflow-core` will pick up `DefaultAzureCredential`.
502
+
503
+ ### Bugs Fixed
504
+
505
+ - Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
506
+
507
+ ## 1.0.0b2 (2024-09-24)
508
+
509
+ ### Breaking Changes
510
+
511
+ - `data` and `evaluators` are now required keywords in `evaluate`.
512
+
513
+ ## 1.0.0b1 (2024-09-20)
514
+
515
+ ### Breaking Changes
516
+
517
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
518
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
519
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with evaluate API and simulators.
520
+ - Model configurations classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
521
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
522
+
523
+ ### Features Added
524
+
525
+ - First preview
526
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
527
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
@@ -0,0 +1,8 @@
1
+ recursive-include tests *.py
2
+ include *.md
3
+ include azure/__init__.py
4
+ include azure/ai/__init__.py
5
+ include azure/ai/evaluation/py.typed
6
+ recursive-include azure/ai/evaluation *.prompty
7
+ include azure/ai/evaluation/simulator/_data_sources/grounding.json
8
+ recursive-include samples *
@@ -0,0 +1,70 @@
1
+ NOTICES AND INFORMATION
2
+ Do Not Translate or Localize
3
+
4
+ This software incorporates material from third parties.
5
+ Microsoft makes certain open source code available at https://3rdpartysource.microsoft.com,
6
+ or you may send a check or money order for US $5.00, including the product name,
7
+ the open source component name, platform, and version number, to:
8
+
9
+ Source Code Compliance Team
10
+ Microsoft Corporation
11
+ One Microsoft Way
12
+ Redmond, WA 98052
13
+ USA
14
+
15
+ Notwithstanding any other terms, you may reverse engineer this software to the extent
16
+ required to debug changes to any libraries licensed under the GNU Lesser General Public License.
17
+
18
+ License notice for nltk
19
+ ---------------------------------------------------------
20
+
21
+ Copyright 2024 The NLTK Project
22
+
23
+ Licensed under the Apache License, Version 2.0 (the "License");
24
+ you may not use this file except in compliance with the License.
25
+ You may obtain a copy of the License at
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32
+ See the License for the specific language governing permissions and
33
+ limitations under the License.
34
+
35
+ License notice for rouge-score
36
+ ---------------------------------------------------------
37
+
38
+ Copyright 2024 The Google Research Authors
39
+
40
+ Licensed under the Apache License, Version 2.0 (the "License");
41
+ you may not use this file except in compliance with the License.
42
+ You may obtain a copy of the License at
43
+
44
+ http://www.apache.org/licenses/LICENSE-2.0
45
+
46
+ Unless required by applicable law or agreed to in writing, software
47
+ distributed under the License is distributed on an "AS IS" BASIS,
48
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49
+ See the License for the specific language governing permissions and
50
+ limitations under the License.
51
+
52
+
53
+ License notice for [Is GPT-4 a reliable rater? Evaluating consistency in GPT-4's text ratings](https://www.frontiersin.org/journals/education/articles/10.3389/feduc.2023.1272229/full)
54
+ ------------------------------------------------------------------------------------------------------------------
55
+ Copyright © 2023 Hackl, Müller, Granitzer and Sailer. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
56
+
57
+
58
+ License notice for [Is ChatGPT a Good NLG Evaluator? A Preliminary Study](https://aclanthology.org/2023.newsum-1.1) (Wang et al., NewSum 2023)
59
+ ------------------------------------------------------------------------------------------------------------------
60
+ Copyright © 2023. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
61
+
62
+
63
+ License notice for [SummEval: Re-evaluating Summarization Evaluation.](https://doi.org/10.1162/tacl_a_00373) (Fabbri et al.)
64
+ ------------------------------------------------------------------------------------------------------------------
65
+ © 2021 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
66
+
67
+
68
+ License notice for [Evaluation Metrics in the Era of GPT-4: Reliably Evaluating Large Language Models on Sequence to Sequence Tasks](https://aclanthology.org/2023.emnlp-main.543) (Sottana et al., EMNLP 2023)
69
+ ------------------------------------------------------------------------------------------------------------------
70
+ © 2023 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).