azure-ai-evaluation 1.9.0__tar.gz → 1.13.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (454):
  1. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/CHANGELOG.md +124 -0
  2. {azure_ai_evaluation-1.9.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.13.5}/PKG-INFO +153 -11
  3. azure_ai_evaluation-1.13.5/TROUBLESHOOTING.md +130 -0
  4. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/__init__.py +50 -12
  5. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  6. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/label_grader.py +14 -13
  7. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_aoai/python_grader.py +86 -0
  8. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/score_model_grader.py +14 -10
  9. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  10. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  11. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/__init__.py +2 -1
  12. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_common/constants.py +194 -0
  13. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  14. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  15. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_client.py +44 -14
  16. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  17. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  18. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  19. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  20. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  21. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  22. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  23. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  24. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  25. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  26. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  27. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  28. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  29. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/rai_service.py +304 -5
  30. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/utils.py +308 -49
  31. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_constants.py +219 -0
  32. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_converters/_ai_services.py +60 -10
  33. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_converters/_models.py +75 -26
  34. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_eval_mapping.py +14 -0
  35. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  36. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  37. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluate/_evaluate.py +2519 -0
  38. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  39. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_utils.py +22 -8
  40. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluator_definition.py +76 -0
  41. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +2 -1
  42. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -18
  43. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +9 -2
  44. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +237 -30
  45. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  46. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  47. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  48. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  49. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  50. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  51. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  52. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  53. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  54. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  55. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +9 -2
  56. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  57. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +354 -0
  58. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  59. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  60. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +32 -7
  61. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  62. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  63. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  64. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +73 -6
  65. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +198 -0
  66. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +54 -23
  67. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +9 -2
  68. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  69. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  70. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +9 -2
  71. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  72. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  73. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  74. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  75. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  76. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  77. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  78. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  79. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  80. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  81. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  82. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  83. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  84. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  85. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  86. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  87. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py → azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +115 -49
  88. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  89. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  90. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  91. azure_ai_evaluation-1.13.5/azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  92. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -20
  93. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  94. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_exceptions.py +7 -0
  95. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  96. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_engine.py +120 -30
  97. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  98. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  99. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +42 -32
  100. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  101. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  102. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_model_configurations.py +26 -0
  103. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  104. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_version.py +1 -1
  105. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/__init__.py +4 -3
  106. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_attack_objective_generator.py +35 -4
  107. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  108. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_callback_chat_target.py +48 -2
  109. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  110. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  111. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  112. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_red_team.py +1717 -0
  113. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_red_team_result.py +289 -15
  114. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  115. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  116. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  117. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/_rai_service_target.py +7 -1
  118. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  119. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/constants.py +15 -2
  120. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  121. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  122. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  123. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/metric_mapping.py +27 -4
  124. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  125. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  126. azure_ai_evaluation-1.13.5/azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  127. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/strategy_utils.py +49 -33
  128. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_adversarial_simulator.py +15 -3
  129. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  130. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  131. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  132. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +14 -2
  133. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +49 -8
  134. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  135. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -6
  136. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  137. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  138. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_simulator.py +12 -0
  139. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5/azure_ai_evaluation.egg-info}/PKG-INFO +153 -11
  140. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure_ai_evaluation.egg-info/SOURCES.txt +59 -0
  141. azure_ai_evaluation-1.13.5/azure_ai_evaluation.egg-info/requires.txt +30 -0
  142. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/agent_evaluation.ipynb +3 -3
  143. azure_ai_evaluation-1.13.5/samples/agent_evaluators/coherence.ipynb +204 -0
  144. azure_ai_evaluation-1.13.5/samples/agent_evaluators/fluency.ipynb +283 -0
  145. azure_ai_evaluation-1.13.5/samples/agent_evaluators/groundedness.ipynb +332 -0
  146. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/intent_resolution.ipynb +80 -81
  147. azure_ai_evaluation-1.13.5/samples/agent_evaluators/relevance.ipynb +208 -0
  148. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/response_completeness.ipynb +36 -13
  149. azure_ai_evaluation-1.13.5/samples/agent_evaluators/task_adherence.ipynb +260 -0
  150. azure_ai_evaluation-1.13.5/samples/agent_evaluators/task_completion.ipynb +355 -0
  151. azure_ai_evaluation-1.13.5/samples/agent_evaluators/task_navigation_efficiency.ipynb +603 -0
  152. azure_ai_evaluation-1.13.5/samples/agent_evaluators/tool_call_accuracy.ipynb +507 -0
  153. azure_ai_evaluation-1.13.5/samples/agent_evaluators/tool_input_accuracy.ipynb +427 -0
  154. azure_ai_evaluation-1.13.5/samples/agent_evaluators/tool_output_utilization.ipynb +535 -0
  155. azure_ai_evaluation-1.13.5/samples/agent_evaluators/tool_selection.ipynb +576 -0
  156. azure_ai_evaluation-1.13.5/samples/agent_evaluators/tool_success.ipynb +476 -0
  157. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/aoai_score_model_grader_sample.py +62 -8
  158. azure_ai_evaluation-1.13.5/samples/data/custom_objectives_with_context_example.json +51 -0
  159. azure_ai_evaluation-1.13.5/samples/evaluation_samples_common.py +128 -0
  160. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/evaluation_samples_evaluate.py +388 -47
  161. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/evaluation_samples_evaluate_fdp.py +349 -25
  162. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/evaluation_samples_threshold.py +16 -16
  163. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/red_team_samples.py +56 -0
  164. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py +261 -0
  165. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py +268 -0
  166. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py +257 -0
  167. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/chat_compeletion_audio.py +64 -0
  168. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/image.jpg +0 -0
  169. azure_ai_evaluation-1.13.5/samples/score_model_multimodal/input_audio.wav +0 -0
  170. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/setup.py +12 -9
  171. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/conftest.py +60 -2
  172. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/converters/ai_agent_converter/serialization_helper.py +6 -1
  173. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +4 -4
  174. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_adv_simulator.py +2 -1
  175. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_aoai_graders.py +126 -3
  176. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_builtin_evaluators.py +58 -26
  177. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_evaluate.py +7 -7
  178. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_mass_evaluate.py +11 -7
  179. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_metrics_upload.py +4 -0
  180. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_prompty_async.py +37 -23
  181. azure_ai_evaluation-1.13.5/tests/e2etests/test_red_team.py +380 -0
  182. azure_ai_evaluation-1.13.5/tests/unittests/test_agent_evaluators.py +105 -0
  183. azure_ai_evaluation-1.13.5/tests/unittests/test_aoai_alignment_missing_rows.py +90 -0
  184. azure_ai_evaluation-1.13.5/tests/unittests/test_aoai_data_source.py +510 -0
  185. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_aoai_evaluation_pagination.py +13 -5
  186. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_aoai_integration_features.py +1 -1
  187. azure_ai_evaluation-1.13.5/tests/unittests/test_aoai_nested_integration.py +289 -0
  188. azure_ai_evaluation-1.13.5/tests/unittests/test_aoai_python_grader.py +54 -0
  189. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_aoai_score_model_grader.py +22 -3
  190. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_built_in_evaluator.py +122 -7
  191. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_completeness_evaluator.py +22 -12
  192. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_eval_run.py +291 -1
  193. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluate.py +667 -12
  194. azure_ai_evaluation-1.13.5/tests/unittests/test_evaluate_aoai.py +109 -0
  195. azure_ai_evaluation-1.13.5/tests/unittests/test_evaluate_mismatch.py +488 -0
  196. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluate_performance.py +2 -3
  197. azure_ai_evaluation-1.13.5/tests/unittests/test_evaluator_scoring_patterns.py +245 -0
  198. azure_ai_evaluation-1.13.5/tests/unittests/test_lazy_imports.py +135 -0
  199. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_attack_objective_generator.py +3 -0
  200. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_attack_strategy.py +1 -0
  201. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_callback_chat_target.py +38 -1
  202. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_rai_service_eval_chat_target.py +1 -1
  203. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_red_team.py +511 -171
  204. azure_ai_evaluation-1.13.5/tests/unittests/test_redteam/test_red_team_language_support.py +156 -0
  205. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_red_team_result.py +6 -1
  206. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_strategy_utils.py +63 -20
  207. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_safety_evaluation.py +48 -0
  208. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_save_eval.py +1 -0
  209. azure_ai_evaluation-1.13.5/tests/unittests/test_task_completion_evaluator.py +377 -0
  210. azure_ai_evaluation-1.13.5/tests/unittests/test_task_navigation_efficiency_evaluators.py +186 -0
  211. azure_ai_evaluation-1.13.5/tests/unittests/test_tool_call_accuracy_evaluator.py +690 -0
  212. azure_ai_evaluation-1.13.5/tests/unittests/test_tool_input_accuracy_evaluator.py +654 -0
  213. azure_ai_evaluation-1.13.5/tests/unittests/test_tool_selection_evaluator.py +286 -0
  214. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_utils.py +305 -1
  215. azure_ai_evaluation-1.9.0/TROUBLESHOOTING.md +0 -98
  216. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_common/constants.py +0 -85
  217. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_common/onedp/models/__init__.py +0 -168
  218. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_common/onedp/models/_models.py +0 -2685
  219. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_constants.py +0 -118
  220. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +0 -118
  221. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluate/_evaluate.py +0 -1187
  222. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +0 -594
  223. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -162
  224. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -176
  225. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -100
  226. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +0 -117
  227. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +0 -375
  228. azure_ai_evaluation-1.9.0/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +0 -71
  229. azure_ai_evaluation-1.9.0/azure/ai/evaluation/red_team/_red_team.py +0 -3174
  230. azure_ai_evaluation-1.9.0/azure/ai/evaluation/red_team/_utils/formatting_utils.py +0 -162
  231. azure_ai_evaluation-1.9.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  232. azure_ai_evaluation-1.9.0/azure_ai_evaluation.egg-info/requires.txt +0 -17
  233. azure_ai_evaluation-1.9.0/samples/agent_evaluators/task_adherence.ipynb +0 -245
  234. azure_ai_evaluation-1.9.0/samples/agent_evaluators/tool_call_accuracy.ipynb +0 -365
  235. azure_ai_evaluation-1.9.0/samples/evaluation_samples_common.py +0 -60
  236. azure_ai_evaluation-1.9.0/tests/unittests/test_agent_evaluators.py +0 -102
  237. azure_ai_evaluation-1.9.0/tests/unittests/test_tool_call_accuracy_evaluator.py +0 -417
  238. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/MANIFEST.in +0 -0
  239. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/NOTICE.txt +0 -0
  240. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/README.md +0 -0
  241. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/__init__.py +0 -0
  242. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/__init__.py +0 -0
  243. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_aoai/__init__.py +0 -0
  244. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_azure/__init__.py +0 -0
  245. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_azure/_clients.py +0 -0
  246. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_azure/_envs.py +0 -0
  247. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_azure/_models.py +0 -0
  248. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_azure/_token_manager.py +0 -0
  249. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/_experimental.py +0 -0
  250. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/math.py +0 -0
  251. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_model_base.py +0 -0
  252. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_patch.py +0 -0
  253. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_serialization.py +0 -0
  254. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_types.py +0 -0
  255. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_utils/__init__.py +0 -0
  256. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_utils/serialization.py +0 -0
  257. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_vendor.py +0 -0
  258. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/_version.py +0 -0
  259. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/_patch.py +0 -0
  260. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +0 -0
  261. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/models/_patch.py +0 -0
  262. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/operations/_patch.py +0 -0
  263. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/py.typed +0 -0
  264. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +0 -0
  265. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +0 -0
  266. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +0 -0
  267. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +0 -0
  268. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +0 -0
  269. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +0 -0
  270. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +0 -0
  271. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +0 -0
  272. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +0 -0
  273. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +0 -0
  274. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +0 -0
  275. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +0 -0
  276. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +0 -0
  277. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +0 -0
  278. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +0 -0
  279. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +0 -0
  280. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/__init__.py +0 -0
  281. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_client.py +0 -0
  282. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_configuration.py +0 -0
  283. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_model_base.py +0 -0
  284. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_patch.py +0 -0
  285. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_serialization.py +0 -0
  286. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/_version.py +0 -0
  287. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/__init__.py +0 -0
  288. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/_client.py +0 -0
  289. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/_configuration.py +0 -0
  290. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/_patch.py +0 -0
  291. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +0 -0
  292. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +0 -0
  293. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +0 -0
  294. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/models/__init__.py +0 -0
  295. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/models/_enums.py +0 -0
  296. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/models/_models.py +0 -0
  297. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/models/_patch.py +0 -0
  298. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/operations/__init__.py +0 -0
  299. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/operations/_operations.py +0 -0
  300. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/operations/_patch.py +0 -0
  301. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_common/raiclient/py.typed +0 -0
  302. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_converters/__init__.py +0 -0
  303. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_converters/_sk_services.py +0 -0
  304. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
  305. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +0 -0
  306. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +0 -0
  307. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
  308. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
  309. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
  310. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
  311. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
  312. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
  313. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
  314. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +0 -0
  315. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
  316. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
  317. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
  318. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +0 -0
  319. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +0 -0
  320. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
  321. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +0 -0
  322. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  323. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
  324. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
  325. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
  326. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
  327. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
  328. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +0 -0
  329. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +0 -0
  330. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
  331. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
  332. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
  333. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
  334. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +0 -0
  335. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +0 -0
  336. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
  337. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
  338. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
  339. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
  340. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
  341. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
  342. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +0 -0
  343. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +0 -0
  344. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +0 -0
  345. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
  346. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_http_utils.py +0 -0
  347. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/__init__.py +0 -0
  348. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -0
  349. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_check.py +0 -0
  350. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_configuration.py +0 -0
  351. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_constants.py +0 -0
  352. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_errors.py +0 -0
  353. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_flows.py +0 -0
  354. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/_service.py +0 -0
  355. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/client.py +0 -0
  356. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/entities.py +0 -0
  357. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/tracing.py +0 -0
  358. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/types.py +0 -0
  359. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_adapters/utils.py +0 -0
  360. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/__init__.py +0 -0
  361. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +0 -0
  362. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +0 -0
  363. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +0 -0
  364. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_status.py +0 -0
  365. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_trace.py +0 -0
  366. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_utils.py +0 -0
  367. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +0 -0
  368. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_common/__init__.py +0 -0
  369. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_common/_async_token_provider.py +0 -0
  370. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_common/_logging.py +0 -0
  371. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +0 -0
  372. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/__init__.py +0 -0
  373. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/_connection.py +0 -0
  374. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/_exceptions.py +0 -0
  375. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +0 -0
  376. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_safety_evaluation/__init__.py +0 -0
  377. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  378. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_user_agent.py +0 -0
  379. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/__init__.py +0 -0
  380. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
  381. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
  382. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
  383. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
  384. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
  385. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/py.typed +0 -0
  386. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_agent/__init__.py +0 -0
  387. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_agent/_agent_functions.py +0 -0
  388. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_agent/_agent_tools.py +0 -0
  389. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_agent/_agent_utils.py +0 -0
  390. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +0 -0
  391. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_default_converter.py +0 -0
  392. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/red_team/_utils/logging_utils.py +0 -0
  393. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/__init__.py +0 -0
  394. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
  395. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_constants.py +0 -0
  396. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
  397. {azure_ai_evaluation-1.9.0/azure/ai/evaluation/red_team/_utils → azure_ai_evaluation-1.13.5/azure/ai/evaluation/simulator/_data_sources}/__init__.py +0 -0
  398. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
  399. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
  400. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
  401. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
  402. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
  403. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
  404. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  405. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
  406. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
  407. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure/ai/evaluation/simulator/_utils.py +0 -0
  408. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  409. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  410. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
  411. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/migration_guide.md +0 -0
  412. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/pyproject.toml +0 -0
  413. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/README.md +0 -0
  414. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/instructions.md +0 -0
  415. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/sample_synthetic_conversations.jsonl +0 -0
  416. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/agent_evaluators/user_functions.py +0 -0
  417. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/data/evaluate_test_data.jsonl +0 -0
  418. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/evaluation_samples_safety_evaluation.py +0 -0
  419. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/evaluation_samples_simulate.py +0 -0
  420. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/red_team_agent_tool_sample.py +0 -0
  421. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/red_team_skip_upload.py +0 -0
  422. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/samples/semantic_kernel_red_team_agent_sample.py +0 -0
  423. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/setup.cfg +0 -0
  424. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/__init__.py +0 -0
  425. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/__openai_patcher.py +0 -0
  426. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/converters/ai_agent_converter/test_run_ids_from_conversation.py +0 -0
  427. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/converters/ai_agent_converter/test_sk_agent_converter_internals.py +0 -0
  428. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/converters/ai_agent_converter/test_sk_turn_idxs_from_conversation.py +0 -0
  429. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/__init__.py +0 -0
  430. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
  431. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/target_fn.py +0 -0
  432. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_lite_management_client.py +0 -0
  433. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_remote_evaluation.py +0 -0
  434. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/e2etests/test_sim_and_eval.py +0 -0
  435. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_batch_run_context.py +0 -0
  436. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_content_safety_defect_rate.py +0 -0
  437. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_content_safety_rai_script.py +0 -0
  438. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_document_retrieval_evaluator.py +0 -0
  439. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluators/slow_eval.py +0 -0
  440. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluators/test_conversation_thresholds.py +0 -0
  441. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
  442. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluators/test_service_evaluator_thresholds.py +0 -0
  443. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_evaluators/test_threshold_behavior.py +0 -0
  444. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_jailbreak_simulator.py +0 -0
  445. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_non_adv_simulator.py +0 -0
  446. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/__init__.py +0 -0
  447. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_constants.py +0 -0
  448. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_formatting_utils.py +0 -0
  449. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_rai_service_target.py +0 -0
  450. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_redteam/test_rai_service_true_false_scorer.py +0 -0
  451. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_remote_evaluation_features.py +0 -0
  452. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_simulator.py +0 -0
  453. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
  454. {azure_ai_evaluation-1.9.0 → azure_ai_evaluation-1.13.5}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
@@ -1,5 +1,126 @@
1
1
  # Release History
2
2
 
3
+ ## 1.13.5 (2025-11-10)
4
+
5
+ ### Bugs Fixed
6
+
7
+ - **TaskAdherenceEvaluator:** treat tool definitions as optional so evaluations with only query/response inputs no longer raise “Either 'conversation' or individual inputs must be provided.”
8
+
9
+ ## 1.13.4 (2025-11-10)
10
+
11
+ ### Bugs Fixed
12
+
13
+ - Handle input data for evaluation result when evaluators.
14
+
15
+ ## 1.13.3 (2025-11-08)
16
+
17
+ ### Other Changes
18
+
19
+ - Added `scenario` property to red team evaluation request to align scores with red team concepts of attack success.
20
+
21
+ ## 1.13.2 (2025-11-07)
22
+
23
+ ### Bugs Fixed
24
+
25
+ - Added App Insights redaction for agent safety run telemetry so adversarial prompts are not stored in collected logs.
26
+
27
+ ## 1.13.1 (2025-11-05)
28
+
29
+ ### Features Added
30
+
31
+ - Improved RedTeam coverage across risk sub-categories to ensure comprehensive security testing
32
+ - Made RedTeam's `AttackStrategy.Tense` seed prompts dynamic to allow use of this strategy with additional risk categories
33
+ - Refactored error handling and result semantics in the RedTeam evaluation system to improve clarity and align with Attack Success Rate (ASR) conventions (passed=False means attack success)
34
+
35
+ ### Bugs Fixed
36
+
37
+ - Fixed RedTeam evaluation error related to context handling for context-dependent risk categories
38
+ - Fixed RedTeam prompt application for model targets during Indirect Jailbreak XPIA (Cross-Domain Prompt Injection Attack)
39
+
40
+ ## 1.13.0 (2025-10-30)
41
+
42
+ ### Features Added
43
+
44
+ - Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
45
+ - Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
46
+ - Updated all evaluators' output to be of the following schema:
47
+ - `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
48
+ - `{evaluator_name}_result`: pass/fail based on threshold,
49
+ - `{evaluator_name}_reason`, `{evaluator_name}_threshold`
50
+ - `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
51
+ - `{evaluator_name}_model`: model used for evaluation
52
+ - `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
53
+
54
+ This change standardizes the output format across all evaluators and follows OTel convention.
55
+
56
+ ### Bugs Fixed
57
+
58
+ - `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
59
+
60
+ ## 1.11.2 (2025-10-09)
61
+
62
+ ### Bugs Fixed
63
+
64
+ - `**kwargs` in an evaluator signature now receives input columns that are not otherwise named in the evaluator's signature.
65
+
66
+ ## 1.12.0 (2025-10-02)
67
+
68
+ ### Features Added
69
+ - AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
70
+ - Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
71
+
72
+ ### Bugs Fixed
73
+ - Support for multi-level nesting in OpenAI grader (experimental)
74
+
75
+ ## 1.11.1 (2025-09-19)
76
+
77
+ ### Bugs Fixed
78
+ - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
79
+
80
+ ## 1.11.0 (2025-09-03)
81
+
82
+ ### Features Added
83
+ - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
84
+ - Added support for user-supplied TokenCredentials with LLM-based evaluators.
85
+ - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
86
+ - Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
87
+ - Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
88
+
89
+ ### Bugs Fixed
90
+ - Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
91
+
92
+ ### Other Changes
93
+ - Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
94
+ - Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
95
+
96
+ ## 1.10.0 (2025-07-31)
97
+
98
+ ### Breaking Changes
99
+
100
+ - Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
101
+
102
+ ### Features Added
103
+
104
+ - Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure OpenAI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
105
+ - Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
106
+ tolerance for harmful responses).
107
+ - Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
108
+
109
+
110
+ ### Bugs Fixed
111
+
112
+ - Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
113
+ - Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
114
+ - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
115
+
116
+
117
+ ### Other Changes
118
+
119
+ - The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
120
+ - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
121
+ This is due to be removed in a future release.
122
+
123
+
3
124
  ## 1.9.0 (2025-07-02)
4
125
 
5
126
  ### Features Added
@@ -11,8 +132,11 @@
11
132
  ### Bugs Fixed
12
133
 
13
134
  - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
135
+
136
+ - Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
14
137
  - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
15
138
  - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
139
+ - `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
16
140
 
17
141
  ## 1.8.0 (2025-05-29)
18
142
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: azure-ai-evaluation
3
- Version: 1.9.0
3
+ Version: 1.13.5
4
4
  Summary: Microsoft Azure Evaluation Library for Python
5
5
  Home-page: https://github.com/Azure/azure-sdk-for-python
6
6
  Author: Microsoft Corporation
@@ -21,22 +21,40 @@ Classifier: Operating System :: OS Independent
21
21
  Requires-Python: >=3.9
22
22
  Description-Content-Type: text/markdown
23
23
  License-File: NOTICE.txt
24
- Requires-Dist: promptflow-devkit>=1.17.1
25
- Requires-Dist: promptflow-core>=1.17.1
26
24
  Requires-Dist: pyjwt>=2.8.0
27
- Requires-Dist: azure-identity>=1.16.0
28
- Requires-Dist: azure-core>=1.30.2
25
+ Requires-Dist: azure-identity>=1.19.0
26
+ Requires-Dist: azure-core>=1.31.0
29
27
  Requires-Dist: nltk>=3.9.1
30
- Requires-Dist: azure-storage-blob>=12.10.0
31
- Requires-Dist: httpx>=0.25.1
32
- Requires-Dist: pandas<3.0.0,>=2.1.2
33
- Requires-Dist: openai>=1.78.0
28
+ Requires-Dist: azure-storage-blob>=12.19.0
29
+ Requires-Dist: httpx>=0.27.2
30
+ Requires-Dist: pandas<3.0.0,>=2.1.2; python_version < "3.13"
31
+ Requires-Dist: pandas<3.0.0,>=2.2.3; python_version == "3.13"
32
+ Requires-Dist: pandas<3.0.0,>=2.3.3; python_version >= "3.14"
33
+ Requires-Dist: openai>=1.108.0
34
34
  Requires-Dist: ruamel.yaml<1.0.0,>=0.17.10
35
35
  Requires-Dist: msrest>=0.6.21
36
36
  Requires-Dist: Jinja2>=3.1.6
37
37
  Requires-Dist: aiohttp>=3.0
38
38
  Provides-Extra: redteam
39
- Requires-Dist: pyrit==0.8.1; extra == "redteam"
39
+ Requires-Dist: pyrit==0.8.1; python_version >= "3.10" and extra == "redteam"
40
+ Requires-Dist: duckdb==1.3.2; python_version >= "3.10" and extra == "redteam"
41
+ Provides-Extra: opentelemetry
42
+ Requires-Dist: opentelemetry-sdk>=1.17.0; extra == "opentelemetry"
43
+ Requires-Dist: azure-monitor-opentelemetry-exporter>=1.0.0b17; extra == "opentelemetry"
44
+ Dynamic: author
45
+ Dynamic: author-email
46
+ Dynamic: classifier
47
+ Dynamic: description
48
+ Dynamic: description-content-type
49
+ Dynamic: home-page
50
+ Dynamic: keywords
51
+ Dynamic: license
52
+ Dynamic: license-file
53
+ Dynamic: project-url
54
+ Dynamic: provides-extra
55
+ Dynamic: requires-dist
56
+ Dynamic: requires-python
57
+ Dynamic: summary
40
58
 
41
59
  # Azure AI Evaluation client library for Python
42
60
 
@@ -400,6 +418,127 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
400
418
 
401
419
  # Release History
402
420
 
421
+ ## 1.13.5 (2025-11-10)
422
+
423
+ ### Bugs Fixed
424
+
425
+ - **TaskAdherenceEvaluator:** treat tool definitions as optional so evaluations with only query/response inputs no longer raise “Either 'conversation' or individual inputs must be provided.”
426
+
427
+ ## 1.13.4 (2025-11-10)
428
+
429
+ ### Bugs Fixed
430
+
431
+ - Handle input data for evaluation result when evaluators.
432
+
433
+ ## 1.13.3 (2025-11-08)
434
+
435
+ ### Other Changes
436
+
437
+ - Added `scenario` property to red team evaluation request to align scores with red team concepts of attack success.
438
+
439
+ ## 1.13.2 (2025-11-07)
440
+
441
+ ### Bugs Fixed
442
+
443
+ - Added App Insights redaction for agent safety run telemetry so adversarial prompts are not stored in collected logs.
444
+
445
+ ## 1.13.1 (2025-11-05)
446
+
447
+ ### Features Added
448
+
449
+ - Improved RedTeam coverage across risk sub-categories to ensure comprehensive security testing
450
+ - Made RedTeam's `AttackStrategy.Tense` seed prompts dynamic to allow use of this strategy with additional risk categories
451
+ - Refactors error handling and result semantics in the RedTeam evaluation system to improve clarity and align with Attack Success Rate (ASR) conventions (passed=False means attack success)
452
+
453
+ ### Bugs Fixed
454
+
455
+ - Fixed RedTeam evaluation error related to context handling for context-dependent risk categories
456
+ - Fixed RedTeam prompt application for model targets during Indirect Jailbreak XPIA (Cross-Platform Indirect Attack)
457
+
458
+ ## 1.13.0 (2025-10-30)
459
+
460
+ ### Features Added
461
+
462
+ - Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
463
+ - Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
464
+ - Updated all evaluators' output to be of the following schema:
465
+ - `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
466
+ - `{evaluator_name}_result`: pass/fail based on threshold,
467
+ - `{evaluator_name}_reason`, `{evaluator_name}_threshold`
468
+ - `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
469
+ - `{evaluator_name}_model`: model used for evaluation
470
+ - `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
471
+
472
+ This change standardizes the output format across all evaluators and follows OTel convention.
473
+
474
+ ### Bugs Fixed
475
+
476
+ - `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
477
+
478
+ ## 1.11.2 (2025-10-09)
479
+
480
+ ### Bugs Fixed
481
+
482
+ - `**kwargs` in an evaluator signature receives input columns that are not otherwise named in the evaluator's signature
483
+
484
+ ## 1.12.0 (2025-10-02)
485
+
486
+ ### Features Added
487
+ - AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
488
+ - Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
489
+
490
+ ### Bugs Fixed
491
+ - Support for multi-level nesting in OpenAI grader (experimental)
492
+
493
+ ## 1.11.1 (2025-09-19)
494
+
495
+ ### Bugs Fixed
496
+ - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
497
+
498
+ ## 1.11.0 (2025-09-03)
499
+
500
+ ### Features Added
501
+ - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter.
502
+ - Added support for user-supplied TokenCredentials with LLM based evaluators.
503
+ - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
504
+ - Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
505
+ - Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
506
+
507
+ ### Bugs Fixed
508
+ - Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
509
+
510
+ ### Other Changes
511
+ - Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
512
+ - Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
513
+
514
+ ## 1.10.0 (2025-07-31)
515
+
516
+ ### Breaking Changes
517
+
518
+ - Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
519
+
520
+ ### Features Added
521
+
522
+ - Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
523
+ - Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
524
+ tolerance for harmful responses).
525
+ - Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
526
+
527
+
528
+ ### Bugs Fixed
529
+
530
+ - Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
531
+ - Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
532
+ - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
533
+
534
+
535
+ ### Other Changes
536
+
537
+ - The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
538
+ - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
539
+ This is due to be removed in a future release.
540
+
541
+
403
542
  ## 1.9.0 (2025-07-02)
404
543
 
405
544
  ### Features Added
@@ -411,8 +550,11 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
411
550
  ### Bugs Fixed
412
551
 
413
552
  - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
553
+
554
+ - Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
414
555
  - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
415
556
  - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
557
+ - `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
416
558
 
417
559
  ## 1.8.0 (2025-05-29)
418
560
 
@@ -0,0 +1,130 @@
1
+ # Troubleshoot AI Evaluation SDK Issues
2
+
3
+ This guide walks you through how to investigate failures, common errors in the `azure-ai-evaluation` SDK, and steps to mitigate these issues.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Handle Evaluate API Errors](#handle-evaluate-api-errors)
8
+ - [Troubleshoot Remote Tracking Issues](#troubleshoot-remote-tracking-issues)
9
+ - [Troubleshoot Column Mapping Issues](#troubleshoot-column-mapping-issues)
10
+ - [Troubleshoot Safety Evaluator Issues](#troubleshoot-safety-evaluator-issues)
11
+ - [Troubleshoot Quality Evaluator Issues](#troubleshoot-quality-evaluator-issues)
12
+ - [Handle Simulation Errors](#handle-simulation-errors)
13
+ - [Adversarial Simulation Supported Regions](#adversarial-simulation-supported-regions)
14
+ - [Need to generate simulations for specific harm type](#need-to-generate-simulations-for-specific-harm-type)
15
+ - [Simulator is slow](#simulator-is-slow)
16
+ - [Handle RedTeam Errors](#handle-redteam-errors)
17
+ - [Permission or authentication failures](#permission-or-authentication-failures)
18
+ - [Target resource not found](#target-resource-not-found)
19
+ - [Agent name not found](#agent-name-not-found)
20
+ - [Insufficient Storage Permissions](#insufficient-storage-permissions)
21
+ - [PyRIT "Error sending prompt" message](#pyrit-error-sending-prompt-message)
22
+ - [Logging](#logging)
23
+ - [Get Additional Help](#get-additional-help)
24
+
25
+ ## Handle Evaluate API Errors
26
+
27
+ ### Troubleshoot Remote Tracking Issues
28
+
29
+ - Before running `evaluate()`, to ensure that you can enable logging and tracing to your Azure AI project, make sure you are first logged in by running `az login`.
30
+
31
+ - Ensure that you assign the proper permissions to the storage account linked to your Azure AI Studio hub. This can be done with the following command. More information can be found [here](https://aka.ms/credentialleshub).
32
+
33
+ ```Shell
34
+ # <mySubscriptionID>: Subscription ID of the Azure AI Studio hub's linked storage account (available in Azure AI hub resource view in Azure Portal).
35
+ # <myResourceGroupName>: Resource group of the Azure AI Studio hub's linked storage account.
36
+ # <user-id>: User object ID for role assignment (retrieve with "az ad user show" command).
37
+
38
+ az role assignment create --role "Storage Blob Data Contributor" --scope /subscriptions/<mySubscriptionID>/resourceGroups/<myResourceGroupName> --assignee-principal-type User --assignee-object-id "<user-id>"
39
+ ```
40
+
41
+ - Additionally, if you're using a virtual network or private link, and your evaluation run upload fails because of that, check out this [guide](https://docs.microsoft.com/azure/machine-learning/how-to-enable-studio-virtual-network#access-data-using-the-studio).
42
+
43
+ ### Troubleshoot Column Mapping Issues
44
+
45
+ - When using `column_mapping` parameter in evaluators, ensure all keys and values are non-empty strings and contain only alphanumeric characters. Empty strings, non-string values, or non-alphanumeric characters can cause serialization errors and issues in downstream applications. Example of valid mapping: `{"query": "${data.query}", "response": "${data.response}"}`.
46
+
47
+ ### Troubleshoot Safety Evaluator Issues
48
+
49
+ - Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport).
50
+ - If you encounter a 403 Forbidden error when using safety evaluators, verify that you have the `Contributor` role assigned to your Azure AI project. The `Contributor` role is currently required to run safety evaluations.
51
+
52
+ ## Handle Simulation Errors
53
+
54
+ ### Adversarial Simulation Supported Regions
55
+
56
+ Adversarial simulators use Azure AI Studio safety evaluation backend service to generate an adversarial dataset against your application. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaiadvsimulator-regionsupport).
57
+
58
+ ### Need to generate simulations for specific harm type
59
+
60
+ The Adversarial simulator does not support selecting individual harms. Instead, we recommend running the `AdversarialSimulator` with 4x the number of specific harms as the `max_simulation_results`.
61
+
62
+ ### Simulator is slow
63
+
64
+ Identify the type of simulations being run (adversarial or non-adversarial).
65
+ Adjust parameters such as `api_call_retry_sleep_sec`, `api_call_delay_sec`, and `concurrent_async_task`. Please note that rate limits on LLM calls can apply to both tokens per minute and requests per minute.
66
+
67
+ ## Handle RedTeam Errors
68
+
69
+ ### Permission or authentication failures
70
+ - Run `az login` in the active shell before starting the scan and ensure the account has the **Azure AI User** role plus the `Storage Blob Data Contributor` assignment on the linked storage account. Both are required to create evaluation runs and upload artifacts.
71
+ - In secured hubs, confirm the linked storage account allows access from your network (or private endpoint) and that Entra ID authentication is enabled on the storage resource.
72
+ - If the helper warns `This may be due to missing environment variables or insufficient permissions.`, double-check the `AZURE_PROJECT_ENDPOINT`, `AGENT_NAME`, and storage role assignments before retrying.
73
+
74
+ ### Target resource not found
75
+ - When initializing an Azure OpenAI deployment directly as the `target`, specify `azure_endpoint` as `https://<hub>.openai.azure.com/openai/deployments/<deployment_name>/chat/completions?api-version=2025-01-01-preview`.
76
+ - If you instantiate `AzureOpenAI`, use the resource-level endpoint format `https://<hub>.openai.azure.com/` and ensure the deployment name plus API version match an active deployment.
77
+ - A cloud run error such as `Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}` when creating the eval group can also indicate that `azure-ai-projects>=2.0.0b1` is not installed. Upgrade to that version or later to access the preview APIs used by Red Team.
78
+
79
+ ### Agent name not found
80
+ - `(not_found) Agent <name> doesn’t exist` means the Azure AI project could not resolve the agent `name`. Names are case sensitive and differ from display names.
81
+ - Verify the `AZURE_PROJECT_ENDPOINT` points to the correct project and that the agent is published there.
82
+ - Requires `DefaultAzureCredential` from `azure.identity` and `AIProjectClient` from `azure.ai.projects`.
83
+ - Use the following helper to list agents in the current project and confirm the `name` column matches your `AGENT_NAME` value:
84
+
85
+ ```python
86
+ def list_project_agents(endpoint: str | None = None) -> None:
87
+ project_endpoint = endpoint or os.environ.get("AZURE_PROJECT_ENDPOINT") or ""
88
+ if not project_endpoint:
89
+ print("Set AZURE_PROJECT_ENDPOINT before listing agents.")
90
+ return
91
+ with DefaultAzureCredential() as project_credential:
92
+ with AIProjectClient(
93
+ endpoint=project_endpoint,
94
+ credential=project_credential,
95
+ api_version="2025-11-15-preview",
96
+ ) as project_client:
97
+ agents = list(project_client.agents.list())
98
+ if not agents:
99
+ print(f"No agents found in project: {project_endpoint}")
100
+ return
101
+ print(f"Agents in {project_endpoint}:")
102
+ for agent in agents:
103
+ display_name = agent.get("display_name") if isinstance(agent, dict) else getattr(agent, "display_name", "")
104
+ name = agent.get("name") if isinstance(agent, dict) else getattr(agent, "name", "")
105
+ print(f"- name: {name} | display_name: {display_name}")
106
+ ```
107
+
108
+ ### Insufficient Storage Permissions
109
+ - `WARNING: Failed to log artifacts to MLFlow: (UserError) Failed to upload evaluation run to the cloud due to insufficient permission to access the storage` means the linked storage account is missing the necessary assignments.
110
+ - Portal steps:
111
+ 1. Open the resource group tied to the Azure AI Project in the Azure Portal.
112
+ 2. Locate the linked storage account(s).
113
+ 3. Select each storage account and choose **Access control (IAM)**.
114
+ 4. Grant the affected identity the **Storage Blob Data Contributor** role.
115
+ - Prefer CLI? Reuse the `az role assignment create` command described in [Troubleshoot Remote Tracking Issues](#troubleshoot-remote-tracking-issues).
116
+
117
+ ### PyRIT "Error sending prompt" message
118
+ - `Exception: Error sending prompt with conversation ID: <guid>` is raised by PyRIT when a target LLM call fails inside the `PromptSendingOrchestrator`. The runner retries the conversation up to the configured limit, so occasional occurrences usually resolve automatically.
119
+ - Common triggers include transient network issues, 429 throttling, or 5xx responses from the target deployment. Even if retries succeed you will still see the stack trace in notebook output.
120
+ - Inspect the `redteam.log` file written to the scan output directory (typically `<working dir>/runs/<scan_id>/redteam.log`) for the underlying exception and HTTP status. Increase verbosity with `DEBUG=True` for deeper diagnostics.
121
+ - Running in Azure AI Studio? Navigate to **Evaluate > Red Team > <run name> > Logs**, download `redteam.log`, and search for the conversation ID to inspect the payload.
122
+ - If one conversation ID keeps failing after retries, verify the target credentials, check deployment health, and review Azure OpenAI quota or rate-limit alerts in the Azure portal.
123
+
124
+ ## Logging
125
+
126
+ You can set the logging level via the environment variable `PF_LOGGING_LEVEL`. Valid values include `CRITICAL`, `ERROR`, `WARNING`, `INFO`, and `DEBUG`; the default is `INFO`.
127
+
128
+ ## Get Additional Help
129
+
130
+ Additional information on ways to reach out for support can be found in the [SUPPORT.md](https://github.com/Azure/azure-sdk-for-python/blob/main/SUPPORT.md) at the root of the repo.
@@ -32,6 +32,8 @@ from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
32
32
  from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator
33
33
  from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
34
34
  from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
35
+ from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
36
+ from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
35
37
  from ._model_configurations import (
36
38
  AzureAIProject,
37
39
  AzureOpenAIModelConfiguration,
@@ -46,6 +48,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader
46
48
  from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
47
49
  from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
48
50
  from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
51
+ from ._aoai.python_grader import AzureOpenAIPythonGrader
49
52
 
50
53
 
51
54
  _patch_all = []
@@ -53,21 +56,46 @@ _patch_all = []
53
56
  # The converter from the AI service to the evaluator schema requires a dependency on
54
57
  # ai.projects, but we also don't want to force users installing ai.evaluations to pull
55
58
  # in ai.projects. So we only import it if it's available and the user has ai.projects.
56
- try:
57
- from ._converters._ai_services import AIAgentConverter
59
+ # We use lazy loading to avoid printing messages during import unless the classes are actually used.
60
+ _lazy_imports = {}
58
61
 
59
- _patch_all.append("AIAgentConverter")
60
- except ImportError:
61
- print(
62
- "[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`."
63
- )
64
62
 
65
- try:
66
- from ._converters._sk_services import SKAgentConverter
63
+ def _create_lazy_import(class_name, module_path, dependency_name):
64
+ """Create a lazy import function for optional dependencies.
67
65
 
68
- _patch_all.append("SKAgentConverter")
69
- except ImportError:
70
- print("[INFO] Could not import SKAgentConverter. Please install the dependency with `pip install semantic-kernel`.")
66
+ Args:
67
+ class_name: Name of the class to import
68
+ module_path: Module path to import from
69
+ dependency_name: Name of the dependency package for error message
70
+
71
+ Returns:
72
+ A function that performs the lazy import when called
73
+ """
74
+
75
+ def lazy_import():
76
+ try:
77
+ module = __import__(module_path, fromlist=[class_name])
78
+ cls = getattr(module, class_name)
79
+ _patch_all.append(class_name)
80
+ return cls
81
+ except ImportError:
82
+ raise ImportError(
83
+ f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
84
+ )
85
+
86
+ return lazy_import
87
+
88
+
89
+ _lazy_imports["AIAgentConverter"] = _create_lazy_import(
90
+ "AIAgentConverter",
91
+ "azure.ai.evaluation._converters._ai_services",
92
+ "azure-ai-projects",
93
+ )
94
+ _lazy_imports["SKAgentConverter"] = _create_lazy_import(
95
+ "SKAgentConverter",
96
+ "azure.ai.evaluation._converters._sk_services",
97
+ "semantic-kernel",
98
+ )
71
99
 
72
100
  __all__ = [
73
101
  "evaluate",
@@ -105,11 +133,21 @@ __all__ = [
105
133
  "CodeVulnerabilityEvaluator",
106
134
  "UngroundedAttributesEvaluator",
107
135
  "ToolCallAccuracyEvaluator",
136
+ "_ToolOutputUtilizationEvaluator",
137
+ "_ToolCallSuccessEvaluator",
108
138
  "AzureOpenAIGrader",
109
139
  "AzureOpenAILabelGrader",
110
140
  "AzureOpenAIStringCheckGrader",
111
141
  "AzureOpenAITextSimilarityGrader",
112
142
  "AzureOpenAIScoreModelGrader",
143
+ "AzureOpenAIPythonGrader",
113
144
  ]
114
145
 
115
146
  __all__.extend([p for p in _patch_all if p not in __all__])
147
+
148
+
149
+ def __getattr__(name):
150
+ """Handle lazy imports for optional dependencies."""
151
+ if name in _lazy_imports:
152
+ return _lazy_imports[name]()
153
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")