azure-ai-evaluation 1.0.0b4__tar.gz → 1.0.0b5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/CHANGELOG.md +68 -0
  2. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/MANIFEST.in +1 -0
  3. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/NOTICE.txt +20 -0
  4. {azure_ai_evaluation-1.0.0b4/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b5}/PKG-INFO +166 -9
  5. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/README.md +96 -8
  6. azure_ai_evaluation-1.0.0b5/TROUBLESHOOTING.md +50 -0
  7. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/__init__.py +22 -0
  8. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/constants.py +5 -0
  9. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/math.py +11 -0
  10. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/rai_service.py +172 -35
  11. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/utils.py +162 -23
  12. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_constants.py +6 -6
  13. {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/__init__.py +3 -2
  14. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +4 -4
  15. {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/proxy_client.py +6 -3
  16. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  17. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
  18. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
  19. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
  20. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/_utils.py +40 -7
  21. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  22. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
  23. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  24. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
  25. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
  26. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
  27. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
  28. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
  29. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
  30. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
  31. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
  32. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
  33. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
  34. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
  35. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  36. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  37. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -0
  38. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  39. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  40. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  41. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  42. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  43. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  44. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  45. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  46. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  47. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  48. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  49. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +90 -0
  50. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
  51. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
  52. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  53. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +197 -0
  54. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  55. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  56. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  57. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  58. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
  59. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
  60. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_exceptions.py +17 -0
  61. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_model_configurations.py +18 -1
  62. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_version.py +1 -1
  63. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/__init__.py +2 -1
  64. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  65. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
  66. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  67. azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  68. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  69. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  70. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  71. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
  72. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
  73. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  74. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  75. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_simulator.py +115 -61
  76. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_utils.py +6 -6
  77. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5/azure_ai_evaluation.egg-info}/PKG-INFO +166 -9
  78. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/SOURCES.txt +22 -6
  79. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/requires.txt +1 -0
  80. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/setup.py +2 -0
  81. azure_ai_evaluation-1.0.0b5/tests/__pf_service_isolation.py +28 -0
  82. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/conftest.py +27 -8
  83. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/target_fn.py +18 -0
  84. azure_ai_evaluation-1.0.0b5/tests/e2etests/test_builtin_evaluators.py +1021 -0
  85. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_evaluate.py +217 -21
  86. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_sim_and_eval.py +5 -9
  87. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_batch_run_context.py +8 -8
  88. azure_ai_evaluation-1.0.0b5/tests/unittests/test_built_in_evaluator.py +138 -0
  89. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_content_safety_rai_script.py +17 -12
  90. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_eval_run.py +28 -2
  91. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluate.py +59 -22
  92. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_non_adv_simulator.py +7 -4
  93. azure_ai_evaluation-1.0.0b5/tests/unittests/test_utils.py +56 -0
  94. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -57
  95. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -56
  96. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -72
  97. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  98. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -57
  99. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -64
  100. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -154
  101. azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -43
  102. azure_ai_evaluation-1.0.0b4/tests/e2etests/test_builtin_evaluators.py +0 -474
  103. azure_ai_evaluation-1.0.0b4/tests/unittests/test_built_in_evaluator.py +0 -41
  104. azure_ai_evaluation-1.0.0b4/tests/unittests/test_utils.py +0 -20
  105. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/__init__.py +0 -0
  106. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/__init__.py +0 -0
  107. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_common/__init__.py +0 -0
  108. {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/simulator/_helpers → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_common}/_experimental.py +0 -0
  109. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
  110. {azure_ai_evaluation-1.0.0b4/azure/ai/evaluation/_evaluate/_batch_run_client → azure_ai_evaluation-1.0.0b5/azure/ai/evaluation/_evaluate/_batch_run}/code_client.py +0 -0
  111. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
  112. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
  113. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
  114. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -0
  115. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
  116. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  117. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
  118. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +0 -0
  119. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
  120. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
  121. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
  122. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
  123. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
  124. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
  125. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
  126. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
  127. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
  128. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
  129. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
  130. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
  131. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_http_utils.py +0 -0
  132. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_user_agent.py +0 -0
  133. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/__init__.py +0 -0
  134. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
  135. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
  136. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
  137. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
  138. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
  139. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/py.typed +0 -0
  140. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_constants.py +0 -0
  141. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
  142. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
  143. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
  144. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
  145. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
  146. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
  147. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
  148. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
  149. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
  150. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  151. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure/ai/evaluation/simulator/_tracing.py +0 -0
  152. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  153. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  154. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
  155. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/pyproject.toml +0 -0
  156. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/setup.cfg +0 -0
  157. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/__init__.py +0 -0
  158. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/__openai_patcher.py +0 -0
  159. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/__init__.py +0 -0
  160. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
  161. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_adv_simulator.py +0 -0
  162. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/e2etests/test_metrics_upload.py +1 -1
  163. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_content_safety_defect_rate.py +1 -1
  164. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluate_telemetry.py +1 -1
  165. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
  166. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
  167. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_jailbreak_simulator.py +0 -0
  168. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_save_eval.py +0 -0
  169. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_simulator.py +0 -0
  170. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
  171. {azure_ai_evaluation-1.0.0b4 → azure_ai_evaluation-1.0.0b5}/tests/unittests/test_synthetic_conversation_bot.py +1 -1
@@ -1,5 +1,71 @@
  # Release History
 
+ ## 1.0.0b5 (2024-10-28)
+
+ ### Features Added
+ - Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
+ - Groundedness detection in Non Adversarial Simulator via query/context pairs
+ ```python
+ import importlib.resources as pkg_resources
+ package = "azure.ai.evaluation.simulator._data_sources"
+ resource_name = "grounding.json"
+ custom_simulator = Simulator(model_config=model_config)
+ conversation_turns = []
+ with pkg_resources.path(package, resource_name) as grounding_file:
+     with open(grounding_file, "r") as file:
+         data = json.load(file)
+ for item in data:
+     conversation_turns.append([item])
+ outputs = asyncio.run(custom_simulator(
+     target=callback,
+     conversation_turns=conversation_turns,
+     max_conversation_turns=1,
+ ))
+ ```
+ - Adding evaluator for multimodal use cases
+
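As a sketch of what the new `GroundednessProEvaluator` entry above implies for user code: a minimal, hypothetical invocation, assuming the evaluator follows the same `azure_ai_project`/`credential` constructor pattern as the other service-based evaluators and accepts the usual single-turn `query`/`response`/`context` inputs (placeholders throughout; confirm against the 1.0.0b5 reference documentation).

```python
# Hypothetical sketch based on the changelog entry above, not a verbatim sample
# from the package; the constructor and call shape are assumptions.
from azure.ai.evaluation import GroundednessProEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<ai-project-name>",         # placeholder
}

groundedness_pro = GroundednessProEvaluator(
    azure_ai_project=azure_ai_project,
    credential=DefaultAzureCredential(),
)

result = groundedness_pro(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
)
print(result)
```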
+ ### Breaking Changes
+ - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
+ - `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
+ - `RelevanceEvaluator` no longer takes `context` as an input. It now only takes `query` and `response` in single-turn evaluation.
+ - `FluencyEvaluator` no longer takes `query` as an input. It now only takes `response` in single-turn evaluation.
+ - AdversarialScenario enum does not include `ADVERSARIAL_INDIRECT_JAILBREAK`, invoking IndirectJailbreak or XPIA should be done with `IndirectAttackSimulator`
+ - Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now has `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
+ ```json
+ {"question": <user_message>, "answer": <assistant_message>}
+ ```
+ `to_eval_qr_json_lines` now has:
+ ```json
+ {"query": <user_message>, "response": assistant_message}
+ ```
+
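To make the single-turn input changes above concrete, here is a short sketch of the new call shapes. The `model_config` value is a placeholder, constructing these evaluators with a bare `model_config` argument follows the README example later in this diff, and only the keyword arguments shown are implied by the bullets above.

```python
from azure.ai.evaluation import FluencyEvaluator, RelevanceEvaluator, RetrievalEvaluator

# Placeholder model configuration; see the README section of this diff for a full example.
model_config = {
    "azure_endpoint": "<endpoint>",       # placeholder
    "azure_deployment": "<deployment>",   # placeholder
}

relevance = RelevanceEvaluator(model_config)
fluency = FluencyEvaluator(model_config)
retrieval = RetrievalEvaluator(model_config)

# 1.0.0b5 single-turn shapes, per the breaking changes above:
relevance(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
)  # `context` is no longer accepted
fluency(response="The Alpine Explorer Tent is the most waterproof.")  # `query` is no longer accepted
retrieval(
    query="Which tent is the most waterproof?",
    context="From our product list, the Alpine Explorer Tent is the most waterproof.",
)  # `context` is now required alongside `query`
```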
+ ### Bugs Fixed
+ - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
+ - Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when venv folder and target function file are in the same directory.
+ - Fix evaluate API failure when `trace.destination` is set to `none`
+ - Non adversarial simulator now accepts context from the callback
+
+ ### Other Changes
+ - Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
+ - `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template will be used for the evaluation.
+ - To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+   - `SimilarityEvaluator`
+   - `RetrievalEvaluator`
+ - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
+
+ | Evaluator | New Token Limit |
+ | --- | --- |
+ | `CoherenceEvaluator` | 800 |
+ | `RelevanceEvaluator` | 800 |
+ | `FluencyEvaluator` | 800 |
+ | `GroundednessEvaluator` | 800 |
+ | `RetrievalEvaluator` | 1600 |
+ - Improved the error message for storage access permission issues to provide clearer guidance for users.
+
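A small illustration of what the re-keyed output described above means for downstream code. The score and reason values are made up; only the key-naming pattern (un-prefixed metric key, legacy `gpt_` key, and `<metric_name>_reason`) comes from the entries above.

```python
# Illustrative shape of a 1.0.0b5 coherence result (values are invented).
result = {
    "gpt_coherence": 4.0,   # legacy key, kept for backwards compatibility
    "coherence": 4.0,       # new key without the `gpt_` prefix
    "coherence_reason": "The response stays on topic and flows logically.",  # new reasoning key
}

# Prefer the new key but fall back to the legacy one for outputs from older versions.
score = result.get("coherence", result.get("gpt_coherence"))
reason = result.get("coherence_reason")
print(score, reason)
```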
  ## 1.0.0b4 (2024-10-16)
 
  ### Breaking Changes
@@ -10,9 +76,11 @@
 
  ### Bugs Fixed
  - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch token in the exponential retry logic to retrive RAI Service response.
+ - Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
 
  ### Other Changes
  - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
+ - Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
 
  ## 1.0.0b3 (2024-10-01)
 
@@ -4,3 +4,4 @@ include azure/__init__.py
  include azure/ai/__init__.py
  include azure/ai/evaluation/py.typed
  recursive-include azure/ai/evaluation *.prompty
+ include azure/ai/evaluation/simulator/_data_sources/grounding.json
@@ -48,3 +48,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
+
+
+ License notice for [Is GPT-4 a reliable rater? Evaluating consistency in GPT-4's text ratings](https://www.frontiersin.org/journals/education/articles/10.3389/feduc.2023.1272229/full)
+ ------------------------------------------------------------------------------------------------------------------
+ Copyright © 2023 Hackl, Müller, Granitzer and Sailer. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [Is ChatGPT a Good NLG Evaluator? A Preliminary Study](https://aclanthology.org/2023.newsum-1.1) (Wang et al., NewSum 2023)
+ ------------------------------------------------------------------------------------------------------------------
+ Copyright © 2023. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [SummEval: Re-evaluating Summarization Evaluation.](https://doi.org/10.1162/tacl_a_00373) (Fabbri et al.)
+ ------------------------------------------------------------------------------------------------------------------
+ © 2021 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
+
+
+ License notice for [Evaluation Metrics in the Era of GPT-4: Reliably Evaluating Large Language Models on Sequence to Sequence Tasks](https://aclanthology.org/2023.emnlp-main.543) (Sottana et al., EMNLP 2023)
+ ------------------------------------------------------------------------------------------------------------------
+ © 2023 Association for Computational Linguistics. This work is openly licensed via [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: azure-ai-evaluation
- Version: 1.0.0b4
+ Version: 1.0.0b5
  Summary: Microsoft Azure Evaluation Library for Python
  Home-page: https://github.com/Azure/azure-sdk-for-python
  Author: Microsoft Corporation
@@ -30,6 +30,7 @@ Requires-Dist: azure-core>=1.30.2
  Requires-Dist: nltk>=3.9.1
  Provides-Extra: remote
  Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "remote"
+ Requires-Dist: azure-ai-inference>=1.0.0b4; extra == "remote"
 
  # Azure AI Evaluation client library for Python
 
@@ -95,9 +96,6 @@ if __name__ == "__main__":
      # Running Relevance Evaluator on single input row
      relevance_score = relevance_eval(
          response="The Alpine Explorer Tent is the most waterproof.",
-         context="From the our product list,"
-         " the alpine explorer tent is the most waterproof."
-         " The Adventure Dining Table has higher weight.",
          query="Which tent is the most waterproof?",
      )
 
@@ -172,6 +170,95 @@ Output with a string that continues the conversation, responding to the latest m
  {{ conversation_history }}
 
  ```
+
+ Query Response generaing prompty for gpt-4o with `json_schema` support
+ Use this file as an override.
+ ```yaml
+ ---
+ name: TaskSimulatorQueryResponseGPT4o
+ description: Gets queries and responses from a blob of text
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_schema
+       json_schema:
+         name: QRJsonSchema
+         schema:
+           type: object
+           properties:
+             items:
+               type: array
+               items:
+                 type: object
+                 properties:
+                   q:
+                     type: string
+                   r:
+                     type: string
+                 required:
+                   - q
+                   - r
+
+ inputs:
+   text:
+     type: string
+   num_queries:
+     type: integer
+
+
+ ---
+ system:
+ You're an AI that helps in preparing a Question/Answer quiz from Text for "Who wants to be a millionaire" tv show
+ Both Questions and Answers MUST BE extracted from given Text
+ Frame Question in a way so that Answer is RELEVANT SHORT BITE-SIZED info from Text
+ RELEVANT info could be: NUMBER, DATE, STATISTIC, MONEY, NAME
+ A sentence should contribute multiple QnAs if it has more info in it
+ Answer must not be more than 5 words
+ Answer must be picked from Text as is
+ Question should be as descriptive as possible and must include as much context as possible from Text
+ Output must always have the provided number of QnAs
+ Output must be in JSON format.
+ Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
+ Text:
+ <|text_start|>
+ On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
+ Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
+ <|text_end|>
+ Output with 5 QnAs:
+ {
+   "qna": [{
+     "q": "When did the former Apple CEO Steve Jobs introduced the first Macintosh?",
+     "r": "January 24, 1984"
+   },
+   {
+     "q": "Who was the former Apple CEO that introduced the first Macintosh on January 24, 1984?",
+     "r": "Steve Jobs"
+   },
+   {
+     "q": "What percent of the desktop share did Apple have in the United States in late 2003?",
+     "r": "2.06 percent"
+   },
+   {
+     "q": "What were the research firms that reported on Apple's market share in the U.S.?",
+     "r": "IDC and Gartner"
+   },
+   {
+     "q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
+     "r": "6%"
+   }]
+ }
+ Text:
+ <|text_start|>
+ {{ text }}
+ <|text_end|>
+ Output with {{ num_queries }} QnAs:
+ ```
+
  Application code:
 
  ```python
@@ -189,6 +276,7 @@ model_config = {
      "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
      # not providing key would make the SDK pick up `DefaultAzureCredential`
      # use "api_key": "<your API key>"
+     "api_version": "2024-08-01-preview" # keep this for gpt-4o
  }
 
  # Use Wikipedia to get some text for the simulation
@@ -232,20 +320,21 @@ async def callback(
      formatted_response = {
          "content": response,
          "role": "assistant",
-         "context": {
-             "citations": None,
-         },
+         "context": "",
      }
      messages["messages"].append(formatted_response)
      return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
 
  async def main():
      simulator = Simulator(model_config=model_config)
+     current_dir = os.path.dirname(__file__)
+     query_response_override_for_latest_gpt_4o = os.path.join(current_dir, "TaskSimulatorQueryResponseGPT4o.prompty")
      outputs = await simulator(
          target=callback,
          text=text,
+         query_response_generating_prompty=query_response_override_for_latest_gpt_4o, # use this only with latest gpt-4o
          num_queries=2,
-         max_conversation_turns=4,
+         max_conversation_turns=1,
          user_persona=[
              f"I am a student and I want to learn more about {wiki_search_term}",
              f"I am a teacher and I want to teach my students about {wiki_search_term}"
@@ -267,7 +356,7 @@ if __name__ == "__main__":
  #### Adversarial Simulator
 
  ```python
- from from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
+ from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
  from azure.identity import DefaultAzureCredential
  from typing import Any, Dict, List, Optional
  import asyncio
@@ -420,6 +509,72 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
  # Release History
 
+ ## 1.0.0b5 (2024-10-28)
+
+ ### Features Added
+ - Added `GroundednessProEvaluator`, which is a service-based evaluator for determining response groundedness.
+ - Groundedness detection in Non Adversarial Simulator via query/context pairs
+ ```python
+ import importlib.resources as pkg_resources
+ package = "azure.ai.evaluation.simulator._data_sources"
+ resource_name = "grounding.json"
+ custom_simulator = Simulator(model_config=model_config)
+ conversation_turns = []
+ with pkg_resources.path(package, resource_name) as grounding_file:
+     with open(grounding_file, "r") as file:
+         data = json.load(file)
+ for item in data:
+     conversation_turns.append([item])
+ outputs = asyncio.run(custom_simulator(
+     target=callback,
+     conversation_turns=conversation_turns,
+     max_conversation_turns=1,
+ ))
+ ```
+ - Adding evaluator for multimodal use cases
+
+ ### Breaking Changes
+ - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
+ - `RetrievalEvaluator` now requires a `context` input in addition to `query` in single-turn evaluation.
+ - `RelevanceEvaluator` no longer takes `context` as an input. It now only takes `query` and `response` in single-turn evaluation.
+ - `FluencyEvaluator` no longer takes `query` as an input. It now only takes `response` in single-turn evaluation.
+ - AdversarialScenario enum does not include `ADVERSARIAL_INDIRECT_JAILBREAK`, invoking IndirectJailbreak or XPIA should be done with `IndirectAttackSimulator`
+ - Outputs of `Simulator` and `AdversarialSimulator` previously had `to_eval_qa_json_lines` and now has `to_eval_qr_json_lines`. Where `to_eval_qa_json_lines` had:
+ ```json
+ {"question": <user_message>, "answer": <assistant_message>}
+ ```
+ `to_eval_qr_json_lines` now has:
+ ```json
+ {"query": <user_message>, "response": assistant_message}
+ ```
+
+ ### Bugs Fixed
+ - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
+ - Fixed an issue where the `evaluate` API would fail with "[WinError 32] The process cannot access the file because it is being used by another process" when venv folder and target function file are in the same directory.
+ - Fix evaluate API failure when `trace.destination` is set to `none`
+ - Non adversarial simulator now accepts context from the callback
+
+ ### Other Changes
+ - Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
+ - `GroundednessEvaluator` now supports `query` as an optional input in single-turn evaluation. If `query` is provided, a different prompt template will be used for the evaluation.
+ - To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+   - `SimilarityEvaluator`
+   - `RetrievalEvaluator`
+ - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
+
+ | Evaluator | New Token Limit |
+ | --- | --- |
+ | `CoherenceEvaluator` | 800 |
+ | `RelevanceEvaluator` | 800 |
+ | `FluencyEvaluator` | 800 |
+ | `GroundednessEvaluator` | 800 |
+ | `RetrievalEvaluator` | 1600 |
+ - Improved the error message for storage access permission issues to provide clearer guidance for users.
+
  ## 1.0.0b4 (2024-10-16)
 
  ### Breaking Changes
@@ -430,9 +585,11 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
  ### Bugs Fixed
  - Adversarial Conversation simulations would fail with `Forbidden`. Added logic to re-fetch token in the exponential retry logic to retrive RAI Service response.
+ - Fixed an issue where the Evaluate API did not fail due to missing inputs when the target did not return columns required by the evaluators.
 
  ### Other Changes
  - Enhance the error message to provide clearer instruction when required packages for the remote tracking feature are missing.
+ - Print the per-evaluator run summary at the end of the Evaluate API call to make troubleshooting row-level failures easier.
 
  ## 1.0.0b3 (2024-10-01)
 
@@ -62,9 +62,6 @@ if __name__ == "__main__":
      # Running Relevance Evaluator on single input row
      relevance_score = relevance_eval(
          response="The Alpine Explorer Tent is the most waterproof.",
-         context="From the our product list,"
-         " the alpine explorer tent is the most waterproof."
-         " The Adventure Dining Table has higher weight.",
          query="Which tent is the most waterproof?",
      )
 
@@ -139,6 +136,95 @@ Output with a string that continues the conversation, responding to the latest m
  {{ conversation_history }}
 
  ```
+
+ Query Response generaing prompty for gpt-4o with `json_schema` support
+ Use this file as an override.
+ ```yaml
+ ---
+ name: TaskSimulatorQueryResponseGPT4o
+ description: Gets queries and responses from a blob of text
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_schema
+       json_schema:
+         name: QRJsonSchema
+         schema:
+           type: object
+           properties:
+             items:
+               type: array
+               items:
+                 type: object
+                 properties:
+                   q:
+                     type: string
+                   r:
+                     type: string
+                 required:
+                   - q
+                   - r
+
+ inputs:
+   text:
+     type: string
+   num_queries:
+     type: integer
+
+
+ ---
+ system:
+ You're an AI that helps in preparing a Question/Answer quiz from Text for "Who wants to be a millionaire" tv show
+ Both Questions and Answers MUST BE extracted from given Text
+ Frame Question in a way so that Answer is RELEVANT SHORT BITE-SIZED info from Text
+ RELEVANT info could be: NUMBER, DATE, STATISTIC, MONEY, NAME
+ A sentence should contribute multiple QnAs if it has more info in it
+ Answer must not be more than 5 words
+ Answer must be picked from Text as is
+ Question should be as descriptive as possible and must include as much context as possible from Text
+ Output must always have the provided number of QnAs
+ Output must be in JSON format.
+ Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
+ Text:
+ <|text_start|>
+ On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
+ Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
+ <|text_end|>
+ Output with 5 QnAs:
+ {
+   "qna": [{
+     "q": "When did the former Apple CEO Steve Jobs introduced the first Macintosh?",
+     "r": "January 24, 1984"
+   },
+   {
+     "q": "Who was the former Apple CEO that introduced the first Macintosh on January 24, 1984?",
+     "r": "Steve Jobs"
+   },
+   {
+     "q": "What percent of the desktop share did Apple have in the United States in late 2003?",
+     "r": "2.06 percent"
+   },
+   {
+     "q": "What were the research firms that reported on Apple's market share in the U.S.?",
+     "r": "IDC and Gartner"
+   },
+   {
+     "q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
+     "r": "6%"
+   }]
+ }
+ Text:
+ <|text_start|>
+ {{ text }}
+ <|text_end|>
+ Output with {{ num_queries }} QnAs:
+ ```
+
  Application code:
 
  ```python
@@ -156,6 +242,7 @@ model_config = {
      "azure_deployment": os.environ.get("AZURE_DEPLOYMENT"),
      # not providing key would make the SDK pick up `DefaultAzureCredential`
      # use "api_key": "<your API key>"
+     "api_version": "2024-08-01-preview" # keep this for gpt-4o
  }
 
  # Use Wikipedia to get some text for the simulation
@@ -199,20 +286,21 @@ async def callback(
      formatted_response = {
          "content": response,
          "role": "assistant",
-         "context": {
-             "citations": None,
-         },
+         "context": "",
      }
      messages["messages"].append(formatted_response)
      return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
 
  async def main():
      simulator = Simulator(model_config=model_config)
+     current_dir = os.path.dirname(__file__)
+     query_response_override_for_latest_gpt_4o = os.path.join(current_dir, "TaskSimulatorQueryResponseGPT4o.prompty")
      outputs = await simulator(
          target=callback,
          text=text,
+         query_response_generating_prompty=query_response_override_for_latest_gpt_4o, # use this only with latest gpt-4o
          num_queries=2,
-         max_conversation_turns=4,
+         max_conversation_turns=1,
          user_persona=[
              f"I am a student and I want to learn more about {wiki_search_term}",
              f"I am a teacher and I want to teach my students about {wiki_search_term}"
@@ -234,7 +322,7 @@ if __name__ == "__main__":
  #### Adversarial Simulator
 
  ```python
- from from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
+ from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
  from azure.identity import DefaultAzureCredential
  from typing import Any, Dict, List, Optional
  import asyncio
@@ -0,0 +1,50 @@
+ # Troubleshoot AI Evaluation SDK Issues
+
+ This guide walks you through how to investigate failures, common errors in the `azure-ai-evaluation` SDK, and steps to mitigate these issues.
+
+ ## Table of Contents
+
+ - [Handle Evaluate API Errors](#handle-evaluate-api-errors)
+   - [Troubleshoot Remote Tracking Issues](#troubleshoot-remote-tracking-issues)
+   - [Safety Metric Supported Regions](#safety-metric-supported-regions)
+ - [Handle Simulation Errors](#handle-simulation-errors)
+   - [Adversarial Simulation Supported Regions](#adversarial-simulation-supported-regions)
+ - [Logging](#logging)
+ - [Get additional help](#get-additional-help)
+
+ ## Handle Evaluate API Errors
+
+ ### Troubleshoot Remote Tracking Issues
+
+ - Before running `evaluate()`, to ensure that you can enable logging and tracing to your Azure AI project, make sure you are first logged in by running `az login`.
+ - Then install the following sub-package:
+
+ ```Shell
+ pip install azure-ai-evaluation[remote]
+ ```
+
+ - Ensure that you assign the proper permissions to the storage account linked to your Azure AI Studio hub. This can be done with the following command. More information can be found [here](https://review.learn.microsoft.com/azure/ai-studio/how-to/disable-local-auth).
+
+ ```Shell
+ az role assignment create --role "Storage Blob Data Contributor" --scope /subscriptions/<mySubscriptionID>/resourceGroups/<myResourceGroupName> --assignee-principal-type User --assignee-object-id "<user-id>"
+ ```
+
+ - Additionally, if you're using a virtual network or private link, and your evaluation run upload fails because of that, check out this [guide](https://docs.microsoft.com/azure/machine-learning/how-to-enable-studio-virtual-network#access-data-using-the-studio).
+
+ ### Safety Metric Supported Regions
+
+ Risk and safety evaluators depend on the Azure AI Studio safety evaluation backend service. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaisafetyeval-regionsupport).
+
+ ## Handle Simulation Errors
+
+ ### Adversarial Simulation Supported Regions
+
+ Adversarial simulators use Azure AI Studio safety evaluation backend service to generate an adversarial dataset against your application. For a list of supported regions, please refer to the documentation [here](https://aka.ms/azureaiadvsimulator-regionsupport).
+
+ ## Logging
+
+ You can set logging level via environment variable `PF_LOGGING_LEVEL`, valid values includes `CRITICAL`, `ERROR`, `WARNING`, `INFO`, `DEBUG`, default to `INFO`.
+
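The logging knob described in the new troubleshooting guide above can also be set from Python before kicking off a run; a minimal sketch (the surrounding evaluation or simulation call is up to the caller):

```python
import os

# Per the Logging section above: valid values are CRITICAL, ERROR, WARNING, INFO, DEBUG
# (INFO is the default). Set this before the evaluation or simulation run starts.
os.environ["PF_LOGGING_LEVEL"] = "DEBUG"
```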
+ ## Get Additional Help
+
+ Additional information on ways to reach out for support can be found in the [SUPPORT.md](https://github.com/Azure/azure-sdk-for-python/blob/main/SUPPORT.md) at the root of the repo.
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
      SexualEvaluator,
      ViolenceEvaluator,
  )
+ from ._evaluators._multimodal._content_safety_multimodal import (
+     ContentSafetyMultimodalEvaluator,
+     HateUnfairnessMultimodalEvaluator,
+     SelfHarmMultimodalEvaluator,
+     SexualMultimodalEvaluator,
+     ViolenceMultimodalEvaluator,
+ )
+ from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
  from ._evaluators._f1_score import F1ScoreEvaluator
  from ._evaluators._fluency import FluencyEvaluator
  from ._evaluators._gleu import GleuScoreEvaluator
  from ._evaluators._groundedness import GroundednessEvaluator
+ from ._evaluators._service_groundedness import GroundednessProEvaluator
  from ._evaluators._meteor import MeteorScoreEvaluator
  from ._evaluators._protected_material import ProtectedMaterialEvaluator
  from ._evaluators._qa import QAEvaluator
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
  from ._model_configurations import (
      AzureAIProject,
      AzureOpenAIModelConfiguration,
+     Conversation,
+     EvaluationResult,
      EvaluatorConfig,
+     Message,
      OpenAIModelConfiguration,
  )
 
@@ -37,6 +49,7 @@ __all__ = [
      "F1ScoreEvaluator",
      "FluencyEvaluator",
      "GroundednessEvaluator",
+     "GroundednessProEvaluator",
      "RelevanceEvaluator",
      "SimilarityEvaluator",
      "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
      "AzureOpenAIModelConfiguration",
      "OpenAIModelConfiguration",
      "EvaluatorConfig",
+     "Conversation",
+     "Message",
+     "EvaluationResult",
+     "ContentSafetyMultimodalEvaluator",
+     "HateUnfairnessMultimodalEvaluator",
+     "SelfHarmMultimodalEvaluator",
+     "SexualMultimodalEvaluator",
+     "ViolenceMultimodalEvaluator",
+     "ProtectedMaterialMultimodalEvaluator",
  ]
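Given the new public exports above, the multimodal evaluators and the `Conversation`/`Message` types can now be imported from the package root. A short, hypothetical sketch, assuming the multimodal evaluators take the Azure AI project details and a credential the way the existing content-safety evaluators do, and that they accept an OpenAI-style message payload (project values and message content are placeholders):

```python
from azure.ai.evaluation import Conversation, ViolenceMultimodalEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<ai-project-name>",         # placeholder
}

# Assumed constructor shape: the multimodal evaluators are service-backed,
# so they take project details and a credential rather than a model_config.
violence_multimodal = ViolenceMultimodalEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# `Conversation` / `Message` are now exported for typing conversation payloads;
# the message content below is illustrative only.
conversation: Conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "<url-or-data-uri-of-image>"}},  # placeholder
        ]},
        {"role": "assistant", "content": [{"type": "text", "text": "It shows a mountain tent."}]},
    ]
}
result = violence_multimodal(conversation=conversation)
```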
@@ -6,6 +6,9 @@ from enum import Enum
  from azure.core import CaseInsensitiveEnumMeta
 
 
+ PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
  class CommonConstants:
      """Define common constants."""
 
@@ -35,6 +38,7 @@ class Tasks:
      CONTENT_HARM = "content harm"
      PROTECTED_MATERIAL = "protected material"
      XPIA = "xpia"
+     GROUNDEDNESS = "groundedness"
 
 
  class _InternalAnnotationTasks:
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
      SEXUAL = "sexual"
      PROTECTED_MATERIAL = "protected_material"
      XPIA = "xpia"
+     GROUNDEDNESS = "generic_groundedness"
 
 
  class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -5,6 +5,8 @@
  import math
  from typing import List
 
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
 
  def list_sum(lst: List[float]) -> float:
      return sum(lst)
@@ -15,4 +17,13 @@ def list_mean(lst: List[float]) -> float:
 
 
  def list_mean_nan_safe(lst: List[float]) -> float:
+     msg = "All score values are NaN. The mean cannot be calculated."
+     if all(math.isnan(l) for l in lst):
+         raise EvaluationException(
+             message=msg,
+             internal_message=msg,
+             blame=ErrorBlame.USER_ERROR,
+             category=ErrorCategory.INVALID_VALUE,
+             target=ErrorTarget.CONVERSATION,
+         )
      return list_mean([l for l in lst if not math.isnan(l)])
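To illustrate what the new guard above means for callers of this private helper (the module path is taken from the file listing in this diff; the score values are illustrative):

```python
from azure.ai.evaluation._common.math import list_mean_nan_safe
from azure.ai.evaluation._exceptions import EvaluationException

# NaN entries are still skipped when at least one real score is present.
print(list_mean_nan_safe([4.0, float("nan"), 2.0]))  # 3.0

# An all-NaN list now raises EvaluationException rather than producing a mean.
try:
    list_mean_nan_safe([float("nan"), float("nan")])
except EvaluationException as exc:
    print(exc)  # "All score values are NaN. The mean cannot be calculated."
```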