azure-ai-evaluation 1.0.0__tar.gz → 1.0.0b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208) hide show
  1. azure_ai_evaluation-1.0.0b1/CHANGELOG.md +17 -0
  2. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/MANIFEST.in +0 -2
  3. azure_ai_evaluation-1.0.0b1/PKG-INFO +377 -0
  4. azure_ai_evaluation-1.0.0b1/README.md +323 -0
  5. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/__init__.py +4 -26
  6. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_common/constants.py +2 -9
  7. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/rai_service.py +452 -0
  8. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_common/utils.py +87 -0
  9. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_constants.py +6 -28
  10. {azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run → azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client}/__init__.py +2 -3
  11. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py → azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +8 -25
  12. {azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run → azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client}/code_client.py +30 -68
  13. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  14. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  15. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  16. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  17. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluate/_utils.py +47 -108
  18. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  19. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  25. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  26. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  27. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  28. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py → azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +35 -24
  29. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  30. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  31. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  32. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  33. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  34. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  35. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  36. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  37. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  38. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  39. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  40. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  41. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  42. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  43. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  44. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  45. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  46. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  47. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  48. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  49. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  50. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  51. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  52. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_exceptions.py +7 -28
  53. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_http_utils.py +134 -205
  54. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/_model_configurations.py +27 -0
  55. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_version.py +1 -1
  56. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/__init__.py +2 -3
  57. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  58. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  59. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_constants.py +1 -11
  60. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  61. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  62. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  63. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  64. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  65. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  66. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  67. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  68. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  69. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  70. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  71. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  72. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_tracing.py +28 -25
  73. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_utils.py +13 -34
  74. azure_ai_evaluation-1.0.0b1/azure/ai/evaluation/simulator/simulator.py +579 -0
  75. azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/PKG-INFO +377 -0
  76. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure_ai_evaluation.egg-info/SOURCES.txt +17 -46
  77. azure_ai_evaluation-1.0.0b1/azure_ai_evaluation.egg-info/requires.txt +16 -0
  78. azure_ai_evaluation-1.0.0b1/pyproject.toml +6 -0
  79. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/setup.py +6 -7
  80. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/conftest.py +9 -57
  81. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +2 -9
  82. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/e2etests/target_fn.py +0 -18
  83. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/e2etests/test_adv_simulator.py +24 -51
  84. azure_ai_evaluation-1.0.0b1/tests/e2etests/test_builtin_evaluators.py +514 -0
  85. azure_ai_evaluation-1.0.0b1/tests/e2etests/test_evaluate.py +520 -0
  86. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/e2etests/test_metrics_upload.py +4 -10
  87. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_batch_run_context.py +8 -8
  88. azure_ai_evaluation-1.0.0b1/tests/unittests/test_built_in_evaluator.py +46 -0
  89. azure_ai_evaluation-1.0.0b1/tests/unittests/test_chat_evaluator.py +109 -0
  90. azure_ai_evaluation-1.0.0b1/tests/unittests/test_content_safety_chat_evaluator.py +82 -0
  91. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_content_safety_rai_script.py +26 -72
  92. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_eval_run.py +4 -33
  93. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_evaluate.py +54 -230
  94. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_evaluate_telemetry.py +10 -11
  95. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_jailbreak_simulator.py +3 -4
  96. azure_ai_evaluation-1.0.0b1/tests/unittests/test_non_adv_simulator.py +131 -0
  97. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_simulator.py +7 -6
  98. azure_ai_evaluation-1.0.0b1/tests/unittests/test_utils.py +20 -0
  99. azure_ai_evaluation-1.0.0/CHANGELOG.md +0 -214
  100. azure_ai_evaluation-1.0.0/NOTICE.txt +0 -70
  101. azure_ai_evaluation-1.0.0/PKG-INFO +0 -595
  102. azure_ai_evaluation-1.0.0/README.md +0 -345
  103. azure_ai_evaluation-1.0.0/TROUBLESHOOTING.md +0 -61
  104. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/_experimental.py +0 -172
  105. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/math.py +0 -89
  106. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/rai_service.py +0 -632
  107. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_common/utils.py +0 -445
  108. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  109. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  110. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluate/_evaluate.py +0 -850
  111. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -107
  112. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -99
  113. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  114. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  115. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  116. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  117. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -144
  118. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +0 -129
  119. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +0 -123
  120. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +0 -125
  121. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +0 -126
  122. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -89
  123. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -104
  124. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -86
  125. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -144
  126. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  127. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  128. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  129. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  130. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  131. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  132. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  133. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  134. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  135. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -113
  136. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -114
  137. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -100
  138. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -9
  139. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  140. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  141. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -9
  142. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  143. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +0 -140
  144. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -125
  145. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_model_configurations.py +0 -123
  146. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/__init__.py +0 -3
  147. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  148. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  149. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  150. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  151. azure_ai_evaluation-1.0.0/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  152. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  153. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  154. azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_simulator.py +0 -716
  155. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/PKG-INFO +0 -595
  156. azure_ai_evaluation-1.0.0/azure_ai_evaluation.egg-info/requires.txt +0 -10
  157. azure_ai_evaluation-1.0.0/pyproject.toml +0 -19
  158. azure_ai_evaluation-1.0.0/samples/README.md +0 -57
  159. azure_ai_evaluation-1.0.0/samples/data/evaluate_test_data.jsonl +0 -3
  160. azure_ai_evaluation-1.0.0/samples/evaluation_samples_common.py +0 -60
  161. azure_ai_evaluation-1.0.0/samples/evaluation_samples_evaluate.py +0 -395
  162. azure_ai_evaluation-1.0.0/samples/evaluation_samples_simulate.py +0 -249
  163. azure_ai_evaluation-1.0.0/tests/__pf_service_isolation.py +0 -28
  164. azure_ai_evaluation-1.0.0/tests/e2etests/__init__.py +0 -0
  165. azure_ai_evaluation-1.0.0/tests/e2etests/test_builtin_evaluators.py +0 -997
  166. azure_ai_evaluation-1.0.0/tests/e2etests/test_evaluate.py +0 -926
  167. azure_ai_evaluation-1.0.0/tests/e2etests/test_sim_and_eval.py +0 -129
  168. azure_ai_evaluation-1.0.0/tests/unittests/test_built_in_evaluator.py +0 -128
  169. azure_ai_evaluation-1.0.0/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -46
  170. azure_ai_evaluation-1.0.0/tests/unittests/test_non_adv_simulator.py +0 -362
  171. azure_ai_evaluation-1.0.0/tests/unittests/test_utils.py +0 -258
  172. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/__init__.py +0 -0
  173. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/__init__.py +0 -0
  174. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_common/__init__.py +0 -0
  175. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
  176. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
  177. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
  178. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
  179. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  180. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
  181. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
  182. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
  183. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
  184. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
  185. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
  186. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
  187. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
  188. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
  189. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
  190. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
  191. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/_user_agent.py +0 -0
  192. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/py.typed +0 -0
  193. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
  194. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  195. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
  196. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
  197. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  198. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  199. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
  200. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/setup.cfg +0 -0
  201. {azure_ai_evaluation-1.0.0/azure/ai/evaluation/simulator/_prompty → azure_ai_evaluation-1.0.0b1/tests}/__init__.py +0 -0
  202. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/__openai_patcher.py +0 -0
  203. {azure_ai_evaluation-1.0.0/tests → azure_ai_evaluation-1.0.0b1/tests/e2etests}/__init__.py +0 -0
  204. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_content_safety_defect_rate.py +1 -1
  205. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
  206. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_save_eval.py +0 -0
  207. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
  208. {azure_ai_evaluation-1.0.0 → azure_ai_evaluation-1.0.0b1}/tests/unittests/test_synthetic_conversation_bot.py +1 -1
@@ -0,0 +1,17 @@
1
+ # Release History
2
+
3
+ ## 1.0.0b1 (2024-09-20)
4
+
5
+ ### Breaking Changes
6
+
7
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
8
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
9
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
10
+ - Model configurations classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
11
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
12
+
13
+ ### Features Added
14
+
15
+ - First preview
16
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
17
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
@@ -4,5 +4,3 @@ include azure/__init__.py
4
4
  include azure/ai/__init__.py
5
5
  include azure/ai/evaluation/py.typed
6
6
  recursive-include azure/ai/evaluation *.prompty
7
- include azure/ai/evaluation/simulator/_data_sources/grounding.json
8
- recursive-include samples *
@@ -0,0 +1,377 @@
1
+ Metadata-Version: 2.1
2
+ Name: azure-ai-evaluation
3
+ Version: 1.0.0b1
4
+ Summary: Microsoft Azure Evaluation Library for Python
5
+ Home-page: https://github.com/Azure/azure-sdk-for-python
6
+ Author: Microsoft Corporation
7
+ Author-email: azuresdkengsysadmins@microsoft.com
8
+ License: MIT License
9
+ Project-URL: Bug Reports, https://github.com/Azure/azure-sdk-for-python/issues
10
+ Project-URL: Source, https://github.com/Azure/azure-sdk-for-python
11
+ Keywords: azure,azure sdk
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Programming Language :: Python
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: License :: OSI Approved :: MIT License
21
+ Classifier: Operating System :: OS Independent
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: promptflow-devkit>=1.15.0
25
+ Requires-Dist: promptflow-core>=1.15.0
26
+ Requires-Dist: numpy>=1.23.2; python_version < "3.12"
27
+ Requires-Dist: numpy>=1.26.4; python_version >= "3.12"
28
+ Requires-Dist: pyjwt>=2.8.0
29
+ Requires-Dist: azure-identity>=1.12.0
30
+ Requires-Dist: azure-core>=1.30.2
31
+ Requires-Dist: nltk>=3.9.1
32
+ Requires-Dist: rouge-score>=0.1.2
33
+ Provides-Extra: pf-azure
34
+ Requires-Dist: promptflow-azure<2.0.0,>=1.15.0; extra == "pf-azure"
35
+
36
+ # Azure AI Evaluation client library for Python
37
+
38
+ ## Getting started
39
+
40
+ ### Install the package
41
+
42
+ Install the Azure AI Evaluation library for Python with:
43
+
44
+ ```bash
45
+ pip install azure-ai-evaluation
46
+ ```
47
+
48
+ ## Key concepts
49
+
50
+ Evaluators are custom or prebuilt classes or functions that are designed to measure the quality of the outputs from language models.
51
+
52
+ ## Examples
53
+
54
+ Users can create evaluator runs on the local machine as shown in the example below:
55
+
56
+ ```python
57
+ import os
58
+ from pprint import pprint
59
+
60
+ from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator
61
+
62
+
63
+ def response_length(response, **kwargs):
64
+ return {"value": len(response)}
65
+
66
+
67
+ if __name__ == "__main__":
68
+ # Built-in evaluators
69
+ # Initialize Azure OpenAI Model Configuration
70
+ model_config = {
71
+ "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
72
+ "api_key": os.environ.get("AZURE_OPENAI_KEY"),
73
+ "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
74
+ }
75
+
76
+ # Initializing Relevance Evaluator
77
+ relevance_eval = RelevanceEvaluator(model_config)
78
+
79
+ # Running Relevance Evaluator on single input row
80
+ relevance_score = relevance_eval(
81
+ response="The Alpine Explorer Tent is the most waterproof.",
82
+ context="From our product list,"
83
+ " the alpine explorer tent is the most waterproof."
84
+ " The Adventure Dining Table has higher weight.",
85
+ query="Which tent is the most waterproof?",
86
+ )
87
+
88
+ pprint(relevance_score)
89
+ # {'gpt_relevance': 5.0}
90
+
91
+ # Content Safety Evaluator
92
+
93
+ # Initialize Project Scope
94
+ azure_ai_project = {
95
+ "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
96
+ "resource_group_name": "rg-test",
97
+ "project_name": "project-test",
98
+ }
99
+
100
+ violence_eval = ViolenceEvaluator(azure_ai_project)
101
+ violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
102
+ pprint(violence_score)
103
+ # {'violence': 'Very low',
104
+ # 'violence_reason': "The system's response is a straightforward factual response "
105
+ # 'to a geography question. There is no violent content or '
106
+ # 'language present.',
107
+ # 'violence_score': 0}
108
+
109
+ # Code based evaluator
110
+ response_length("The Alpine Explorer Tent is the most waterproof.")
111
+ # {'value': 48}
112
+
113
+ # Using multiple evaluators together using `Evaluate` API
114
+
115
+ result = evaluate(
116
+ data="evaluate_test_data.jsonl",
117
+ evaluators={
118
+ "response_length": response_length,
119
+ "violence": violence_eval,
120
+ },
121
+ )
122
+
123
+ pprint(result)
124
+ ```
125
+ ## Simulator
126
+
127
+ Sample application prompty
128
+
129
+ ```yaml
130
+ ---
131
+ name: ApplicationPrompty
132
+ description: Simulates an application
133
+ model:
134
+ api: chat
135
+ configuration:
136
+ type: azure_openai
137
+ azure_deployment: ${env:AZURE_DEPLOYMENT}
138
+ api_key: ${env:AZURE_OPENAI_API_KEY}
139
+ azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
140
+ parameters:
141
+ temperature: 0.0
142
+ top_p: 1.0
143
+ presence_penalty: 0
144
+ frequency_penalty: 0
145
+ response_format:
146
+ type: text
147
+
148
+ inputs:
149
+ conversation_history:
150
+ type: dict
151
+
152
+ ---
153
+ system:
154
+ You are a helpful assistant and you're helping with the user's query. Keep the conversation engaging and interesting.
155
+
156
+ Output with a string that continues the conversation, responding to the latest message from the user, given the conversation history:
157
+ {{ conversation_history }}
158
+
159
+ ```
160
+ Application code:
161
+
162
+ ```python
163
+ import json
164
+ import asyncio
165
+ from typing import Any, Dict, List, Optional
166
+ from azure.ai.evaluation.simulator import Simulator
167
+ from promptflow.client import load_flow
168
+ from azure.identity import DefaultAzureCredential
169
+ import os
170
+
171
+ azure_ai_project = {
172
+ "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
173
+ "resource_group_name": os.environ.get("RESOURCE_GROUP"),
174
+ "project_name": os.environ.get("PROJECT_NAME"),
175
+ "credential": DefaultAzureCredential(),
176
+ }
177
+
178
+ import wikipedia
179
+ wiki_search_term = "Leonardo da vinci"
180
+ wiki_title = wikipedia.search(wiki_search_term)[0]
181
+ wiki_page = wikipedia.page(wiki_title)
182
+ text = wiki_page.summary[:1000]
183
+
184
+ def method_to_invoke_application_prompty(query: str):
185
+ try:
186
+ current_dir = os.path.dirname(__file__)
187
+ prompty_path = os.path.join(current_dir, "application.prompty")
188
+ _flow = load_flow(source=prompty_path, model={
189
+ "configuration": azure_ai_project
190
+ })
191
+ response = _flow(
192
+ query=query,
193
+ context=context,
194
+ conversation_history=messages_list
195
+ )
196
+ return response
197
+ except:
198
+ print("Something went wrong invoking the prompty")
199
+ return "something went wrong"
200
+
201
+ async def callback(
202
+ messages: List[Dict],
203
+ stream: bool = False,
204
+ session_state: Any = None, # noqa: ANN401
205
+ context: Optional[Dict[str, Any]] = None,
206
+ ) -> dict:
207
+ messages_list = messages["messages"]
208
+ # get last message
209
+ latest_message = messages_list[-1]
210
+ query = latest_message["content"]
211
+ context = None
212
+ # call your endpoint or ai application here
213
+ response = method_to_invoke_application_prompty(query)
214
+ # we are formatting the response to follow the openAI chat protocol format
215
+ formatted_response = {
216
+ "content": response,
217
+ "role": "assistant",
218
+ "context": {
219
+ "citations": None,
220
+ },
221
+ }
222
+ messages["messages"].append(formatted_response)
223
+ return {"messages": messages["messages"], "stream": stream, "session_state": session_state, "context": context}
224
+
225
+
226
+
227
+ async def main():
228
+ simulator = Simulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
229
+ outputs = await simulator(
230
+ target=callback,
231
+ text=text,
232
+ num_queries=2,
233
+ max_conversation_turns=4,
234
+ user_persona=[
235
+ f"I am a student and I want to learn more about {wiki_search_term}",
236
+ f"I am a teacher and I want to teach my students about {wiki_search_term}"
237
+ ],
238
+ )
239
+ print(json.dumps(outputs))
240
+
241
+ if __name__ == "__main__":
242
+ os.environ["AZURE_SUBSCRIPTION_ID"] = ""
243
+ os.environ["RESOURCE_GROUP"] = ""
244
+ os.environ["PROJECT_NAME"] = ""
245
+ os.environ["AZURE_OPENAI_API_KEY"] = ""
246
+ os.environ["AZURE_OPENAI_ENDPOINT"] = ""
247
+ os.environ["AZURE_DEPLOYMENT"] = ""
248
+ asyncio.run(main())
249
+ print("done!")
250
+ ```
251
+
252
+ Simulators allow users to generate synthetic data using their application. Simulator expects the user to have a callback method that invokes
253
+ their AI application. Here's a sample of a callback which invokes AsyncAzureOpenAI:
254
+
255
+ ```python
256
+ from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
257
+ from azure.identity import DefaultAzureCredential
258
+ from typing import Any, Dict, List, Optional
259
+ import asyncio
260
+
261
+
262
+ azure_ai_project = {
263
+ "subscription_id": <subscription_id>,
264
+ "resource_group_name": <resource_group_name>,
265
+ "project_name": <project_name>
266
+ }
267
+
268
+ async def callback(
269
+ messages: List[Dict],
270
+ stream: bool = False,
271
+ session_state: Any = None,
272
+ context: Optional[Dict[str, Any]] = None
273
+ ) -> dict:
274
+ messages_list = messages["messages"]
275
+ # get last message
276
+ latest_message = messages_list[-1]
277
+ query = latest_message["content"]
278
+ context = None
279
+ if 'file_content' in messages["template_parameters"]:
280
+ query += messages["template_parameters"]['file_content']
281
+ # the next few lines explains how to use the AsyncAzureOpenAI's chat.completions
282
+ # to respond to the simulator. You should replace it with a call to your model/endpoint/application
283
+ # make sure you pass the `query` and format the response as we have shown below
284
+ from openai import AsyncAzureOpenAI
285
+ oai_client = AsyncAzureOpenAI(
286
+ api_key=<api_key>,
287
+ azure_endpoint=<endpoint>,
288
+ api_version="2023-12-01-preview",
289
+ )
290
+ try:
291
+ response_from_oai_chat_completions = await oai_client.chat.completions.create(messages=[{"content": query, "role": "user"}], model="gpt-4", max_tokens=300)
292
+ except Exception as e:
293
+ print(f"Error: {e}")
294
+ # to continue the conversation, return the messages; otherwise you can fail the adversarial simulation by raising an exception
295
+ message = {
296
+ "content": "Something went wrong. Check the exception e for more details.",
297
+ "role": "assistant",
298
+ "context": None,
299
+ }
300
+ messages["messages"].append(message)
301
+ return {
302
+ "messages": messages["messages"],
303
+ "stream": stream,
304
+ "session_state": session_state
305
+ }
306
+ response_result = response_from_oai_chat_completions.choices[0].message.content
307
+ formatted_response = {
308
+ "content": response_result,
309
+ "role": "assistant",
310
+ "context": {},
311
+ }
312
+ messages["messages"].append(formatted_response)
313
+ return {
314
+ "messages": messages["messages"],
315
+ "stream": stream,
316
+ "session_state": session_state,
317
+ "context": context
318
+ }
319
+
320
+ ```
321
+ ### Adversarial QA:
322
+ ```python
323
+ scenario = AdversarialScenario.ADVERSARIAL_QA
324
+ simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
325
+
326
+ outputs = asyncio.run(
327
+ simulator(
328
+ scenario=scenario,
329
+ max_conversation_turns=1,
330
+ max_simulation_results=3,
331
+ target=callback
332
+ )
333
+ )
334
+
335
+ print(outputs.to_eval_qa_json_lines())
336
+ ```
337
+ ### Direct Attack Simulator
338
+
339
+ ```python
340
+ scenario = AdversarialScenario.ADVERSARIAL_QA
341
+ simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
342
+
343
+ outputs = asyncio.run(
344
+ simulator(
345
+ scenario=scenario,
346
+ max_conversation_turns=1,
347
+ max_simulation_results=2,
348
+ target=callback
349
+ )
350
+ )
351
+
352
+ print(outputs)
353
+ ```
354
+ ## Troubleshooting
355
+
356
+ ## Next steps
357
+
358
+ ## Contributing
359
+
360
+
361
+ # Release History
362
+
363
+ ## 1.0.0b1 (2024-09-20)
364
+
365
+ ### Breaking Changes
366
+
367
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
368
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
369
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
370
+ - Model configurations classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
371
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
372
+
373
+ ### Features Added
374
+
375
+ - First preview
376
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
377
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information