azure-ai-evaluation 1.0.0b2__tar.gz → 1.0.0b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. azure_ai_evaluation-1.0.0b3/CHANGELOG.md +81 -0
  2. {azure_ai_evaluation-1.0.0b2/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.0.0b3}/PKG-INFO +59 -1
  3. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/__init__.py +9 -5
  4. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/utils.py +24 -9
  5. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_constants.py +4 -0
  6. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_evaluate.py +57 -39
  7. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +70 -0
  8. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -5
  9. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  10. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_eval.py +302 -0
  11. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +79 -0
  12. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +99 -0
  13. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  14. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +55 -0
  15. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +55 -0
  16. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +55 -0
  17. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +55 -0
  18. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_eci/_eci.py +62 -0
  19. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +72 -0
  20. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -5
  21. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +71 -0
  22. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -5
  23. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -0
  24. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +77 -0
  25. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -5
  26. {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/__init__.py +2 -2
  27. {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/_retrieval.py +16 -22
  28. {azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval → azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_retrieval}/retrieval.prompty +0 -5
  29. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -11
  30. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  31. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_evaluators/_xpia/xpia.py +65 -0
  32. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_exceptions.py +0 -1
  33. azure_ai_evaluation-1.0.0b3/azure/ai/evaluation/_model_configurations.py +55 -0
  34. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_version.py +1 -1
  35. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +2 -1
  36. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_simulator.py +19 -8
  37. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3/azure_ai_evaluation.egg-info}/PKG-INFO +59 -1
  38. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/SOURCES.txt +8 -10
  39. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/conftest.py +22 -2
  40. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_builtin_evaluators.py +146 -186
  41. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_evaluate.py +18 -11
  42. azure_ai_evaluation-1.0.0b3/tests/e2etests/test_sim_and_eval.py +134 -0
  43. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_built_in_evaluator.py +4 -9
  44. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate.py +2 -2
  45. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_non_adv_simulator.py +2 -3
  46. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_simulator.py +2 -2
  47. azure_ai_evaluation-1.0.0b2/CHANGELOG.md +0 -23
  48. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  49. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +0 -9
  50. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -117
  51. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  52. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +0 -78
  53. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +0 -76
  54. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +0 -76
  55. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +0 -76
  56. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -99
  57. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -117
  58. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -118
  59. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -104
  60. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  61. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  62. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -126
  63. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -139
  64. azure_ai_evaluation-1.0.0b2/azure/ai/evaluation/_model_configurations.py +0 -27
  65. azure_ai_evaluation-1.0.0b2/tests/unittests/test_chat_evaluator.py +0 -109
  66. azure_ai_evaluation-1.0.0b2/tests/unittests/test_content_safety_chat_evaluator.py +0 -82
  67. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/MANIFEST.in +0 -0
  68. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/README.md +0 -0
  69. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/__init__.py +0 -0
  70. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/__init__.py +0 -0
  71. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/__init__.py +0 -0
  72. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/constants.py +0 -0
  73. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_common/rai_service.py +0 -0
  74. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
  75. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +0 -0
  76. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +0 -0
  77. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +0 -0
  78. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -0
  79. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_eval_run.py +0 -0
  80. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
  81. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluate/_utils.py +0 -0
  82. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
  83. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
  84. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +0 -0
  85. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
  86. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +0 -0
  87. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -0
  88. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  89. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
  90. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +0 -0
  91. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
  92. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
  93. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +0 -0
  94. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
  95. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
  96. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +0 -0
  97. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
  98. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
  99. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_qa/_qa.py +0 -0
  100. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
  101. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
  102. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +0 -0
  103. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
  104. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
  105. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_http_utils.py +0 -0
  106. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/_user_agent.py +0 -0
  107. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/py.typed +0 -0
  108. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/__init__.py +0 -0
  109. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
  110. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_adversarial_simulator.py +0 -0
  111. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_constants.py +0 -0
  112. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
  113. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
  114. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
  115. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
  116. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
  117. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_experimental.py +0 -0
  118. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
  119. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
  120. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +0 -0
  121. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
  122. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
  123. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
  124. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
  125. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
  126. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
  127. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  128. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
  129. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_tracing.py +0 -0
  130. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure/ai/evaluation/simulator/_utils.py +0 -0
  131. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  132. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  133. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/requires.txt +0 -0
  134. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
  135. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/pyproject.toml +0 -0
  136. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/setup.cfg +0 -0
  137. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/setup.py +0 -0
  138. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/__init__.py +0 -0
  139. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/__openai_patcher.py +0 -0
  140. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/__init__.py +0 -0
  141. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
  142. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/target_fn.py +0 -0
  143. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_adv_simulator.py +0 -0
  144. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/e2etests/test_metrics_upload.py +0 -0
  145. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_batch_run_context.py +0 -0
  146. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_defect_rate.py +0 -0
  147. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_content_safety_rai_script.py +0 -0
  148. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_eval_run.py +0 -0
  149. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluate_telemetry.py +0 -0
  150. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
  151. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_evaluators/test_inputs_evaluators.py +0 -0
  152. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_jailbreak_simulator.py +0 -0
  153. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_save_eval.py +0 -0
  154. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
  155. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
  156. {azure_ai_evaluation-1.0.0b2 → azure_ai_evaluation-1.0.0b3}/tests/unittests/test_utils.py +0 -0
@@ -0,0 +1,81 @@
+ # Release History
+
+ ## 1.0.0b3 (2024-10-01)
+
+ ### Features Added
+
+ - Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+ - The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+   - `ViolenceEvaluator`
+   - `SexualEvaluator`
+   - `SelfHarmEvaluator`
+   - `HateUnfairnessEvaluator`
+   - `ProtectedMaterialEvaluator`
+   - `IndirectAttackEvaluator`
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+ - Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+ ### Breaking Changes
+
+ - Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+ - The `evaluator_config` parameter of `evaluate` now maps evaluator names to an `EvaluatorConfig` dictionary, which is a `TypedDict`. The
+   `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+ Before:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "query": "${data.question}",
+             "response": "${data.answer}",
+         }
+     },
+     ...
+ )
+ ```
+
+ After:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "column_mapping": {
+                 "query": "${data.question}",
+                 "response": "${data.answer}",
+             }
+         }
+     },
+     ...
+ )
+ ```
+
+ ### Bugs Fixed
+
+ - Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
+ ## 1.0.0b2 (2024-09-24)
+
+ ### Breaking Changes
+
+ - `data` and `evaluators` are now required keywords in `evaluate`.
+
+ ## 1.0.0b1 (2024-09-20)
+
+ ### Breaking Changes
+
+ - The `synthetic` namespace has been renamed to `simulator`, and sub-namespaces under this module have been removed
+ - The `evaluate` and `evaluators` namespaces have been removed, and everything previously exposed in those modules has been added to the root namespace `azure.ai.evaluation`
+ - The parameter name `project_scope` in content safety evaluators has been renamed to `azure_ai_project` for consistency with the evaluate API and simulators.
+ - Model configuration classes are now of type `TypedDict` and are exposed in the `azure.ai.evaluation` module instead of coming from `promptflow.core`.
+ - Updated the parameter names for `question` and `answer` in built-in evaluators to more generic terms: `query` and `response`.
+
+ ### Features Added
+
+ - First preview
+ - This package is a port of `promptflow-evals`. New features will be added only to this package moving forward.
+ - Added a `TypedDict` for `AzureAIProject` that allows for better intellisense and type checking when passing in project information
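The release notes above list the evaluators that accept the new `conversation` input but do not show a call. A minimal sketch of what multi-turn usage might look like, assuming the conversation shape described in the evaluator docstrings later in this diff (a dict with a "messages" list of "role"/"content" turns); the environment variables and turns are placeholders:

```python
import os

from azure.ai.evaluation import CoherenceEvaluator

# Placeholder Azure OpenAI settings pulled from the environment.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

coherence_eval = CoherenceEvaluator(model_config=model_config)

# A conversation is a dict with a "messages" list of role/content turns.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
        {"role": "user", "content": "How large is its population?"},
        {"role": "assistant", "content": "Roughly 14 million people live in Tokyo."},
    ]
}

# With more than one query/response pair, the per-turn results are aggregated.
result = coherence_eval(conversation=conversation)
```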
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: azure-ai-evaluation
- Version: 1.0.0b2
+ Version: 1.0.0b3
  Summary: Microsoft Azure Evaluation Library for Python
  Home-page: https://github.com/Azure/azure-sdk-for-python
  Author: Microsoft Corporation
@@ -426,6 +426,64 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
  # Release History
 
+ ## 1.0.0b3 (2024-10-01)
+
+ ### Features Added
+
+ - Added `type` field to `AzureOpenAIModelConfiguration` and `OpenAIModelConfiguration`
+ - The following evaluators now support `conversation` as an alternative input to their usual single-turn inputs:
+   - `ViolenceEvaluator`
+   - `SexualEvaluator`
+   - `SelfHarmEvaluator`
+   - `HateUnfairnessEvaluator`
+   - `ProtectedMaterialEvaluator`
+   - `IndirectAttackEvaluator`
+   - `CoherenceEvaluator`
+   - `RelevanceEvaluator`
+   - `FluencyEvaluator`
+   - `GroundednessEvaluator`
+ - Surfaced `RetrievalScoreEvaluator`, formerly an internal part of `ChatEvaluator`, as a standalone conversation-only evaluator.
+
+ ### Breaking Changes
+
+ - Removed `ContentSafetyChatEvaluator` and `ChatEvaluator`
+ - The `evaluator_config` parameter of `evaluate` now maps evaluator names to an `EvaluatorConfig` dictionary, which is a `TypedDict`. The
+   `column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+ Before:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "query": "${data.question}",
+             "response": "${data.answer}",
+         }
+     },
+     ...
+ )
+ ```
+
+ After:
+ ```python
+ evaluate(
+     ...,
+     evaluator_config={
+         "hate_unfairness": {
+             "column_mapping": {
+                 "query": "${data.question}",
+                 "response": "${data.answer}",
+             }
+         }
+     },
+     ...
+ )
+ ```
+
+ ### Bugs Fixed
+
+ - Fixed issue where Entra ID authentication was not working with `AzureOpenAIModelConfiguration`
+
  ## 1.0.0b2 (2024-09-24)
 
  ### Breaking Changes
@@ -4,10 +4,8 @@
 
  from ._evaluate._evaluate import evaluate
  from ._evaluators._bleu import BleuScoreEvaluator
- from ._evaluators._chat import ChatEvaluator
  from ._evaluators._coherence import CoherenceEvaluator
  from ._evaluators._content_safety import (
-     ContentSafetyChatEvaluator,
      ContentSafetyEvaluator,
      HateUnfairnessEvaluator,
      SelfHarmEvaluator,
@@ -22,10 +20,16 @@ from ._evaluators._meteor import MeteorScoreEvaluator
  from ._evaluators._protected_material import ProtectedMaterialEvaluator
  from ._evaluators._qa import QAEvaluator
  from ._evaluators._relevance import RelevanceEvaluator
+ from ._evaluators._retrieval import RetrievalEvaluator
  from ._evaluators._rouge import RougeScoreEvaluator, RougeType
  from ._evaluators._similarity import SimilarityEvaluator
  from ._evaluators._xpia import IndirectAttackEvaluator
- from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from ._model_configurations import (
+     AzureAIProject,
+     AzureOpenAIModelConfiguration,
+     OpenAIModelConfiguration,
+     EvaluatorConfig,
+ )
 
  __all__ = [
      "evaluate",
@@ -36,21 +40,21 @@ __all__ = [
      "RelevanceEvaluator",
      "SimilarityEvaluator",
      "QAEvaluator",
-     "ChatEvaluator",
      "ViolenceEvaluator",
      "SexualEvaluator",
      "SelfHarmEvaluator",
      "HateUnfairnessEvaluator",
      "ContentSafetyEvaluator",
-     "ContentSafetyChatEvaluator",
      "IndirectAttackEvaluator",
      "BleuScoreEvaluator",
      "GleuScoreEvaluator",
      "MeteorScoreEvaluator",
+     "RetrievalEvaluator",
      "RougeScoreEvaluator",
      "RougeType",
      "ProtectedMaterialEvaluator",
      "AzureAIProject",
      "AzureOpenAIModelConfiguration",
      "OpenAIModelConfiguration",
+     "EvaluatorConfig",
  ]
@@ -3,12 +3,13 @@
  # ---------------------------------------------------------
 
  import threading
- from typing import List, Optional, Union
+ from typing import List, Union
 
  import nltk
  import numpy as np
 
  from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 
  from . import constants
 
@@ -70,18 +71,32 @@ def nltk_tokenize(text: str) -> List[str]:
      return list(tokens)
 
 
- def ensure_api_version_in_aoai_model_config(
+ def parse_model_config_type(
      model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-     default_api_version: str,
  ) -> None:
      if "azure_endpoint" in model_config or "azure_deployment" in model_config:
-         model_config["api_version"] = model_config.get("api_version", default_api_version)
+         model_config["type"] = AZURE_OPENAI_TYPE
+     else:
+         model_config["type"] = OPENAI_TYPE
 
 
- def ensure_user_agent_in_aoai_model_config(
+ def construct_prompty_model_config(
      model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-     prompty_model_config: dict,
-     user_agent: Optional[str] = None,
- ) -> None:
-     if user_agent and ("azure_endpoint" in model_config or "azure_deployment" in model_config):
+     default_api_version: str,
+     user_agent: str,
+ ) -> dict:
+     parse_model_config_type(model_config)
+
+     if model_config["type"] == AZURE_OPENAI_TYPE:
+         model_config["api_version"] = model_config.get("api_version", default_api_version)
+
+     prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+     # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+     # https://github.com/encode/httpx/discussions/2959
+     prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+     if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
          prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
+
+     return prompty_model_config
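Based only on the signature shown above, a sketch of how the reworked helper might be invoked; this is private API, and the API version, deployment, and user agent values are illustrative:

```python
from azure.ai.evaluation._common.utils import construct_prompty_model_config

# Placeholder Azure OpenAI configuration; parse_model_config_type will tag it
# with type="azure_openai" because azure_endpoint/azure_deployment are present.
model_config = {
    "azure_endpoint": "https://example.openai.azure.com",  # placeholder
    "api_key": "<api-key>",
    "azure_deployment": "my-gpt-deployment",  # placeholder
}

prompty_config = construct_prompty_model_config(
    model_config,
    default_api_version="2024-02-15-preview",  # illustrative default
    user_agent="azure-ai-evaluation/1.0.0b3",  # illustrative user agent string
)

# prompty_config now holds {"configuration": ..., "parameters": {"extra_headers": ...}}
# with "Connection: close" always set and "x-ms-useragent" added for Azure OpenAI configs.
```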
@@ -57,3 +57,7 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 
  OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
  OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
+
+ AZURE_OPENAI_TYPE = "azure_openai"
+
+ OPENAI_TYPE = "openai"
@@ -19,7 +19,7 @@ from .._constants import (
      Prefixes,
      _InternalEvaluationMetrics,
  )
- from .._model_configurations import AzureAIProject
+ from .._model_configurations import AzureAIProject, EvaluatorConfig
  from .._user_agent import USER_AGENT
  from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
  from ._utils import (
@@ -158,6 +158,12 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta
      ]
 
      missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+     if missing_inputs and "conversation" in required_inputs:
+         non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
+         if len(missing_inputs) == len(non_conversation_inputs) and [
+             input in non_conversation_inputs for input in missing_inputs
+         ]:
+             missing_inputs = []
      if missing_inputs:
          if not is_target_fn:
              msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
@@ -273,7 +279,7 @@ def _validate_columns(
      df: pd.DataFrame,
      evaluators: Dict[str, Any],
      target: Optional[Callable],
-     evaluator_config: Dict[str, Dict[str, str]],
+     column_mapping: Dict[str, Dict[str, str]],
  ) -> None:
      """
      Check that all columns needed by evaluator or target function are present.
@@ -284,8 +290,8 @@
      :type evaluators: Dict[str, Any]
      :param target: The callable to be applied to data set.
      :type target: Optional[Callable]
-     :param evaluator_config: The configuration for evaluators.
-     :type evaluator_config: Dict[str, Dict[str, str]]
+     :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+     :type column_mapping: Dict[str, Dict[str, str]]
      :raises EvaluationException: If column starts from "__outputs." while target is defined.
      """
      if target:
@@ -306,7 +312,7 @@
      else:
          for evaluator_name, evaluator in evaluators.items():
              # Apply column mapping
-             mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+             mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
              new_df = _apply_column_mapping(df, mapping_config)
 
              # Validate input data for evaluator
@@ -372,11 +378,11 @@ def _apply_target_to_data(
      return target_output, generated_columns, run
 
 
- def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-     """Process evaluator_config to replace ${target.} with ${data.}
+ def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+     """Process column_mapping to replace ${target.} with ${data.}
 
-     :param evaluator_config: The configuration for evaluators.
-     :type evaluator_config: Dict[str, Dict[str, str]]
+     :param column_mapping: The configuration for evaluators.
+     :type column_mapping: Dict[str, Dict[str, str]]
      :return: The processed configuration.
      :rtype: Dict[str, Dict[str, str]]
      """
@@ -385,15 +391,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
      unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-     if evaluator_config:
-         for evaluator, mapping_config in evaluator_config.items():
+     if column_mapping:
+         for evaluator, mapping_config in column_mapping.items():
              if isinstance(mapping_config, dict):
                  processed_config[evaluator] = {}
 
                  for map_to_key, map_value in mapping_config.items():
                      # Check if there's any unexpected reference other than ${target.} or ${data.}
                      if unexpected_references.search(map_value):
-                         msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                         msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                          raise EvaluationException(
                              message=msg,
                              internal_message=msg,
@@ -439,7 +445,7 @@ def evaluate(
      evaluators: Dict[str, Callable],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
-     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
      azure_ai_project: Optional[AzureAIProject] = None,
      output_path: Optional[str] = None,
      **kwargs,
@@ -458,10 +464,10 @@
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
      :paramtype target: Optional[Callable]
      :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-         keys as the column names in the evaluator input and values as the column names in the input data or data
-         generated by target.
-     :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+         names as keys and values that are dictionaries containing the column mappings. The column mappings should
+         be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+         input data or data generated by target.
+     :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
      :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
          the results will be saved to a file named `evaluation_results.json` in the folder.
      :paramtype output_path: Optional[str]
@@ -482,7 +488,7 @@
          model_config = {
              "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
              "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
          }
 
          coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +503,19 @@
              },
              evaluator_config={
                  "coherence": {
-                     "response": "${data.response}",
-                     "query": "${data.query}"
+                     "column_mapping": {
+                         "response": "${data.response}",
+                         "query": "${data.query}",
+                     },
                  },
                  "relevance": {
-                     "response": "${data.response}",
-                     "context": "${data.context}",
-                     "query": "${data.query}"
-                 }
-             }
+                     "column_mapping": {
+                         "response": "${data.response}",
+                         "context": "${data.context}",
+                         "query": "${data.query}",
+                     },
+                 },
+             },
          )
 
      """
@@ -544,13 +554,13 @@
          raise e
 
 
- def _evaluate( # pylint: disable=too-many-locals
+ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      *,
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      data: Optional[str] = None,
      evaluators: Optional[Dict[str, Callable]] = None,
-     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
      azure_ai_project: Optional[AzureAIProject] = None,
      output_path: Optional[str] = None,
      **kwargs,
@@ -560,8 +570,13 @@ def _evaluate( # pylint: disable=too-many-locals
      # Process evaluator config to replace ${target.} with ${data.}
      if evaluator_config is None:
          evaluator_config = {}
-     evaluator_config = _process_evaluator_config(evaluator_config)
-     _validate_columns(input_data_df, evaluators, target, evaluator_config)
+     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+     column_mapping = {
+         evaluator_name: evaluator_configuration.get("column_mapping", None)
+         for evaluator_name, evaluator_configuration in evaluator_config.items()
+     }
+     column_mapping = _process_column_mappings(column_mapping)
+     _validate_columns(input_data_df, evaluators, target, column_mapping)
 
      # Target Run
      pf_client = PFClient(
@@ -577,8 +592,8 @@
 
      # Create default configuration for evaluators that directly maps
      # input data names to keyword inputs of the same name in the evaluators.
-     evaluator_config = evaluator_config or {}
-     evaluator_config.setdefault("default", {})
+     column_mapping = column_mapping or {}
+     column_mapping.setdefault("default", {})
 
      # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
      if data is not None and target is not None:
@@ -586,21 +601,21 @@
              target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
          )
 
-         for evaluator_name, mapping in evaluator_config.items():
+         for evaluator_name, mapping in column_mapping.items():
              mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                  # If user defined mapping differently, do not change it.
                  # If it was mapped to target, we have already changed it
-                 # in _process_evaluator_config
+                 # in _process_column_mappings
                  run_output = f"${{run.outputs.{col}}}"
                  # We will add our mapping only if
                  # customer did not mapped target output.
                  if col not in mapping and run_output not in mapped_to_values:
-                     evaluator_config[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
+                     column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
 
      # After we have generated all columns we can check if we have
      # everything we need for evaluators.
-     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+     _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
      # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
      # via target mapping.
@@ -610,13 +625,16 @@
      for col in input_data_df.columns:
          # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
          # Also ignore columns that are already in config, since they've been covered by target mapping.
-         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-             evaluator_config["default"][col] = f"${{data.{col}}}"
+         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+             column_mapping["default"][col] = f"${{data.{col}}}"
      # Batch Run
      evaluators_info = {}
      use_pf_client = kwargs.get("_use_pf_client", True)
      if use_pf_client:
-         batch_run_client = ProxyClient(pf_client)
+         # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+         # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+         # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+         batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
 
          # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
          # multiple evaluators. If the path is already absolute, abspath will return the original path.
@@ -632,7 +650,7 @@
                  flow=evaluator,
                  run=target_run,
                  evaluator_name=evaluator_name,
-                 column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                 column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                  data=data,
                  stream=True,
                  name=kwargs.get("_run_name"),
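The validation change in the `_validate_input_data_for_evaluator` hunk above is what allows a dataset to supply only a `conversation` column for evaluators that also accept single-turn inputs. A toy restatement of that rule (not the SDK's code; `all(...)` is used here for clarity):

```python
# Required inputs for a hypothetical evaluator that accepts either
# query/response or a conversation.
required_inputs = ["query", "response", "conversation"]
data_columns = ["conversation"]  # e.g. a JSONL file with only a conversation column

missing_inputs = [col for col in required_inputs if col not in data_columns]
non_conversation_inputs = [col for col in required_inputs if col != "conversation"]

# If the only missing inputs are exactly the non-conversation ones, the
# conversation column satisfies the evaluator and nothing is reported missing.
if (
    missing_inputs
    and "conversation" in required_inputs
    and len(missing_inputs) == len(non_conversation_inputs)
    and all(col in non_conversation_inputs for col in missing_inputs)
):
    missing_inputs = []

print(missing_inputs)  # [] -> validation passes for conversation-only data
```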
@@ -0,0 +1,70 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ from typing import Optional
+ from typing_extensions import override
+
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+
+
+ class CoherenceEvaluator(PromptyEvaluatorBase):
+     """
+     Initialize a coherence evaluator configured for a specific Azure OpenAI model.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     **Usage**
+
+     .. code-block:: python
+
+         eval_fn = CoherenceEvaluator(model_config)
+         result = eval_fn(
+             query="What is the capital of Japan?",
+             response="The capital of Japan is Tokyo.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "gpt_coherence": 1.0
+         }
+     """
+
+     PROMPTY_FILE = "coherence.prompty"
+     RESULT_KEY = "gpt_coherence"
+
+     @override
+     def __init__(self, model_config: dict):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)
+
+     @override
+     def __call__(
+         self,
+         *,
+         query: Optional[str] = None,
+         response: Optional[str] = None,
+         conversation: Optional[dict] = None,
+         **kwargs
+     ):
+         """Evaluate coherence. Accepts either a query and response for a single evaluation,
+         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
+         turns, the evaluator will aggregate the results of each turn.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages". Conversation turns are expected
+             to be dictionaries with keys "content" and "role".
+         :paramtype conversation: Optional[Dict]
+         :return: The coherence score.
+         :rtype: dict
+         """
+         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
@@ -3,11 +3,6 @@ name: Coherence
  description: Evaluates coherence score for QA scenario
  model:
    api: chat
-   configuration:
-     type: azure_openai
-     azure_deployment: ${env:AZURE_DEPLOYMENT}
-     api_key: ${env:AZURE_OPENAI_API_KEY}
-     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
      max_tokens: 1
@@ -0,0 +1,13 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._base_eval import EvaluatorBase
+ from ._base_prompty_eval import PromptyEvaluatorBase
+ from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+
+ __all__ = [
+     "EvaluatorBase",
+     "PromptyEvaluatorBase",
+     "RaiServiceEvaluatorBase",
+ ]
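The new `_common` package collects the shared evaluator plumbing. Judging purely from how `CoherenceEvaluator` uses it earlier in this diff, a custom prompty-backed evaluator might be subclassed roughly as follows; the class name, prompty file, and result key are hypothetical, and these base classes are private API that may change:

```python
import os
from typing import Optional

from typing_extensions import override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase


class FriendlinessEvaluator(PromptyEvaluatorBase):
    """Hypothetical evaluator scoring how friendly a response is."""

    PROMPTY_FILE = "friendliness.prompty"  # hypothetical prompty definition
    RESULT_KEY = "gpt_friendliness"        # hypothetical result key

    @override
    def __init__(self, model_config: dict):
        prompty_path = os.path.join(os.path.dirname(__file__), self.PROMPTY_FILE)
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self.RESULT_KEY)

    @override
    def __call__(
        self,
        *,
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
        **kwargs
    ):
        # Delegate to the base class, which handles both single-turn and
        # conversation inputs, mirroring CoherenceEvaluator above.
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
```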