azure-ai-evaluation 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic.
Files changed (175)
  1. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/CHANGELOG.md +66 -1
  2. {azure_ai_evaluation-1.1.0/azure_ai_evaluation.egg-info → azure_ai_evaluation-1.2.0}/PKG-INFO +71 -7
  3. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/README.md +1 -1
  4. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_clients.py +24 -8
  5. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_models.py +2 -2
  6. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_constants.py +18 -0
  7. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  8. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  9. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_evaluate.py +69 -12
  10. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_utils.py +27 -0
  11. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  12. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  13. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  14. azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  15. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  16. azure_ai_evaluation-1.2.0/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  17. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  18. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  19. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  20. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  21. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  22. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  23. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  24. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  25. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  26. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  27. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  28. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_version.py +1 -1
  29. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_simulator.py +21 -13
  30. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0/azure_ai_evaluation.egg-info}/PKG-INFO +71 -7
  31. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/SOURCES.txt +2 -1
  32. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/requires.txt +2 -2
  33. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/README.md +1 -1
  34. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_simulate.py +1 -1
  35. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/setup.py +3 -4
  36. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/conftest.py +23 -1
  37. azure_ai_evaluation-1.2.0/tests/e2etests/test_evaluate.py +501 -0
  38. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_lite_management_client.py +12 -3
  39. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_mass_evaluate.py +111 -86
  40. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_metrics_upload.py +11 -1
  41. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_built_in_evaluator.py +1 -1
  42. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate.py +189 -31
  43. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate_performance.py +9 -13
  44. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/test_inputs_evaluators.py +42 -0
  45. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_save_eval.py +6 -4
  46. azure_ai_evaluation-1.1.0/tests/__pf_service_isolation.py +0 -28
  47. azure_ai_evaluation-1.1.0/tests/e2etests/test_evaluate.py +0 -953
  48. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/MANIFEST.in +0 -0
  49. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/NOTICE.txt +0 -0
  50. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/TROUBLESHOOTING.md +0 -0
  51. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/__init__.py +0 -0
  52. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/__init__.py +0 -0
  53. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/__init__.py +0 -0
  54. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/__init__.py +0 -0
  55. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_azure/_token_manager.py +0 -0
  56. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/__init__.py +0 -0
  57. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/_experimental.py +0 -0
  58. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/constants.py +0 -0
  59. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/math.py +0 -0
  60. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/rai_service.py +0 -0
  61. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_common/utils.py +0 -0
  62. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/__init__.py +0 -0
  63. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/code_client.py +0 -0
  64. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +0 -0
  65. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -0
  66. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -0
  67. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -0
  68. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/__init__.py +0 -0
  69. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_bleu/__init__.py +0 -0
  70. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/__init__.py +0 -0
  71. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/_coherence.py +0 -0
  72. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +0 -0
  73. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -0
  74. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -0
  75. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  76. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_eci/_eci.py +0 -0
  77. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_f1_score/__init__.py +0 -0
  78. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/__init__.py +0 -0
  79. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/_fluency.py +0 -0
  80. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +0 -0
  81. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_gleu/__init__.py +0 -0
  82. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/__init__.py +0 -0
  83. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +0 -0
  84. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -0
  85. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -0
  86. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_meteor/__init__.py +0 -0
  87. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -0
  88. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -0
  89. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -0
  90. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -0
  91. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -0
  92. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -0
  93. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -0
  94. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -0
  95. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/__init__.py +0 -0
  96. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +0 -0
  97. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_qa/__init__.py +0 -0
  98. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/__init__.py +0 -0
  99. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/_relevance.py +0 -0
  100. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +0 -0
  101. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/__init__.py +0 -0
  102. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -0
  103. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -0
  104. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_rouge/__init__.py +0 -0
  105. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +0 -0
  106. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -0
  107. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/__init__.py +0 -0
  108. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -0
  109. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/__init__.py +0 -0
  110. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_evaluators/_xpia/xpia.py +0 -0
  111. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_exceptions.py +0 -0
  112. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_http_utils.py +0 -0
  113. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_model_configurations.py +0 -0
  114. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_user_agent.py +0 -0
  115. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/__init__.py +0 -0
  116. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -0
  117. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -0
  118. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -0
  119. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -0
  120. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -0
  121. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/py.typed +0 -0
  122. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/__init__.py +0 -0
  123. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_scenario.py +0 -0
  124. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_adversarial_simulator.py +0 -0
  125. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_constants.py +0 -0
  126. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/__init__.py +0 -0
  127. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -0
  128. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_conversation/constants.py +0 -0
  129. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -0
  130. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -0
  131. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_direct_attack_simulator.py +0 -0
  132. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/__init__.py +0 -0
  133. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +0 -0
  134. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +0 -0
  135. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_indirect_attack_simulator.py +0 -0
  136. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/__init__.py +0 -0
  137. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +0 -0
  138. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +0 -0
  139. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_rai_client.py +0 -0
  140. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/_template_handler.py +0 -0
  141. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_model_tools/models.py +0 -0
  142. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  143. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -0
  144. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -0
  145. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_tracing.py +0 -0
  146. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure/ai/evaluation/simulator/_utils.py +0 -0
  147. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/dependency_links.txt +0 -0
  148. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/not-zip-safe +0 -0
  149. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/azure_ai_evaluation.egg-info/top_level.txt +0 -0
  150. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/pyproject.toml +0 -0
  151. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/data/evaluate_test_data.jsonl +0 -0
  152. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_common.py +0 -0
  153. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/samples/evaluation_samples_evaluate.py +0 -0
  154. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/setup.cfg +0 -0
  155. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/__init__.py +0 -0
  156. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/__openai_patcher.py +0 -0
  157. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/__init__.py +0 -0
  158. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/custom_evaluators/answer_length_with_aggregation.py +0 -0
  159. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/target_fn.py +0 -0
  160. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_adv_simulator.py +0 -0
  161. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_builtin_evaluators.py +0 -0
  162. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/e2etests/test_sim_and_eval.py +0 -0
  163. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_batch_run_context.py +0 -0
  164. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_defect_rate.py +0 -0
  165. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_content_safety_rai_script.py +0 -0
  166. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_eval_run.py +0 -0
  167. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluate_telemetry.py +0 -0
  168. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/apology_dag/apology.py +0 -0
  169. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_evaluators/slow_eval.py +0 -0
  170. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_jailbreak_simulator.py +0 -0
  171. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_non_adv_simulator.py +0 -0
  172. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_simulator.py +0 -0
  173. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_callback_conv_bot.py +0 -0
  174. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_synthetic_conversation_bot.py +0 -0
  175. {azure_ai_evaluation-1.1.0 → azure_ai_evaluation-1.2.0}/tests/unittests/test_utils.py +0 -0

CHANGELOG.md
@@ -1,10 +1,75 @@
  # Release History
 
- ## 1.1.0 (2024-12-12)
+ ## 1.2.0 (2025-01-27)
+
+ ### Features Added
+ - CSV files are now supported as data file inputs with `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
+
+ ### Breaking Changes
+ - `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in next release.
 
  ### Bugs Fixed
  - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
  - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+ - Fixed the non adversarial simulator to run in task-free mode
+ - Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+ main score when aggregating per-turn evaluations from a conversation into an overall
+ evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
+ - Fixed bug in non adversarial simulator sample where `tasks` undefined
+
+ ### Other Changes
+ - Changed minimum required python version to use this package from 3.8 to 3.9
+ - Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+ - Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+ environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
+
+ ## 1.1.0 (2024-12-12)
+
+ ### Features Added
+ - Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
+
+ ```python
+ evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+ conversation = {
+     "messages": [
+         {
+             "role": "system",
+             "content": [
+                 {"type": "text", "text": "You are an AI assistant that understands images."}
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Can you describe this image?"},
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
+                     },
+                 },
+             ],
+         },
+         {
+             "role": "assistant",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
+                 }
+             ],
+         },
+     ]
+ }
+ print("Calling Content Safety Evaluator for multi-modal")
+ score = evaluator(conversation=conversation)
+ ```
+
+ - Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
+
+ ### Bugs Fixed
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 
  ## 1.0.1 (2024-11-15)
 
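To make the CSV note in the 1.2.0 entry above concrete, here is a minimal sketch of calling `evaluate()` with a CSV data file. The file name, column names, and the choice of `F1ScoreEvaluator` are illustrative assumptions, not taken from this diff:

```python
# Hedged sketch: the new CSV input path for evaluate() (added in 1.2.0).
import csv

from azure.ai.evaluation import F1ScoreEvaluator, evaluate

# Column names are an assumption; they must match the inputs the evaluators expect.
rows = [
    {"response": "Tokyo is the capital of Japan.", "ground_truth": "The capital of Japan is Tokyo."},
    {"response": "Water boils at 100 C at sea level.", "ground_truth": "At sea level, water boils at 100 C."},
]
with open("eval_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["response", "ground_truth"])
    writer.writeheader()
    writer.writerows(rows)

# The CSV path goes through the same `data` parameter that previously accepted only JSONL.
result = evaluate(
    data="eval_data.csv",
    evaluators={"f1_score": F1ScoreEvaluator()},
)
print(result["metrics"])
```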

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: azure-ai-evaluation
- Version: 1.1.0
+ Version: 1.2.0
  Summary: Microsoft Azure Evaluation Library for Python
  Home-page: https://github.com/Azure/azure-sdk-for-python
  Author: Microsoft Corporation
@@ -13,17 +13,16 @@ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: NOTICE.txt
- Requires-Dist: promptflow-devkit>=1.15.0
- Requires-Dist: promptflow-core>=1.15.0
+ Requires-Dist: promptflow-devkit>=1.17.1
+ Requires-Dist: promptflow-core>=1.17.1
  Requires-Dist: pyjwt>=2.8.0
  Requires-Dist: azure-identity>=1.16.0
  Requires-Dist: azure-core>=1.30.2
@@ -54,7 +53,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
 
  ### Prerequisites
 
- - Python 3.8 or later is required to use this package.
+ - Python 3.9 or later is required to use this package.
  - [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
 
  ### Install the package
@@ -378,11 +377,76 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
  # Release History
 
- ## 1.1.0 (2024-12-12)
+ ## 1.2.0 (2025-01-27)
+
+ ### Features Added
+ - CSV files are now supported as data file inputs with `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
+
+ ### Breaking Changes
+ - `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in next release.
 
  ### Bugs Fixed
  - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
  - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+ - Fixed the non adversarial simulator to run in task-free mode
+ - Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+ main score when aggregating per-turn evaluations from a conversation into an overall
+ evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
+ - Fixed bug in non adversarial simulator sample where `tasks` undefined
+
+ ### Other Changes
+ - Changed minimum required python version to use this package from 3.8 to 3.9
+ - Stop dependency on the local promptflow service. No promptflow service will automatically start when running evaluation.
+ - Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+ environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
+
+ ## 1.1.0 (2024-12-12)
+
+ ### Features Added
+ - Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
+
+ ```python
+ evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+ conversation = {
+     "messages": [
+         {
+             "role": "system",
+             "content": [
+                 {"type": "text", "text": "You are an AI assistant that understands images."}
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Can you describe this image?"},
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
+                     },
+                 },
+             ],
+         },
+         {
+             "role": "assistant",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
+                 }
+             ],
+         },
+     ]
+ }
+ print("Calling Content Safety Evaluator for multi-modal")
+ score = evaluator(conversation=conversation)
+ ```
+
+ - Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
+
+ ### Bugs Fixed
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
 
  ## 1.0.1 (2024-11-15)
 

README.md
@@ -22,7 +22,7 @@ Azure AI SDK provides following to evaluate Generative AI Applications:
 
  ### Prerequisites
 
- - Python 3.8 or later is required to use this package.
+ - Python 3.9 or later is required to use this package.
  - [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
 
  ### Install the package

azure/ai/evaluation/_azure/_clients.py
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
  from ._models import BlobStoreInfo, Workspace
 
 
- API_VERSION: Final[str] = "2024-10-01"
+ API_VERSION: Final[str] = "2024-07-01-preview"
  QUERY_KEY_API_VERSION: Final[str] = "api-version"
  PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")
 
@@ -69,7 +69,9 @@ class LiteMLClient:
  self._get_token_manager()
  return cast(TokenCredential, self._credential)
 
- def workspace_get_default_datastore(self, workspace_name: str, include_credentials: bool = False) -> BlobStoreInfo:
+ def workspace_get_default_datastore(
+ self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+ ) -> BlobStoreInfo:
  # 1. Get the default blob store
  # REST API documentation:
  # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@ class LiteMLClient:
  account_name = props_json["accountName"]
  endpoint = props_json["endpoint"]
  container_name = props_json["containerName"]
+ credential_type = props_json.get("credentials", {}).get("credentialsType")
 
  # 2. Get the SAS token to use for accessing the blob store
  # REST API documentation:
  # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
- blob_store_credential: Optional[Union[AzureSasCredential, str]] = None
- if include_credentials:
+ blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+ if not include_credentials:
+ blob_store_credential = None
+ elif credential_type and credential_type.lower() == "none":
+ # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+ # the credentialsType will be "None" and we should not attempt to get the secrets.
+ blob_store_credential = self.get_credential()
+ else:
  url = self._generate_path(
  *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
  )
  secrets_response = self._http_client.request(
  method="POST",
  url=url,
+ json={
+ "expirableSecret": True,
+ "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+ },
  params={
  QUERY_KEY_API_VERSION: self._api_version,
  },
@@ -114,10 +127,13 @@
  secrets_json = secrets_response.json()
  secrets_type = secrets_json["secretsType"].lower()
 
+ # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+ # stores:
+ # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
  if secrets_type == "sas":
  blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
  elif secrets_type == "accountkey":
- # To support olders versions of azure-storage-blob better, we return a string here instead of
+ # To support older versions of azure-storage-blob better, we return a string here instead of
  # an AzureNamedKeyCredential
  blob_store_credential = secrets_json["key"]
  else:
@@ -164,19 +180,19 @@
  # nothing to see here, move along
  return
 
- additional_info: Optional[str] = None
+ message = f"The {description} request failed with HTTP {response.status_code}"
  try:
  error_json = response.json()["error"]
  additional_info = f"({error_json['code']}) {error_json['message']}"
+ message += f" - {additional_info}"
  except (JSONDecodeError, ValueError, KeyError):
  pass
 
  raise EvaluationException(
- message=f"The {description} request failed with HTTP {response.status_code}",
+ message=message,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.FAILED_EXECUTION,
  blame=ErrorBlame.SYSTEM_ERROR,
- internal_message=additional_info,
  )
 
  def _generate_path(self, *paths: str) -> str:
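The `workspace_get_default_datastore` refactor above makes `include_credentials` keyword-only and reads an optional `key_expiration_hours` from `**kwargs`. A hedged sketch of a caller, assuming an already constructed `LiteMLClient` instance (its constructor is not part of this diff):

```python
# Illustrative helper built around the signature shown in the hunk above.
from azure.storage.blob import BlobServiceClient


def open_default_blob_store(client, workspace_name: str) -> BlobServiceClient:
    """Fetch the workspace's default datastore and open a blob service client with its credential."""
    datastore = client.workspace_get_default_datastore(
        workspace_name,
        include_credentials=True,   # keyword-only after this change
        key_expiration_hours=2,     # assumed kwarg, read via kwargs.get("key_expiration_hours", 1) above
    )
    account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
    # datastore.credential may be an AzureSasCredential, a TokenCredential, or an account key string.
    return BlobServiceClient(account_url=account_url, credential=datastore.credential)
```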

azure/ai/evaluation/_azure/_models.py
@@ -8,7 +8,7 @@
 
  from typing import Dict, List, NamedTuple, Optional, Union
  from msrest.serialization import Model
- from azure.core.credentials import AzureSasCredential
+ from azure.core.credentials import AzureSasCredential, TokenCredential
 
 
  class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
  account_name: str
  endpoint: str
  container_name: str
- credential: Optional[Union[AzureSasCredential, str]]
+ credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
 
 
  class WorkspaceHubConfig(Model):

azure/ai/evaluation/_constants.py
@@ -1,7 +1,9 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ import enum
  from typing import Literal
+ from azure.ai.evaluation._common._experimental import experimental
 
 
  class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
  EVALUATION_SDK = "_azureml.evaluation_sdk_name"
 
 
+ @experimental
+ class _AggregationType(enum.Enum):
+ """Defines how numeric evaluation results should be aggregated
+ to produce a single value. Used by individual evaluators to combine per-turn results for
+ a conversation-based input. In general, wherever this enum is used, it is also possible
+ to directly assign the underlying aggregation function for more complex use cases.
+ The 'custom' value is generally not an acceptable input, and should only be used as an output
+ to indicate that a custom aggregation function has been injected."""
+
+ MEAN = "mean"
+ MAX = "max"
+ MIN = "min"
+ SUM = "sum"
+ CUSTOM = "custom"
+
+
  DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
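A rough illustration of what the `_AggregationType` values above imply for conversation scoring: per the 1.2.0 changelog, content safety evaluators now take the maximum per-turn score, while other conversation-capable evaluators keep the mean. The helper below is a stand-in sketch, not the contents of the new `_conversation_aggregators.py`:

```python
# Sketch: mapping aggregation kinds to callables over per-turn scores.
import enum
from statistics import mean
from typing import Callable, List


class AggregationType(enum.Enum):  # stand-in for the internal _AggregationType
    MEAN = "mean"
    MAX = "max"
    MIN = "min"
    SUM = "sum"


def get_aggregator(kind: AggregationType) -> Callable[[List[float]], float]:
    """Return the function used to fold per-turn scores into one conversation-level score."""
    return {
        AggregationType.MEAN: lambda scores: mean(scores),
        AggregationType.MAX: max,
        AggregationType.MIN: min,
        AggregationType.SUM: sum,
    }[kind]


per_turn_scores = [2.0, 5.0, 3.0]                              # e.g. severity per conversation turn
print(get_aggregator(AggregationType.MAX)(per_turn_scores))    # 5.0 -> content safety behavior
print(get_aggregator(AggregationType.MEAN)(per_turn_scores))   # ~3.33 -> default behavior
```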

azure/ai/evaluation/_evaluate/_batch_run/__init__.py
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
  from .code_client import CodeClient
  from .proxy_client import ProxyClient
  from .target_run_context import TargetRunContext
+ from .proxy_client import ProxyRun
 
- __all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+ __all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

azure/ai/evaluation/_evaluate/_eval_run.py
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
  local_paths.append(local_file_path)
 
  # We will write the artifacts to the workspaceblobstore
- datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
+ datastore = self._management_client.workspace_get_default_datastore(
+ self._workspace_name, include_credentials=True
+ )
  account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
 
  svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -12,6 +12,7 @@ import pandas as pd
  from promptflow._sdk._constants import LINE_NUMBER
  from promptflow.client import PFClient
  from promptflow.entities import Run
+ from promptflow._sdk._configuration import Configuration
 
  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
  from .._constants import (
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
  EvaluationMetrics,
+ DefaultOpenEncoding,
  Prefixes,
  _InternalEvaluationMetrics,
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
- from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+ from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
  from ._utils import (
  _apply_column_mapping,
  _log_metrics_and_instance_results,
  _trace_destination_from_project_scope,
  _write_output,
+ DataLoaderFactory,
  )
 
  TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -429,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
  )
 
  try:
- initial_data_df = pd.read_json(data, lines=True)
+ data_loader = DataLoaderFactory.get_loader(data)
+ initial_data_df = data_loader.load()
  except Exception as e:
  raise EvaluationException(
- message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+ message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
@@ -444,7 +448,7 @@ def _apply_target_to_data(
  def _apply_target_to_data(
  target: Callable,
  data: Union[str, os.PathLike],
- pf_client: PFClient,
+ batch_client: TClient,
  initial_data: pd.DataFrame,
  evaluation_name: Optional[str] = None,
  **kwargs,
@@ -454,10 +458,10 @@ def _apply_target_to_data(
 
  :param target: The function to be applied to data.
  :type target: Callable
- :param data: The path to input jsonl file.
+ :param data: The path to input jsonl or csv file.
  :type data: Union[str, os.PathLike]
- :param pf_client: The promptflow client to be used.
- :type pf_client: PFClient
+ :param batch_client: The promptflow client to be used.
+ :type batch_client: PFClient
  :param initial_data: The data frame with the loaded data.
  :type initial_data: pd.DataFrame
  :param evaluation_name: The name of the evaluation.
@@ -467,7 +471,7 @@
  """
  _run_name = kwargs.get("_run_name")
  with TargetRunContext():
- run: Run = pf_client.run(
+ run: ProxyRun = batch_client.run(
  flow=target,
  display_name=evaluation_name,
  data=data,
@@ -475,7 +479,18 @@
  name=_run_name,
  )
 
- target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+ target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+ run_summary = batch_client.get_run_summary(run)
+
+ if run_summary["completed_lines"] == 0:
+ msg = (f"Evaluation target failed to produce any results."
+ f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.FAILED_EXECUTION,
+ blame=ErrorBlame.USER_ERROR,
+ )
  # Remove input and output prefix
  generated_columns = {
  col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +509,7 @@
  # Concatenate output to input
  target_output = pd.concat([target_output, initial_data], axis=1)
 
- return target_output, generated_columns, run
+ return target_output, generated_columns, run.run.result()
 
 
  def _process_column_mappings(
@@ -569,13 +584,14 @@
  evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
  azure_ai_project: Optional[AzureAIProject] = None,
  output_path: Optional[Union[str, os.PathLike]] = None,
+ fail_on_evaluator_errors: bool = False,
  **kwargs,
  ) -> EvaluationResult:
  """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
  data will be run through target function and then results will be evaluated.
 
  :keyword data: Path to the data to be evaluated or passed to target if target is set.
- Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+ JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
  :paramtype data: str
  :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
  and value as the evaluator function. Required.
@@ -594,6 +610,11 @@
  :paramtype output_path: Optional[str]
  :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
  :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+ :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+ if ANY evaluator fails during their evaluation.
+ Defaults to false, which means that evaluations will continue regardless of failures.
+ If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+ :paramtype fail_on_evaluator_errors: bool
  :return: Evaluation results.
  :rtype: ~azure.ai.evaluation.EvaluationResult
 
@@ -615,6 +636,7 @@
  evaluator_config=evaluator_config,
  azure_ai_project=azure_ai_project,
  output_path=output_path,
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
  **kwargs,
  )
  except Exception as e:
@@ -663,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
  print("\n====================================================\n")
 
 
+ def _print_fail_flag_warning() -> None:
+ print(
+ "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+ + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+ + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+ + "without producing any outputs, since a single failure will cancel the entire run "
+ "when fail_on_evaluator_errors is enabled."
+ )
+
+
  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  *,
  evaluators: Dict[str, Callable],
@@ -672,8 +704,11 @@
  evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
  azure_ai_project: Optional[AzureAIProject] = None,
  output_path: Optional[Union[str, os.PathLike]] = None,
+ fail_on_evaluator_errors: bool = False,
  **kwargs,
  ) -> EvaluationResult:
+ if fail_on_evaluator_errors:
+ _print_fail_flag_warning()
  input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
  # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +725,7 @@
  if target is not None:
  _validate_columns_for_target(input_data_df, target)
 
+ Configuration.get_instance().set_config("trace.destination", "none")
  pf_client = PFClient(user_agent=USER_AGENT)
  target_run: Optional[Run] = None
 
@@ -702,7 +738,7 @@
  target_generated_columns: Set[str] = set()
  if data is not None and target is not None:
  input_data_df, target_generated_columns, target_run = _apply_target_to_data(
- target, data, pf_client, input_data_df, evaluation_name, **kwargs
+ target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
  )
 
  for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +809,10 @@
  evaluators_result_df = None
  evaluators_metric = {}
  for evaluator_name, evaluator_result in per_evaluator_results.items():
+ if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+ _print_summary(per_evaluator_results)
+ _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
  evaluator_result_df = evaluator_result["result"]
 
  # drop input columns
@@ -825,3 +865,20 @@
  _write_output(output_path, result)
 
  return result
+
+
+ def _turn_error_logs_into_exception(log_path: str) -> None:
+ """Produce an EvaluationException using the contents of the inputted
+ file as the error message.
+
+ :param log_path: The path to the error log file.
+ :type log_path: str
+ """
+ with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+ error_message = file.read()
+ raise EvaluationException(
+ message=error_message,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.FAILED_EXECUTION,
+ blame=ErrorBlame.UNKNOWN,
+ )
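The `_evaluate.py` changes above add the opt-in `fail_on_evaluator_errors` flag and the `_turn_error_logs_into_exception` fast-fail path. A hedged usage sketch; the dataset path and evaluator choice are assumptions:

```python
# Hypothetical use of the new fail_on_evaluator_errors flag on evaluate().
from azure.ai.evaluation import BleuScoreEvaluator, evaluate
from azure.ai.evaluation._exceptions import EvaluationException  # internal module, shown only for illustration

try:
    result = evaluate(
        data="eval_data.jsonl",                    # assumed local dataset
        evaluators={"bleu": BleuScoreEvaluator()},
        fail_on_evaluator_errors=True,             # cancel the run if any evaluator line fails
    )
    print(result["metrics"])
except EvaluationException as exc:
    # With the flag enabled, the first failed evaluator line raises instead of leaving gaps in the metrics.
    print(f"Evaluation aborted: {exc}")
```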

azure/ai/evaluation/_evaluate/_utils.py
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
  # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
  # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
  asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # type: ignore[attr-defined]
+
+
+ class JSONLDataFileLoader:
+ def __init__(self, filename: Union[os.PathLike, str]):
+ self.filename = filename
+
+ def load(self) -> pd.DataFrame:
+ return pd.read_json(self.filename, lines=True)
+
+
+ class CSVDataFileLoader:
+ def __init__(self, filename: Union[os.PathLike, str]):
+ self.filename = filename
+
+ def load(self) -> pd.DataFrame:
+ return pd.read_csv(self.filename)
+
+
+ class DataLoaderFactory:
+ @staticmethod
+ def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+ filename_str = str(filename).lower()
+ if filename_str.endswith(".csv"):
+ return CSVDataFileLoader(filename)
+
+ # fallback to JSONL to maintain backward compatibility
+ return JSONLDataFileLoader(filename)
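The loader classes above are what `_validate_and_load_data` now calls through `DataLoaderFactory`. As a closing note, the dispatch boils down to an extension check; the snippet below is an assumption-level restatement rather than the internal API:

```python
# Sketch: extension-based dispatch mirroring DataLoaderFactory.get_loader above.
import pandas as pd


def load_eval_data(path: str) -> pd.DataFrame:
    """Load a CSV when the extension says so, otherwise fall back to JSON Lines."""
    if str(path).lower().endswith(".csv"):
        return pd.read_csv(path)
    return pd.read_json(path, lines=True)


df = load_eval_data("eval_data.csv")  # assumed file from the earlier CSV example
print(df.columns.tolist())
```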