azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of azure-ai-evaluation might be problematic; see the advisory on the package registry for more details.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -1,30 +1,17 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
+ from typing import Dict
4
5
  from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
5
- from promptflow._utils.async_utils import async_run_allowing_running_loop
6
+ from typing_extensions import overload, override
6
7
 
7
8
  from azure.ai.evaluation._common.utils import nltk_tokenize
8
9
 
10
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
11
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
9
12
 
10
- class _AsyncBleuScoreEvaluator:
11
- def __init__(self):
12
- pass
13
13
 
14
- async def __call__(self, *, response: str, ground_truth: str, **kwargs):
15
- reference_tokens = nltk_tokenize(ground_truth)
16
- hypothesis_tokens = nltk_tokenize(response)
17
-
18
- # NIST Smoothing
19
- smoothing_function = SmoothingFunction().method4
20
- score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
21
-
22
- return {
23
- "bleu_score": score,
24
- }
25
-
26
-
27
- class BleuScoreEvaluator:
14
+ class BleuScoreEvaluator(EvaluatorBase):
28
15
  """
29
16
  Calculate the BLEU score for a given response and ground truth.
30
17
 
@@ -36,6 +23,8 @@ class BleuScoreEvaluator:
36
23
  indicator of quality.
37
24
 
38
25
  The BLEU score ranges from 0 to 1, with higher scores indicating better quality.
26
+ :param threshold: The threshold for the evaluation. Default is 0.5.
27
+ :type threshold: float
39
28
 
40
29
  .. admonition:: Example:
41
30
 
@@ -44,16 +33,67 @@ class BleuScoreEvaluator:
44
33
  :end-before: [END bleu_score_evaluator]
45
34
  :language: python
46
35
  :dedent: 8
47
- :caption: Initialize and call an BleuScoreEvaluator.
36
+ :caption: Initialize and call an BleuScoreEvaluator using azure.ai.evaluation.AzureAIProject
37
+
38
+ .. admonition:: Example using Azure AI Project URL:
39
+
40
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
41
+ :start-after: [START bleu_score_evaluator]
42
+ :end-before: [END bleu_score_evaluator]
43
+ :language: python
44
+ :dedent: 8
45
+ :caption: Initialize and call an BleuScoreEvaluator using Azure AI Project URL in following format
46
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
47
+
48
+ .. admonition:: Example with Threshold:
49
+
50
+ .. literalinclude:: ../samples/evaluation_samples_threshold.py
51
+ :start-after: [START threshold_bleu_score_evaluator]
52
+ :end-before: [END threshold_bleu_score_evaluator]
53
+ :language: python
54
+ :dedent: 8
55
+ :caption: Initialize with threshold and call an BleuScoreEvaluator.
48
56
  """
49
57
 
50
- id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
58
+ id = "azureai://built-in/evaluators/bleu_score"
51
59
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
52
60
 
53
- def __init__(self):
54
- self._async_evaluator = _AsyncBleuScoreEvaluator()
61
+ def __init__(self, *, threshold=0.5):
62
+ self._threshold = threshold
63
+ self._higher_is_better = True
64
+ super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
65
+
66
+ @override
67
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
68
+ """Produce a bleu score evaluation result.
55
69
 
56
- def __call__(self, *, response: str, ground_truth: str, **kwargs):
70
+ :param eval_input: The input to the evaluation function.
71
+ :type eval_input: Dict
72
+ :return: The evaluation result.
73
+ :rtype: Dict
74
+ """
75
+ ground_truth = eval_input["ground_truth"]
76
+ response = eval_input["response"]
77
+ reference_tokens = nltk_tokenize(ground_truth)
78
+ hypothesis_tokens = nltk_tokenize(response)
79
+
80
+ # NIST Smoothing
81
+ smoothing_function = SmoothingFunction().method4
82
+ score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
83
+ binary_result = False
84
+ if self._higher_is_better:
85
+ binary_result = score >= self._threshold
86
+ else:
87
+ binary_result = score <= self._threshold
88
+
89
+ return {
90
+ "bleu_score": score,
91
+ "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
92
+ "bleu_threshold": self._threshold,
93
+ }
94
+
95
+ @overload # type: ignore
96
+ def __call__(self, *, response: str, ground_truth: str):
57
97
  """
58
98
  Evaluate the BLEU score between the response and the ground truth.
59
99
 
@@ -64,9 +104,21 @@ class BleuScoreEvaluator:
64
104
  :return: The BLEU score.
65
105
  :rtype: Dict[str, float]
66
106
  """
67
- return async_run_allowing_running_loop(
68
- self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
69
- )
70
107
 
71
- def _to_async(self):
72
- return self._async_evaluator
108
+ @override
109
+ def __call__( # pylint: disable=docstring-missing-param
110
+ self,
111
+ *args,
112
+ **kwargs,
113
+ ):
114
+ """
115
+ Evaluate the BLEU score between the response and the ground truth.
116
+
117
+ :keyword response: The response to be evaluated.
118
+ :paramtype response: str
119
+ :keyword ground_truth: The ground truth to be compared against.
120
+ :paramtype ground_truth: str
121
+ :return: The BLEU score.
122
+ :rtype: Dict[str, float]
123
+ """
124
+ return super().__call__(*args, **kwargs)
@@ -0,0 +1,5 @@
1
+ from ._code_vulnerability import CodeVulnerabilityEvaluator
2
+
3
+ __all__ = [
4
+ "CodeVulnerabilityEvaluator",
5
+ ]
@@ -0,0 +1,119 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ from typing_extensions import overload, override
5
+ from typing import Dict, Union
6
+
7
+ from azure.ai.evaluation._common._experimental import experimental
8
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
9
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
10
+
11
+
12
+ # cspell:ignore ssrf, vuln
13
+ @experimental
14
+ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
15
+ """
16
+ Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
17
+ where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
18
+
19
+ The code vulnerability evaluation checks for vulnerabilities in the following coding languages:
20
+
21
+ - Python
22
+ - Java
23
+ - C++
24
+ - C#
25
+ - Go
26
+ - Javascript
27
+ - SQL
28
+
29
+ The code vulnerability evaluation identifies the following vulnerabilities:
30
+
31
+ - path-injection
32
+ - sql-injection
33
+ - code-injection
34
+ - stack-trace-exposure
35
+ - incomplete-url-substring-sanitization
36
+ - flask-debug
37
+ - clear-text-logging-sensitive-data
38
+ - incomplete-hostname-regexp
39
+ - server-side-unvalidated-url-redirection
40
+ - weak-cryptographic-algorithm
41
+ - full-ssrf
42
+ - bind-socket-all-network-interfaces
43
+ - client-side-unvalidated-url-redirection
44
+ - likely-bugs
45
+ - reflected-xss
46
+ - clear-text-storage-sensitive-data
47
+ - tarslip
48
+ - hardcoded-credentials
49
+ - insecure-randomness
50
+
51
+ :param credential: The credential for connecting to Azure AI project. Required
52
+ :type credential: ~azure.core.credentials.TokenCredential
53
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
54
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
55
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
56
+ :param kwargs: Additional arguments to pass to the evaluator.
57
+ :type kwargs: Any
58
+
59
+ .. note::
60
+
61
+ If this evaluator is supplied to the `evaluate` function, the metric
62
+ for the code vulnerability will be "code_vulnerability_label".
63
+ """
64
+
65
+ id = "azureai://built-in/evaluators/code_vulnerability"
66
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
67
+ _OPTIONAL_PARAMS = ["query"]
68
+
69
+ @override
70
+ def __init__(
71
+ self,
72
+ credential,
73
+ azure_ai_project,
74
+ **kwargs,
75
+ ):
76
+ # Set default for evaluate_query if not provided
77
+ if "evaluate_query" not in kwargs:
78
+ kwargs["evaluate_query"] = True
79
+
80
+ super().__init__(
81
+ eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
82
+ azure_ai_project=azure_ai_project,
83
+ credential=credential,
84
+ **kwargs,
85
+ )
86
+
87
+ @overload
88
+ def __call__(
89
+ self,
90
+ *,
91
+ query: str,
92
+ response: str,
93
+ ) -> Dict[str, Union[str, float]]:
94
+ """Evaluate a given query/response pair for code vulnerability
95
+
96
+ :keyword query: The query to be evaluated.
97
+ :paramtype query: str
98
+ :keyword response: The response to be evaluated.
99
+ :paramtype response: str
100
+ :return: The code vulnerability label.
101
+ :rtype: Dict[str, Union[str, bool]]
102
+ """
103
+
104
+ @override
105
+ def __call__( # pylint: disable=docstring-missing-param
106
+ self,
107
+ *args,
108
+ **kwargs,
109
+ ):
110
+ """Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
111
+
112
+ :keyword query: The query to be evaluated.
113
+ :paramtype query: Optional[str]
114
+ :keyword response: The response to be evaluated.
115
+ :paramtype response: Optional[str]
116
+ :rtype: Dict[str, Union[str, bool]]
117
+ """
118
+
119
+ return super().__call__(*args, **kwargs)
@@ -21,6 +21,13 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
21
21
  :param model_config: Configuration for the Azure OpenAI model.
22
22
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
23
23
  ~azure.ai.evaluation.OpenAIModelConfiguration]
24
+ :param threshold: The threshold for the coherence evaluator. Default is 3.
25
+ :type threshold: int
26
+ :param credential: The credential for authenticating to Azure AI service.
27
+ :type credential: ~azure.core.credentials.TokenCredential
28
+ :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
29
+ This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
30
+ :paramtype is_reasoning_model: bool
24
31
 
25
32
  .. admonition:: Example:
26
33
 
@@ -29,7 +36,26 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
29
36
  :end-before: [END coherence_evaluator]
30
37
  :language: python
31
38
  :dedent: 8
32
- :caption: Initialize and call a CoherenceEvaluator with a query and response.
39
+ :caption: Initialize and call CoherenceEvaluator using azure.ai.evaluation.AzureAIProject
40
+
41
+ .. admonition:: Example using Azure AI Project URL:
42
+
43
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
44
+ :start-after: [START coherence_evaluator]
45
+ :end-before: [END coherence_evaluator]
46
+ :language: python
47
+ :dedent: 8
48
+ :caption: Initialize and call CoherenceEvaluator using Azure AI Project URL in following format
49
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
50
+
51
+ .. admonition:: Example with Threshold:
52
+
53
+ .. literalinclude:: ../samples/evaluation_samples_threshold.py
54
+ :start-after: [START threshold_coherence_evaluator]
55
+ :end-before: [END threshold_coherence_evaluator]
56
+ :language: python
57
+ :dedent: 8
58
+ :caption: Initialize with threshold and call a CoherenceEvaluator with a query and response.
33
59
 
34
60
  .. note::
35
61
 
@@ -41,14 +67,24 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
41
67
  _PROMPTY_FILE = "coherence.prompty"
42
68
  _RESULT_KEY = "coherence"
43
69
 
44
- id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
70
+ id = "azureai://built-in/evaluators/coherence"
45
71
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
46
72
 
47
73
  @override
48
- def __init__(self, model_config):
74
+ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
49
75
  current_dir = os.path.dirname(__file__)
50
76
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
51
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
77
+ self._threshold = threshold
78
+ self._higher_is_better = True
79
+ super().__init__(
80
+ model_config=model_config,
81
+ prompty_file=prompty_path,
82
+ result_key=self._RESULT_KEY,
83
+ threshold=threshold,
84
+ credential=credential,
85
+ _higher_is_better=self._higher_is_better,
86
+ **kwargs,
87
+ )
52
88
 
53
89
  @overload
54
90
  def __call__(
@@ -5,9 +5,11 @@
5
5
  from ._base_eval import EvaluatorBase
6
6
  from ._base_prompty_eval import PromptyEvaluatorBase
7
7
  from ._base_rai_svc_eval import RaiServiceEvaluatorBase
8
+ from ._base_multi_eval import MultiEvaluatorBase
8
9
 
9
10
  __all__ = [
10
11
  "EvaluatorBase",
11
12
  "PromptyEvaluatorBase",
12
13
  "RaiServiceEvaluatorBase",
14
+ "MultiEvaluatorBase",
13
15
  ]