azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py
@@ -0,0 +1,442 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import math
+import operator
+from itertools import starmap
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._exceptions import EvaluationException
+from typing_extensions import override, overload
+
+
+RetrievalGroundTruthDocument = TypedDict(
+    "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
+)
+
+RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})
+
+
+class DocumentRetrievalEvaluator(EvaluatorBase):
+    """
+    Calculate document retrieval metrics, such as NDCG, XDCG, Fidelity, Top K Relevance and Holes.
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a DocumentRetrievalEvaluator
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_document_retrieval_evaluator]
+            :end-before: [END threshold_document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
+    """
+
+    id = "azureai://built-in/evaluators/document_retrieval"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(
+        self,
+        *,
+        ground_truth_label_min: int = 0,
+        ground_truth_label_max: int = 4,
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50,
+    ):
+        super().__init__()
+        self.k = 3
+        self.xdcg_discount_factor = 0.6
+
+        if ground_truth_label_min >= ground_truth_label_max:
+            raise EvaluationException(
+                "The ground truth label maximum must be strictly greater than the ground truth label minimum."
+            )
+
+        if not isinstance(ground_truth_label_min, int):
+            raise EvaluationException("The ground truth label minimum must be an integer value.")
+
+        if not isinstance(ground_truth_label_max, int):
+            raise EvaluationException("The ground truth label maximum must be an integer value.")
+
+        self.ground_truth_label_min = ground_truth_label_min
+        self.ground_truth_label_max = ground_truth_label_max
+
+        # The default threshold for metrics where higher numbers are better.
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
+        }
+
+        # Ideally, the number of holes should be zero.
+        self._threshold_holes = {"holes": 0, "holes_ratio": 0}
+
+    def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
+        """
+        The number of documents retrieved from a search query which have no provided ground-truth label.
+        This metric is helpful for determining the accuracy of other metrics that are highly sensitive to missing ground-truth knowledge,
+        such as NDCG, XDCG, and Fidelity.
+
+        :param actual_docs: A list of retrieved documents' IDs.
+        :type actual_docs: List[str]
+        :param labeled_docs: A list of ideal documents' IDs.
+        :type labeled: List[str]
+        :return: The holes calculation result.
+        :rtype: int
+        """
+        return len(set(actual_docs).difference(set(labeled_docs)))
+
+    def _compute_ndcg(
+        self,
+        result_docs_groundtruth_labels: List[int],
+        ideal_docs_groundtruth_labels: List[int],
+    ) -> float:
+        """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
+        NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+        :type ideal_docs_groundtruth_labels: List[int]
+        :return: The NDCG@K calculation result.
+        :rtype: float
+        """
+
+        # Set the scoring function
+        def calculate_dcg(relevance: float, rank: int):
+            return (math.pow(2, relevance) - 1) / (math.log2(rank + 1))
+
+        ranks = list(range(1, self.k + 1))
+        dcg = sum(starmap(calculate_dcg, zip(result_docs_groundtruth_labels, ranks)))
+        idcg = sum(starmap(calculate_dcg, zip(ideal_docs_groundtruth_labels, ranks)))
+        ndcg = dcg / float(idcg)
+
+        return ndcg
+
+    def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
+        """XDCG calculated for the top K documents retrieved from a search query.
+        XDCG measures how objectively good are the top K documents, discounted by their position in the list.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :return: The XDCG@K calculation result.
+        :rtype: float
+        """
+
+        def calculate_xdcg_numerator(relevance, rank):
+            return 25 * relevance * math.pow(self.xdcg_discount_factor, rank - 1)
+
+        def calculate_xdcg_denominator(rank):
+            return math.pow(self.xdcg_discount_factor, rank - 1)
+
+        ranks = list(range(1, self.k + 1))
+        xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
+        xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
+
+        return xdcg_n / float(xdcg_d)
+
+    def _compute_fidelity(
+        self,
+        result_docs_groundtruth_labels: List[int],
+        ideal_docs_groundtruth_labels: List[int],
+    ) -> float:
+        """Fidelity calculated over all documents retrieved from a search query.
+        Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+        :type ideal_docs_groundtruth_labels: List[int]
+        :return: The fidelity calculation result.
+        :rtype: float
+        """
+
+        def calculate_weighted_sum_by_rating(labels: List[int]) -> float:
+            # here we assume that the configured groundtruth label minimum translates to "irrelevant",
+            # so we exclude documents with that label from the calculation.
+            s = self.ground_truth_label_min + 1
+
+            # get a count of each label
+            label_counts = {str(i): 0 for i in range(s, self.ground_truth_label_max + 1)}
+
+            for label in labels:
+                if label >= s:
+                    label_counts[str(label)] += 1
+
+            sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]
+
+            # calculate weights
+            weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]
+
+            # return weighted sum
+            return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))
+
+        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)
+
+        if weighted_sum_by_rating_index == 0:
+            return math.nan
+
+        return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
+
+    def _get_binary_result(self, **metrics) -> Dict[str, float]:
+        result: Dict[str, Any] = {}
+
+        for metric_name, metric_value in metrics.items():
+            if metric_name in self._threshold_metrics.keys():
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                )
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
+                result[f"{metric_name}_higher_is_better"] = True
+
+            elif metric_name in self._threshold_holes.keys():
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+                )
+                result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
+                result[f"{metric_name}_higher_is_better"] = False
+
+            else:
+                raise ValueError(f"No threshold set for metric '{metric_name}'")
+
+        return result
+
+    def _validate_eval_input(
+        self, eval_input: Dict
+    ) -> Tuple[List[RetrievalGroundTruthDocument], List[RetrievedDocument]]:
+        """Validate document retrieval evaluator inputs.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Tuple[List[azure.ai.evaluation.RetrievalGroundTruthDocument], List[azure.ai.evaluation.RetrievedDocument]]
+        """
+        retrieval_ground_truth = eval_input.get("retrieval_ground_truth")
+        retrieved_documents = eval_input.get("retrieved_documents")
+
+        # if the qrels are empty, no meaningful evaluation is possible
+        if not retrieval_ground_truth:
+            raise EvaluationException(
+                (
+                    "'retrieval_ground_truth' parameter must contain at least one item. "
+                    "Check your data input to be sure that each input record has ground truth defined."
+                )
+            )
+
+        qrels = []
+
+        # validate the qrels to be sure they are the correct type and are bounded by the given configuration
+        for qrel in retrieval_ground_truth:
+            document_id = qrel.get("document_id")
+            query_relevance_label = qrel.get("query_relevance_label")
+
+            if document_id is None or query_relevance_label is None:
+                raise EvaluationException(
+                    (
+                        "Invalid input data was found in the retrieval ground truth. "
+                        "Ensure that all items in the 'retrieval_ground_truth' array contain "
+                        "'document_id' and 'query_relevance_label' properties."
+                    )
+                )
+
+            if not isinstance(query_relevance_label, int):
+                raise EvaluationException("Query relevance labels must be integer values.")
+
+            if query_relevance_label < self.ground_truth_label_min:
+                raise EvaluationException(
+                    (
+                        "A query relevance label less than the configured minimum value was detected in the evaluation input data. "
+                        "Check the range of ground truth label values in the input data and set the value of ground_truth_label_min to "
+                        "the appropriate value for your data."
+                    )
+                )
+
+            if query_relevance_label > self.ground_truth_label_max:
+                raise EvaluationException(
+                    (
+                        "A query relevance label greater than the configured maximum value was detected in the evaluation input data. "
+                        "Check the range of ground truth label values in the input data and set the value of ground_truth_label_max to "
+                        "the appropriate value for your data."
+                    )
+                )
+
+            qrels.append(qrel)
+
+        # validate retrieved documents to be sure they are the correct type
+        results = []
+
+        if isinstance(retrieved_documents, list):
+            for result in retrieved_documents:
+                document_id = result.get("document_id")
+                relevance_score = result.get("relevance_score")
+
+                if document_id is None or relevance_score is None:
+                    raise EvaluationException(
+                        (
+                            "Invalid input data was found in the retrieved documents. "
+                            "Ensure that all items in the 'retrieved_documents' array contain "
+                            "'document_id' and 'relevance_score' properties."
+                        )
+                    )
+
+                if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+                    raise EvaluationException("Retrieved document relevance score must be a numerical value.")
+
+                results.append(result)
+
+        if len(qrels) > 10000 or len(results) > 10000:
+            raise EvaluationException(
+                "'retrieval_ground_truth' and 'retrieved_documents' inputs should contain no more than 10000 items."
+            )
+
+        return qrels, results
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a document retrieval evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict[str, float]
+        """
+        qrels, results = self._validate_eval_input(eval_input)
+
+        # if the results set is empty, results are all zero
+        if len(results) == 0:
+            metrics = {
+                f"ndcg@{self.k}": 0.0,
+                f"xdcg@{self.k}": 0.0,
+                "fidelity": 0.0,
+                "top1_relevance": 0.0,
+                "top3_max_relevance": 0.0,
+                "holes": 0,
+                "holes_ratio": 0,
+                "total_retrieved_documents": len(results),
+                "total_ground_truth_documents": len(qrels),
+            }
+            binary_result = self._get_binary_result(**metrics)
+            for k, v in binary_result.items():
+                metrics[k] = v
+
+            return metrics
+
+        # flatten qrels and results to normal dictionaries
+        qrels_lookup = {x["document_id"]: x["query_relevance_label"] for x in qrels}
+        results_lookup = {x["document_id"]: x["relevance_score"] for x in results}
+
+        # sort each input set by label to get the ranking
+        qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+        results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)
+
+        # find ground truth labels for the results set and ideal set
+        result_docs_groundtruth_labels = [
+            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
+        ]
+        ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]
+
+        # calculate the proportion of result docs with no ground truth label (holes)
+        holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
+        holes_ratio = holes / float(len(results))
+
+        # if none of the retrieved docs are labeled, report holes only
+        if not any(result_docs_groundtruth_labels):
+            metrics = {
+                f"ndcg@{self.k}": 0,
+                f"xdcg@{self.k}": 0,
+                "fidelity": 0,
+                "top1_relevance": 0,
+                "top3_max_relevance": 0,
+                "holes": holes,
+                "holes_ratio": holes_ratio,
+                "total_retrieved_documents": len(results),
+                "total_ground_truth_documents": len(qrels),
+            }
+            binary_result = self._get_binary_result(**metrics)
+            for k, v in binary_result.items():
+                metrics[k] = v
+
+            return metrics
+
+        metrics = {
+            f"ndcg@{self.k}": self._compute_ndcg(
+                result_docs_groundtruth_labels[: self.k],
+                ideal_docs_groundtruth_labels[: self.k],
+            ),
+            f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+            "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
+            "top1_relevance": result_docs_groundtruth_labels[0],
+            "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
+            "holes": holes,
+            "holes_ratio": holes_ratio,
+            "total_retrieved_documents": len(results),
+            "total_ground_truth_documents": len(qrels),
+        }
+
+        binary_result = self._get_binary_result(**metrics)
+        for k, v in binary_result.items():
+            metrics[k] = v
+
+        return metrics
+
+    @overload
+    def __call__(  # type: ignore
+        self,
+        *,
+        retrieval_ground_truth: List[RetrievalGroundTruthDocument],
+        retrieved_documents: List[RetrievedDocument],
+    ) -> Dict[str, float]:
+        """
+        Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+        Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+        :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+        :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+        :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+        :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+        :return: The document retrieval metrics.
+        :rtype: Dict[str, float]
+        """
+
+    @override
+    def __call__(self, *args, **kwargs):
+        """
+        Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+        Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+        :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+        :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+        :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+        :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+        :return: The document retrieval metrics.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
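
For orientation, a minimal usage sketch of the new DocumentRetrievalEvaluator follows. It assumes the 1.13.3 wheel is installed and that the class is re-exported from the azure.ai.evaluation package root (the docstring's azure.ai.evaluation.RetrievalGroundTruthDocument references suggest this); the document IDs, labels, and scores are illustrative only.

# Minimal usage sketch for DocumentRetrievalEvaluator; sample data is illustrative.
from azure.ai.evaluation import DocumentRetrievalEvaluator

# Ground-truth judgements: one dict per labeled document, matching RetrievalGroundTruthDocument.
retrieval_ground_truth = [
    {"document_id": "doc_1", "query_relevance_label": 4},
    {"document_id": "doc_2", "query_relevance_label": 2},
    {"document_id": "doc_3", "query_relevance_label": 0},
]

# Documents returned by the search system with its scores, matching RetrievedDocument.
retrieved_documents = [
    {"document_id": "doc_2", "relevance_score": 45.1},
    {"document_id": "doc_1", "relevance_score": 35.8},
    {"document_id": "doc_9", "relevance_score": 12.3},  # unlabeled, so it counts as a "hole"
]

evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=4)
result = evaluator(
    retrieval_ground_truth=retrieval_ground_truth,
    retrieved_documents=retrieved_documents,
)
print(result["ndcg@3"], result["fidelity"], result["holes"])

The unlabeled doc_9 entry is what the evaluator reports as a hole, which is why the holes metrics default to a threshold of zero.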
--- a/azure/ai/evaluation/_evaluators/_eci/_eci.py
+++ b/azure/ai/evaluation/_evaluators/_eci/_eci.py
@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]
 
@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
     id = "eci"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
        self,
        credential,
        azure_ai_project,
+       **kwargs,
    ):
        super().__init__(
            eval_metric=_InternalEvaluationMetrics.ECI,
            azure_ai_project=azure_ai_project,
            credential=credential,
+           **kwargs,
        )
 
    @overload
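
The docstring change above widens azure_ai_project to accept either a project endpoint URL string or the classic AzureAIProject scope. A hedged sketch of both call shapes follows, using the module path from the file list; the endpoint and scope values are placeholders.

# Hedged sketch: both azure_ai_project shapes described in the updated docstring.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator

credential = DefaultAzureCredential()

# Option 1: project endpoint string, in the format shown in the docstrings added in this release.
eci_from_endpoint = ECIEvaluator(
    credential=credential,
    azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
)

# Option 2: classic AzureAIProject scope, still accepted per the updated type annotation.
eci_from_scope = ECIEvaluator(
    credential=credential,
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
)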
--- a/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py
+++ b/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py
@@ -3,45 +3,68 @@
 # ---------------------------------------------------------
 
 from collections import Counter
-from typing import List
+from typing import List, Dict
+from typing_extensions import overload, override
 
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
+class F1ScoreEvaluator(EvaluatorBase):
+    """
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
 
-class _AsyncF1ScoreEvaluator:
-    def __init__(self):
-        pass
+    F1 Scores range from 0 to 1, with 1 being the best possible score.
 
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate F1 score.
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.
 
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The F1 score.
-        :rtype: Dict[str, float]
-        """
-        # Validate inputs
-        if not (response and response.strip() and response != "None") or not (
-            ground_truth and ground_truth.strip() and ground_truth != "None"
-        ):
-            msg = "Both 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.
 
-        # Run f1 score computation.
-        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
+    :param threshold: The threshold for the F1 score evaluator. Default is 0.5.
+    :type threshold: float
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_f1_score_evaluator]
+            :end-before: [END threshold_f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call an F1ScoreEvaluator.
+    """
+
+    id = "azureai://built-in/evaluators/f1_score"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-        return {"f1_score": f1_result}
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
 
    @classmethod
    def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +126,34 @@ class _AsyncF1ScoreEvaluator:
 
        return f1
 
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce an f1 score evaluation result.
 
-class F1ScoreEvaluator:
-    """
-    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
-
-    F1 Scores range from 0 to 1, with 1 being the best possible score.
-
-    The F1-score computes the ratio of the number of shared words between the model generation and
-    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
-    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
-    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
-    is the ratio of the number of shared words to the total number of words in the ground truth.
-
-    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
-    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
-    information in the response.
-
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START f1_score_evaluator]
-            :end-before: [END f1_score_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call an F1ScoreEvaluator.
-    """
-
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
-
-    def __init__(self):
-        self._async_evaluator = _AsyncF1ScoreEvaluator()
-
-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        # Run f1 score computation.
+        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
+        binary_result = False
+        if self._higher_is_better:
+            if f1_result >= self._threshold:
+                binary_result = True
+        else:
+            if f1_result <= self._threshold:
+                binary_result = True
+        return {
+            "f1_score": f1_result,
+            "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "f1_threshold": self._threshold,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
        """
        Evaluate F1 score.
 
@@ -149,9 +165,20 @@ class F1ScoreEvaluator:
        :rtype: Dict[str, float]
        """
 
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate F1 score.
 
-    def _to_async(self):
-        return self._async_evaluator
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
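
A short, hedged sketch of the reworked F1ScoreEvaluator follows. The threshold keyword and the f1_score, f1_result, and f1_threshold output keys come from the hunks above; the example strings are illustrative.

# Hedged sketch of the reworked F1ScoreEvaluator (threshold added in 1.13.3).
from azure.ai.evaluation import F1ScoreEvaluator

f1_eval = F1ScoreEvaluator(threshold=0.6)
result = f1_eval(
    response="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is the capital of Japan.",
)

# Expected keys per the new _do_eval implementation:
# "f1_score" (float), "f1_result" ("pass" or "fail"), "f1_threshold" (0.6 here).
print(result)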