azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py (new file)
@@ -0,0 +1,176 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import asyncio
+ import logging
+ import pandas as pd
+ import sys
+ import itertools
+ from collections import defaultdict
+ from concurrent.futures import Future
+ from os import PathLike
+ from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
+
+ from .batch_clients import BatchClientRun, HasAsyncCallable
+ from ..._legacy._batch_engine._run_submitter import RunSubmitter
+ from ..._legacy._batch_engine._config import BatchEngineConfig
+ from ..._legacy._batch_engine._run import Run
+ from ..._legacy._adapters._constants import LINE_NUMBER
+ from ..._legacy._adapters.types import AttrDict
+ from ..._legacy._common._thread_pool_executor_with_context import ThreadPoolExecutorWithContext
+ from ..._evaluate._utils import _has_aggregator
+ from ..._constants import Prefixes, PF_BATCH_TIMEOUT_SEC
+
+ from .._utils import get_int_env_var as get_int
+
+
+ LOGGER = logging.getLogger("run")
+ MISSING_VALUE: Final[int] = sys.maxsize
+
+
+ class RunSubmitterClient:
+     def __init__(self, *, raise_on_errors: bool = False, config: Optional[BatchEngineConfig] = None) -> None:
+         if config:
+             self._config = config
+         else:
+             # Generate default config and apply any overrides to the configuration from environment variables
+             self._config = BatchEngineConfig(LOGGER, use_async=True)
+             if (val := get_int(PF_BATCH_TIMEOUT_SEC, MISSING_VALUE)) != MISSING_VALUE:
+                 self._config.batch_timeout_seconds = val
+             if (val := get_int("PF_LINE_TIMEOUT_SEC", MISSING_VALUE)) != MISSING_VALUE:
+                 self._config.line_timeout_seconds = val
+             if (val := get_int("PF_WORKER_COUNT", MISSING_VALUE)) != MISSING_VALUE:
+                 self._config.max_concurrency = val
+
+         self._config.raise_on_error = raise_on_errors
+
+         self._thread_pool = ThreadPoolExecutorWithContext(
+             thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
+         )
+
+     def run(
+         self,
+         flow: Callable,
+         data: Union[str, PathLike, pd.DataFrame],
+         column_mapping: Optional[Dict[str, str]] = None,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> BatchClientRun:
+         if not isinstance(data, pd.DataFrame):
+             raise ValueError("Data must be a pandas DataFrame")
+
+         # The column mappings are indexed by data to indicate they come from the data
+         # input. Update the inputs so that each entry is a dictionary with a data key
+         # that contains the original input data.
+         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
+         # Pass the correct previous run to the evaluator
+         run: Optional[BatchClientRun] = kwargs.pop("run", None)
+         if run:
+             kwargs["run"] = self._get_run(run)
+
+         # Try to get async function to use
+         if isinstance(flow, HasAsyncCallable):
+             flow = flow._to_async()  # pylint: disable=protected-access
+
+         # Start an event loop for async execution on a thread pool thread to separate it
+         # from the caller's thread.
+         run_submitter = RunSubmitter(self._config, self._thread_pool)
+         run_future = self._thread_pool.submit(
+             asyncio.run,
+             run_submitter.submit(
+                 dynamic_callable=flow,
+                 inputs=inputs,
+                 column_mapping=column_mapping,
+                 name_prefix=evaluator_name,
+                 created_on=kwargs.pop("created_on", None),
+                 storage_creator=kwargs.pop("storage_creator", None),
+                 **kwargs,
+             ),
+         )
+
+         return run_future
+
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+         run = self._get_run(client_run)
+
+         def concat(*dataframes: pd.DataFrame) -> pd.DataFrame:
+             return pd.concat(dataframes, axis=1, verify_integrity=True)
+
+         def to_dataframe(items: Sequence[Mapping[str, Any]], *, max_length: Optional[int] = None) -> pd.DataFrame:
+             """Convert a sequence of dictionaries to a DataFrame.
+
+             :param items: Sequence of dictionaries to convert.
+             :type items: Sequence[Mapping[str, Any]]
+             :param max_length: Maximum number of items to include in the DataFrame. If None, include all items.
+             :type max_length: Optional[int]
+             :return: DataFrame containing the items.
+             :rtype: pd.DataFrame
+             """
+             max_length = None if all_results else self._config.default_num_results
+             return pd.DataFrame(data=items if all_results else itertools.islice(items, max_length))
+
+         inputs = concat(
+             to_dataframe(run.inputs), to_dataframe([{LINE_NUMBER: i} for i in range(len(run.inputs))])
+         ).add_prefix(Prefixes.INPUTS)
+
+         outputs = to_dataframe(run.outputs).add_prefix(Prefixes.OUTPUTS)
+
+         return concat(inputs, outputs)
+
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run = self._get_run(client_run)
+         return {**run.metrics, **self._get_aggregated_metrics(client_run)}
+
+     def _get_aggregated_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         aggregated_metrics = None
+         run = self._get_run(client_run)
+         try:
+             if _has_aggregator(run.dynamic_callable):
+                 result_df = pd.DataFrame(run.outputs)
+                 if len(result_df.columns) == 1 and result_df.columns[0] == "output":
+                     aggregate_input = result_df["output"].tolist()
+                 else:
+                     aggregate_input = [AttrDict(item) for item in result_df.to_dict("records")]
+
+                 aggr_func = getattr(run.dynamic_callable, "__aggregate__")
+                 aggregated_metrics = aggr_func(aggregate_input)
+
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.warning("Error calculating aggregations for evaluator, failed with error %s", ex)
+
+         if not isinstance(aggregated_metrics, dict):
+             LOGGER.warning(
+                 "Aggregated metrics for evaluator is not a dictionary will not be logged as metrics",
+             )
+             return {}
+
+         return aggregated_metrics
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run = self._get_run(client_run)
+
+         total_lines = run.result.total_lines if run.result else 0
+         failed_lines = run.result.failed_lines if run.result else 0
+
+         return {
+             "status": run.status.value,
+             "duration": str(run.duration),
+             "completed_lines": total_lines - failed_lines,
+             "failed_lines": failed_lines,
+             "log_path": None,
+             "error_message": (
+                 f"({run.result.error.blame.value}) {run.result.error.message}"
+                 if run.result and run.result.error and run.result.error.blame
+                 else None
+             ),
+             "error_code": (
+                 f"{run.result.error.category.value}"
+                 if run.result and run.result.error and run.result.error.category
+                 else None
+             ),
+         }
+
+     @staticmethod
+     def _get_run(run: BatchClientRun) -> Run:
+         return cast(Future[Run], run).result()
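
The new RunSubmitterClient above accepts only an in-memory pandas DataFrame and wraps each record under a "data" key before handing it to the legacy batch engine. Below is a minimal, self-contained sketch of that wrapping step; the "${data.query}"-style mapping syntax mentioned in the comments is an assumption borrowed from promptflow-style column mappings, not something this file defines.

import pandas as pd

# Each DataFrame record is wrapped under a "data" key, mirroring the inputs list built in
# RunSubmitterClient.run above, so column mappings can address the original columns.
data = pd.DataFrame([{"query": "What is 2+2?", "response": "4"}])
inputs = [{"data": record} for record in data.to_dict(orient="records")]
print(inputs)  # [{'data': {'query': 'What is 2+2?', 'response': '4'}}]

# A mapping such as {"query": "${data.query}"} would then resolve "query" for each line
# from inputs[i]["data"]["query"].
print(inputs[0]["data"]["query"])  # What is 2+2?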

azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py (new file)
@@ -0,0 +1,82 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import pandas
+ from os import PathLike
+ from typing import Any, Awaitable, Callable, Dict, Optional, Protocol, Union, runtime_checkable
+
+
+ class BatchClientRun(Protocol):
+     """The protocol for the batch client run."""
+
+     pass
+
+
+ @runtime_checkable
+ class HasAsyncCallable(Protocol):
+     """The protocol for an object that has an async callable."""
+
+     def _to_async(self) -> Callable[[Any, Any], Awaitable[Any]]: ...
+
+
+ class BatchClient(Protocol):
+     """The protocol for the batch client. This allows for running a flow on a data source
+     and getting the details of the run."""
+
+     def run(
+         self,
+         flow: Callable,
+         data: Union[str, PathLike, pandas.DataFrame],
+         column_mapping: Optional[Dict[str, str]] = None,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> BatchClientRun:
+         """Run the given flow on the data with the given column mapping.
+
+         :param flow: The flow to run.
+         :type flow: Union[Callable, HasAsyncCallable]
+         :param data: The JSONL file containing the data to run the flow on,
+             or the loaded data
+         :type data: Union[str, PathLike]
+         :param column_mapping: The column mapping to use.
+         :type column_mapping: Mapping[str, str]
+         :param name: The name of the run.
+         :type name: Optional[str]
+         :param kwargs: Additional keyword arguments to pass to the flow.
+         :return: The result of the batch client run.
+         :rtype: BatchClientRun
+         """
+         ...
+
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pandas.DataFrame:
+         """Get the details of the run.
+
+         :param client_run: The run to get the details of.
+         :type client_run: BatchClientRun
+         :param all_results: Whether to get all results.
+         :type all_results: bool
+         :return: The details of the run.
+         :rtype: pandas.DataFrame
+         """
+         ...
+
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         """Get the metrics of the run.
+
+         :param client_run: The run to get the metrics of.
+         :type client_run: BatchClientRun
+         :return: The metrics of the run.
+         :rtype: Mapping[str, Any]
+         """
+         ...
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         """Get the summary of the run.
+
+         :param client_run: The run to get the summary of.
+         :type client_run: BatchClientRun
+         :return: The summary of the run.
+         :rtype: Mapping[str, Any]
+         """
+         ...
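
Because HasAsyncCallable above is declared @runtime_checkable, the isinstance checks used by the batch clients are purely structural: any object exposing a _to_async method matches, with no inheritance required. A small, self-contained sketch follows; the protocol is redeclared locally and LengthEvaluator is a hypothetical evaluator, not part of the package.

import asyncio
from typing import Any, Awaitable, Callable, Protocol, runtime_checkable

@runtime_checkable
class HasAsyncCallable(Protocol):
    def _to_async(self) -> Callable[[Any, Any], Awaitable[Any]]: ...

class LengthEvaluator:
    def __call__(self, *, response: str) -> dict:
        return {"length": len(response)}

    def _to_async(self):
        # Wrap the synchronous __call__ in a coroutine, roughly how an evaluator might expose one.
        async def _async_call(*args, **kwargs):
            return self(*args, **kwargs)

        return _async_call

evaluator = LengthEvaluator()
print(isinstance(evaluator, HasAsyncCallable))             # True: structural match
print(asyncio.run(evaluator._to_async()(response="hi")))   # {'length': 2}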

azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py
@@ -5,42 +5,49 @@ import inspect
  import json
  import logging
  import os
- from pathlib import Path
- from typing import Callable, Dict, Optional, Union
+ from concurrent.futures import Future
+ from typing import Any, Callable, Dict, Optional, Sequence, Union, cast

  import pandas as pd
- from promptflow.contracts.types import AttrDict
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from azure.ai.evaluation._legacy._adapters.types import AttrDict
+ from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

  from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

  from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
+ from .batch_clients import BatchClientRun

  LOGGER = logging.getLogger(__name__)


  class CodeRun:
      def __init__(
-         self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs  # pylint: disable=unused-argument
-     ):
+         self,
+         *,
+         run: Future,
+         input_data,
+         evaluator_name: Optional[str] = None,
+         aggregator: Callable[["CodeRun"], Future],
+         **kwargs,  # pylint: disable=unused-argument
+     ) -> None:
          self.run = run
          self.evaluator_name = evaluator_name if evaluator_name is not None else ""
          self.input_data = input_data
-         self.aggregated_metrics = aggregated_metrics
+         self.aggregated_metrics = aggregator(self)

-     def get_result_df(self, exclude_inputs=False):
+     def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
          batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-         result_df = self.run.result(timeout=batch_run_timeout)
+         result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
          if exclude_inputs:
              result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
          return result_df

-     def get_aggregated_metrics(self):
+     def get_aggregated_metrics(self) -> Dict[str, Any]:
          try:
              batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-             aggregated_metrics = (
-                 self.aggregated_metrics.result(timeout=batch_run_timeout)
+             aggregated_metrics: Optional[Any] = (
+                 cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                  if self.aggregated_metrics is not None
                  else None
              )
@@ -77,7 +84,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
              for param in inspect.signature(evaluator).parameters.values()
              if param.name not in ["args", "kwargs"]
          }
-         for value in input_df.to_dict("records"):
+         for value in cast(Sequence[Dict[str, Any]], input_df.to_dict("records")):
              # Filter out only the parameters that are present in the input data
              # if no parameters then pass data as is
              filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
@@ -104,10 +111,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
              verify_integrity=True,
          )

-     def _calculate_aggregations(self, evaluator, run):
+     @staticmethod
+     def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
          try:
              if _has_aggregator(evaluator):
-                 aggregate_input = None
                  evaluator_output = run.get_result_df(exclude_inputs=True)
                  if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                      aggregate_input = evaluator_output["output"].tolist()
@@ -126,10 +133,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
      def run(
          self,  # pylint: disable=unused-argument
          flow: Callable,
-         data: Union[os.PathLike, Path, pd.DataFrame],
-         evaluator_name: Optional[str] = None,
+         data: Union[str, os.PathLike, pd.DataFrame],
          column_mapping: Optional[Dict[str, str]] = None,
-         **kwargs,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
      ) -> CodeRun:
          input_df = data
          if not isinstance(input_df, pd.DataFrame):
@@ -150,23 +157,38 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
              evaluator=flow,
              input_df=input_df,
              column_mapping=column_mapping,
+             evaluator_name=evaluator_name or "",
+         )
+
+         return CodeRun(
+             run=eval_future,
+             input_data=data,
              evaluator_name=evaluator_name,
+             aggregator=lambda code_run: self._thread_pool.submit(
+                 self._calculate_aggregations, evaluator=flow, run=code_run
+             ),
          )
-         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
-         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
-         run.aggregated_metrics = aggregation_future
-         return run

-     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+         run = self._get_result(client_run)
          result_df = run.get_result_df(exclude_inputs=not all_results)
          return result_df

-     def get_metrics(self, run: CodeRun) -> Optional[None]:
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run = self._get_result(client_run)
          try:
              aggregated_metrics = run.get_aggregated_metrics()
              print("Aggregated metrics")
              print(aggregated_metrics)
          except Exception as ex:  # pylint: disable=broad-exception-caught
              LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-             return None
+             return {}
          return aggregated_metrics
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Any:  # pylint: disable=unused-argument
+         # Not implemented
+         return None
+
+     @staticmethod
+     def _get_result(run: BatchClientRun) -> CodeRun:
+         return cast(CodeRun, run)

azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py}
@@ -2,47 +2,61 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import os
-
- from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
- from promptflow._utils.user_agent_utils import ClientUserAgentUtil
- from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+ import types
+ from typing import Optional, Type, Union
+
+ from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
+ from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
+ from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
+ from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
+     inject_openai_api as ported_inject_openai_api,
+     recover_openai_api as ported_recover_openai_api,
+ )

  from azure.ai.evaluation._constants import (
      OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
      OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
      PF_BATCH_TIMEOUT_SEC,
      PF_BATCH_TIMEOUT_SEC_DEFAULT,
+     PF_DISABLE_TRACING,
  )

- from ..._user_agent import USER_AGENT
+ from ..._user_agent import UserAgentSingleton
  from .._utils import set_event_loop_policy
+ from .batch_clients import BatchClient
+ from ._run_submitter_client import RunSubmitterClient
  from .code_client import CodeClient
  from .proxy_client import ProxyClient


- class BatchRunContext:
-     """Context manager for batch run clients.
+ class EvalRunContext:
+     """Context manager for eval batch run.

      :param client: The client to run in the context.
      :type client: Union[
-         ~azure.ai.evaluation._evaluate._batch_run_client.code_client.CodeClient,
-         ~azure.ai.evaluation._evaluate._batch_run_client.proxy_client.ProxyClient
+         ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
+         ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
      ]
      """

-     def __init__(self, client) -> None:
+     def __init__(self, client: BatchClient) -> None:
          self.client = client
          self._is_batch_timeout_set_by_system = False
          self._is_otel_timeout_set_by_system = False
+         self._original_cwd = os.getcwd()
+
+     def __enter__(self) -> None:
+         # Preserve current working directory, as PF may change it without restoring it afterward
+         self._original_cwd = os.getcwd()

-     def __enter__(self):
          if isinstance(self.client, CodeClient):
-             ClientUserAgentUtil.append_user_agent(USER_AGENT)
+             ClientUserAgentUtil.append_user_agent(UserAgentSingleton().value)
              inject_openai_api()

          if isinstance(self.client, ProxyClient):
              os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
              os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+             os.environ[PF_DISABLE_TRACING] = "true"

          if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
              os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -56,13 +70,25 @@ class BatchRunContext:
              # For addressing the issue of asyncio event loop closed on Windows
              set_event_loop_policy()

-     def __exit__(self, exc_type, exc_val, exc_tb):
+         if isinstance(self.client, RunSubmitterClient):
+             set_event_loop_policy()
+             ported_inject_openai_api()
+
+     def __exit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_value: Optional[BaseException],
+         exc_tb: Optional[types.TracebackType],
+     ) -> None:
+         os.chdir(self._original_cwd)
+
          if isinstance(self.client, CodeClient):
              recover_openai_api()

          if isinstance(self.client, ProxyClient):
              os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
              os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+             os.environ.pop(PF_DISABLE_TRACING, None)

          if self._is_batch_timeout_set_by_system:
              os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
@@ -71,3 +97,6 @@ class BatchRunContext:
          if self._is_otel_timeout_set_by_system:
              os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
              self._is_otel_timeout_set_by_system = False
+
+         if isinstance(self.client, RunSubmitterClient):
+             ported_recover_openai_api()
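
EvalRunContext above follows a save-and-restore pattern: it records the working directory and remembers which environment defaults it sets on __enter__, then undoes only its own changes on __exit__. Below is a generic, self-contained sketch of that pattern; EnvDefaultContext and the timeout value are illustrative stand-ins, not the package's class.

import os

class EnvDefaultContext:
    """Set an environment default on enter, restore prior state on exit."""

    def __init__(self, name: str, default: str) -> None:
        self._name = name
        self._default = default
        self._set_by_us = False
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        self._original_cwd = os.getcwd()
        if os.environ.get(self._name) is None:
            os.environ[self._name] = self._default
            self._set_by_us = True

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        os.chdir(self._original_cwd)          # undo any cwd change made by the wrapped work
        if self._set_by_us:
            os.environ.pop(self._name, None)  # only remove the value we set ourselves

with EnvDefaultContext("PF_BATCH_TIMEOUT_SEC", "3600"):  # illustrative default value
    pass  # run the batch client here; the default applies only if the user did not set one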

azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py (new file)
@@ -0,0 +1,124 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ # pylint: disable=protected-access
+
+ import inspect
+ import logging
+ import math
+ import os
+ from datetime import datetime
+ from collections import OrderedDict
+ from concurrent.futures import Future
+ from typing import Any, Callable, Dict, Optional, Union, cast
+
+ from azure.ai.evaluation._legacy._adapters.entities import Run
+ from azure.ai.evaluation._legacy._adapters._configuration import Configuration
+ from azure.ai.evaluation._legacy._adapters.client import PFClient
+ from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext
+ import pandas as pd
+
+ from azure.ai.evaluation._evaluate._batch_run.batch_clients import BatchClientRun, HasAsyncCallable
+
+
+ Configuration.get_instance().set_config("trace.destination", "none")
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ProxyRun:
+     def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
+         self.run = run
+
+
+ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
+     def __init__(  # pylint: disable=missing-client-constructor-parameter-credential
+         self,
+         **kwargs: Any,
+     ) -> None:
+         self._pf_client = PFClient(**kwargs)
+         self._thread_pool = ThreadPoolExecutorWithContext(thread_name_prefix="evaluators_thread")
+
+     def run(
+         self,
+         flow: Callable,
+         data: Union[str, os.PathLike, pd.DataFrame],
+         column_mapping: Optional[Dict[str, str]] = None,
+         evaluator_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> ProxyRun:
+         if isinstance(data, pd.DataFrame):
+             raise ValueError("Data cannot be a pandas DataFrame")
+
+         flow_to_run: Callable = flow
+         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and isinstance(flow, HasAsyncCallable):
+             flow_to_run = flow._to_async()  # pylint: disable=protected-access
+
+         name: str = kwargs.pop("name", "")
+         if not name:
+             name = f"azure_ai_evaluation_evaluators_{evaluator_name}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
+
+         # Pass the correct previous run to the evaluator
+         run: Optional[BatchClientRun] = kwargs.pop("run", None)
+         if run:
+             kwargs["run"] = self.get_result(run)
+
+         batch_use_async = self._should_batch_use_async(flow_to_run)
+         eval_future = self._thread_pool.submit(
+             self._pf_client.run,
+             flow_to_run,
+             data=data,
+             column_mapping=column_mapping,  # type: ignore
+             batch_use_async=batch_use_async,
+             name=name,
+             **kwargs,
+         )
+         return ProxyRun(run=eval_future)
+
+     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+         run: Run = self.get_result(client_run)
+         result_df = self._pf_client.get_details(run, all_results=all_results)
+         result_df.replace("(Failed)", math.nan, inplace=True)
+         return result_df
+
+     def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run: Run = self.get_result(client_run)
+         return self._pf_client.get_metrics(run)
+
+     def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
+         run: Run = self.get_result(client_run)
+
+         # pylint: disable=protected-access
+         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+         failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+         # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+         if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+             status = "Completed with Errors"
+         else:
+             status = run.status
+
+         # Return the ordered dictionary with the updated status
+         return OrderedDict(
+             [
+                 ("status", status),
+                 ("duration", str((run._end_time or run._created_on) - run._created_on)),
+                 ("completed_lines", completed_lines),
+                 ("failed_lines", failed_lines),
+                 ("log_path", str(run._output_path)),
+             ]
+         )
+
+     @staticmethod
+     def get_result(run: BatchClientRun) -> Run:
+         return cast(ProxyRun, run).run.result()
+
+     @staticmethod
+     def _should_batch_use_async(flow):
+         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+                 return True
+             if inspect.iscoroutinefunction(flow):
+                 return True
+             return False
+         return False
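
ProxyClient._should_batch_use_async above decides per flow whether to run the promptflow batch engine in async mode: it is gated by the AI_EVALS_BATCH_USE_ASYNC environment variable (default "true") and then checks whether the flow, or its __call__, is a coroutine function. A self-contained sketch of the same check; AsyncEvaluator and should_use_async are hypothetical names for illustration only.

import asyncio
import inspect
import os

class AsyncEvaluator:
    async def __call__(self, *, response: str) -> dict:
        await asyncio.sleep(0)
        return {"length": len(response)}

def should_use_async(flow) -> bool:
    # Mirrors the logic of _should_batch_use_async shown above.
    if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
        if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
            return True
        if inspect.iscoroutinefunction(flow):
            return True
    return False

print(should_use_async(AsyncEvaluator()))     # True when AI_EVALS_BATCH_USE_ASYNC is unset or "true"
print(should_use_async(lambda response: {}))  # False: a plain synchronous callable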