azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluate/_telemetry/__init__.py
+++ b/azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -6,41 +6,38 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict, TypeVar
+from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
-from promptflow._sdk.entities._flows import FlexFlow as flex_flow
-from promptflow._sdk.entities._flows import Prompty as prompty_sdk
-from promptflow._sdk.entities._flows.dag import Flow as dag_flow
-from promptflow.client import PFClient
-from promptflow.core import Prompty as prompty_core
+from azure.ai.evaluation._legacy._adapters._flows import FlexFlow as flex_flow
+from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty as prompty_sdk
+from azure.ai.evaluation._legacy._adapters._flows import Flow as dag_flow
+from azure.ai.evaluation._legacy._adapters.client import PFClient
 from typing_extensions import ParamSpec
 
-from ..._user_agent import USER_AGENT
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+
 from .._utils import _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
-R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
     """
     Get evaluator type for telemetry.
 
     :param evaluator: The evaluator object
     :type evaluator: Dict[str, Callable]
     :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype: str
+    :rtype: Literal["content-safety", "built-in", "custom"]
     """
-    built_in = False
-    content_safety = False
-
     module = inspect.getmodule(evaluator)
-    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
-    if built_in:
-        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
 
     if content_safety:
         return "content-safety"
@@ -67,7 +64,7 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
     try:
         # Cover flex flow and prompty based evaluator
-        if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
+        if isinstance(evaluator, (prompty_sdk, flex_flow)):
             name = evaluator.name
             pf_type = evaluator.__class__.__name__
         # Cover dag flow based evaluator
@@ -95,85 +92,3 @@ def _get_evaluator_properties(evaluator, evaluator_name):
         "type": _get_evaluator_type(evaluator),
         "alias": evaluator_name if evaluator_name else "",
     }
-
-
-# cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
-    """Decorator to log evaluate activity
-
-    :param func: The function to be decorated
-    :type func: Callable
-    :returns: The decorated function
-    :rtype: Callable[P, R]
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        from promptflow._sdk._telemetry import ActivityType, log_activity
-        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
-
-        evaluators = kwargs.get("evaluators", [])
-        azure_ai_project = kwargs.get("azure_ai_project", None)
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-
-        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
-        evaluate_target = bool(kwargs.get("target", None))
-        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
-            "track_in_cloud": track_in_cloud,
-            "evaluate_target": evaluate_target,
-            "evaluator_config": evaluator_config,
-        }
-
-        with log_activity(
-            get_telemetry_logger(),
-            "pf.evals.evaluate",
-            activity_type=ActivityType.PUBLICAPI,
-            user_agent=USER_AGENT,
-            custom_dimensions=custom_dimensions,
-        ):
-            result = func(*args, **kwargs)
-
-            try:
-                evaluators_info = []
-                for evaluator_name, evaluator in evaluators.items():
-                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
-                    try:
-                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
-                            like=f"outputs.{evaluator_name}", axis=1
-                        )
-
-                        failed_rows = (
-                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
-                        )
-                        total_rows = evaluator_df.shape[0]
-
-                        evaluator_info["failed_rows"] = failed_rows
-                        evaluator_info["total_rows"] = total_rows
-                    except Exception as e:  # pylint: disable=broad-exception-caught
-                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
-                    evaluators_info.append(evaluator_info)
-
-                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
-                with log_activity(
-                    get_telemetry_logger(),
-                    "pf.evals.evaluate_usage_info",
-                    activity_type=ActivityType.PUBLICAPI,
-                    user_agent=USER_AGENT,
-                    custom_dimensions=custom_dimensions,
-                ):
-                    pass
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                LOGGER.debug("Failed to collect evaluate usage info: %s", e)
-
-        return result
-
-    return wrapper
--- a/azure/ai/evaluation/_evaluate/_utils.py
+++ b/azure/ai/evaluation/_evaluate/_utils.py
@@ -6,15 +6,30 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-from typing import Dict
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
+import uuid
+import base64
+import math
 
 import pandas as pd
+from tqdm import tqdm
 
-from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, DefaultOpenEncoding, Prefixes
-from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.core.pipeline.policies import UserAgentPolicy
+from azure.ai.evaluation._legacy._adapters.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._user_agent import UserAgentSingleton
+from azure.ai.evaluation._azure._clients import LiteMLClient
 
 LOGGER = logging.getLogger(__name__)
 
@@ -23,14 +38,22 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
     "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
 
+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str
 
-def is_none(value):
+
+def is_none(value) -> bool:
     return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
+    from azure.ai.evaluation._legacy._adapters.utils import get_workspace_triad_from_local
+
     match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
     if not match or len(match.groups()) != 5:
         raise EvaluationException(
@@ -44,10 +67,20 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.UNKNOWN,
         )
+
     subscription_id = match.group(1)
     resource_group_name = match.group(3)
     workspace_name = match.group(5)
-    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
+
+    # In theory this if statement should never evaluate to True, but we'll keep it here just in case
+    # for backwards compatibility with what the original code that depended on promptflow-azure did
+    if not (subscription_id and resource_group_name and workspace_name):
+        local = get_workspace_triad_from_local()
+        subscription_id = subscription_id or local.subscription_id or os.getenv("AZUREML_ARM_SUBSCRIPTION")
+        resource_group_name = resource_group_name or local.resource_group_name or os.getenv("AZUREML_ARM_RESOURCEGROUP")
+        workspace_name = workspace_name or local.workspace_name or os.getenv("AZUREML_ARM_WORKSPACE_NAME")
+
+    return AzureMLWorkspace(subscription_id or "", resource_group_name or "", workspace_name or "")
 
 
 def load_jsonl(path):
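
For reference, the workspace triad is pulled out of an `azureml://` trace-provider string by regex groups 1, 3, and 5. A standalone sketch; only the tail of AZURE_WORKSPACE_REGEX_FORMAT appears in the hunk above, so the leading portion here is an assumption reconstructed from the group indices the code reads, and the triad values are placeholders:

    import re

    AZURE_WORKSPACE_REGEX_FORMAT = (
        "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
        "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
    )

    trace_provider = (
        "azureml://subscriptions/<sub-id>/resourceGroups/<rg>"
        "/providers/Microsoft.MachineLearningServices/workspaces/<ws>"
    )
    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
    assert match is not None and len(match.groups()) == 5
    print(match.group(1), match.group(3), match.group(5))  # <sub-id> <rg> <ws>
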
@@ -55,49 +88,186 @@ def load_jsonl(path):
        return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination):
-    from promptflow.azure._cli._utils import _get_azure_pf_client
+def _store_multimodal_content(messages, tmpdir: str):
+    # verify if images folder exists
+    images_folder_path = os.path.join(tmpdir, "images")
+    os.makedirs(images_folder_path, exist_ok=True)
 
-    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
-    azure_pf_client = _get_azure_pf_client(
-        subscription_id=ws_triad.subscription_id,
-        resource_group=ws_triad.resource_group_name,
-        workspace_name=ws_triad.workspace_name,
+    # traverse all messages and replace base64 image data with new file name.
+    for message in messages:
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                process_message_content(content, images_folder_path)
+
+
+def process_message_content(content, images_folder_path):
+    if content.get("type", "") == "image_url":
+        image_url = content.get("image_url")
+
+        if not image_url or "url" not in image_url:
+            return None
+
+        url = image_url["url"]
+        if not url.startswith("data:image/"):
+            return None
+
+        match = re.search("data:image/([^;]+);", url)
+        if not match:
+            return None
+
+        ext = match.group(1)
+        # Extract the base64 string
+        base64image = image_url["url"].replace(f"data:image/{ext};base64,", "")
+
+        # Generate a unique filename
+        image_file_name = f"{str(uuid.uuid4())}.{ext}"
+        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path
+
+        # Decode the base64 string to binary image data
+        image_data_binary = base64.b64decode(base64image)
+
+        # Write the binary image data to the file
+        image_file_path = os.path.join(images_folder_path, image_file_name)
+        with open(image_file_path, "wb") as f:
+            f.write(image_data_binary)
+    return None
+
+
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES_MANAGEMENT.value, LOGGER, credential=kwargs.get("credential")
     )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
 
-    return azure_pf_client, ws_triad
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+        properties.update(_convert_name_map_into_property_entries(name_map))
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(), path=tmpdir, metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                properties=properties,
+                tags=tags,
+            )
+        )
+
+        # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+        evaluation_id = (
+            upload_run_response.name  # type: ignore[attr-defined]
+            if hasattr(upload_run_response, "name")
+            else upload_run_response.id
+        )
+        update_run_response = client.update_evaluation_run(
+            name=evaluation_id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    "evaluationResultId": create_evaluation_result_response.id,
+                },
+            ),
+        )
+
+        return update_run_response.properties.get("AiStudioEvaluationUri")
 
 
 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-) -> str:
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Optional[Run],
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
+    **kwargs,
+) -> Optional[str]:
+    from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
     if trace_destination is None:
-        LOGGER.error("Unable to log traces as trace destination was not defined.")
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
         return None
 
-    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
-    tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    management_client = LiteMLClient(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        logger=LOGGER,
+        credential=kwargs.get("credential"),
+        # let the client automatically determine the credentials to use
+    )
+    tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
 
     # Adding line_number as index column this is needed by UI to form link to individual instance run
     instance_results["line_number"] = instance_results.index.values
 
     with EvalRun(
         run_name=run.name if run is not None else evaluation_name,
-        tracking_uri=tracking_uri,
+        tracking_uri=cast(str, tracking_uri),
         subscription_id=ws_triad.subscription_id,
         group_name=ws_triad.resource_group_name,
         workspace_name=ws_triad.workspace_name,
-        ml_client=azure_pf_client.ml_client,
+        management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
-
-        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+        artifact_name = EvalRun.EVALUATION_ARTIFACT
 
         with tempfile.TemporaryDirectory() as tmpdir:
+            # storing multi_modal images if exists
+            col_name = "inputs.conversation"
+            if col_name in instance_results.columns:
+                for item in instance_results[col_name].items():
+                    value = item[1]
+                    if "messages" in value:
+                        _store_multimodal_content(value["messages"], tmpdir)
+
+            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
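
A self-contained sketch of the data-URL rewriting that `process_message_content` above performs: decode the base64 payload, write it under an `images/` folder, and replace the inline URL with the file path. The byte string here is a made-up stand-in, not a real image:

    import base64
    import os
    import re
    import tempfile
    import uuid

    fake_image = b"\x89PNG not a real image"  # stand-in payload
    content = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64," + base64.b64encode(fake_image).decode()},
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        images_folder = os.path.join(tmpdir, "images")
        os.makedirs(images_folder, exist_ok=True)

        url = content["image_url"]["url"]
        ext = re.search("data:image/([^;]+);", url).group(1)  # -> "png"
        payload = base64.b64decode(url.replace(f"data:image/{ext};base64,", ""))

        file_name = f"{uuid.uuid4()}.{ext}"
        with open(os.path.join(images_folder, file_name), "wb") as f:
            f.write(payload)

        # The message now references the file on disk instead of inline base64 data.
        content["image_url"]["url"] = f"images/{file_name}"
        print(content["image_url"]["url"])
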
@@ -110,11 +280,18 @@
             # adding these properties to avoid showing traces if a dummy run is created.
             # We are doing that only for the pure evaluation runs.
             if run is None:
+                properties = {
+                    EvaluationRunProperties.RUN_TYPE: "eval_run",
+                    EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                    "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                }
+                properties.update(_convert_name_map_into_property_entries(name_map))
+                ev_run.write_properties_to_run_history(properties=properties)
+            else:
                 ev_run.write_properties_to_run_history(
                     properties={
-                        "_azureml.evaluation_run": "azure-ai-generative-parent",
-                        "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
-                        "isEvaluatorRun": "true",
+                        EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
                     }
                 )
 
@@ -138,7 +315,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
     return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope: dict) -> str:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
     subscription_id = project_scope["subscription_id"]
     resource_group_name = project_scope["resource_group_name"]
     workspace_name = project_scope["project_name"]
@@ -151,17 +328,24 @@
     return trace_destination
 
 
-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     p = Path(path)
-    if os.path.isdir(path):
+    if p.is_dir():
         p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-        json.dump(data_dict, f)
+        json.dump(data_dict, f, ensure_ascii=False)
+
+    # Use tqdm.write to print message without interfering with any current progress bar
+    # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+    try:
+        tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    except Exception:
+        print(f'Evaluation results saved to "{p.resolve()}".\n')
 
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
     """
     Apply column mapping to source_df based on mapping_config.
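
The switch to `tqdm.write` in the hunk above matters because a plain `print` during an active progress bar splices its text into the bar's line, while `tqdm.write` prints above the bar and redraws it intact. A toy demonstration (not SDK code; the loop and file name are invented):

    import time
    from tqdm import tqdm

    for i in tqdm(range(3), desc="evaluating"):
        time.sleep(0.1)
        tqdm.write(f"row {i} done")  # printed above the bar; the bar stays on one line

    print('Evaluation results saved to "eval_results.jsonl".')  # hypothetical path
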
@@ -211,7 +395,7 @@
     return result_df
 
 
-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
     return hasattr(evaluator, "__aggregate__")
 
 
@@ -234,11 +418,76 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
     return default_value
 
 
-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
     import asyncio
     import platform
 
     if platform.system().lower() == "windows":
         # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
         # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+# textwrap.wrap tries to do fancy nonsense that we don't want
+def _wrap(s, w):
+    return [s[i : i + w] for i in range(0, len(s), w)]
+
+
+def _convert_name_map_into_property_entries(
+    name_map: Dict[str, str], segment_length: int = 950, max_segments: int = 10
+) -> Dict[str, Any]:
+    """
+    Convert the name map into property entries.
+
+    :param name_map: The name map to be converted.
+    :type name_map: Dict[str, str]
+    :param segment_length: The max length of each individual segment,
+        which will each have their own dictionary entry
+    :type segment_length: str
+    :param max_segments: The max number of segments we can have. If the stringified
+        name map is too long, we just return a length entry with a value
+        of -1 to indicate that the map was too long.
+    :type max_segments: str
+    :return: The converted name map.
+    :rtype: Dict[str, Any]
+    """
+    name_map_string = json.dumps(name_map)
+    num_segments = math.ceil(len(name_map_string) / segment_length)
+    # Property map is somehow still too long to encode within the space
+    # we allow, so give up, but make sure the service knows we gave up
+    if num_segments > max_segments:
+        return {EvaluationRunProperties.NAME_MAP_LENGTH: -1}
+
+    result: Dict[str, Any] = {EvaluationRunProperties.NAME_MAP_LENGTH: num_segments}
+    segments_list = _wrap(name_map_string, segment_length)
+    for i in range(0, num_segments):
+        segment_key = f"{EvaluationRunProperties.NAME_MAP}_{i}"
+        result[segment_key] = segments_list[i]
+    return result
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True, dtype=object)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename, dtype=str)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
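
The segmentation scheme added here is: JSON-encode the name map, chunk the string into fixed-size segments with `_wrap`, and emit one run-history property per segment plus a segment count, or a count of -1 when the map exceeds the budget. A sketch with plain string keys standing in for the `EvaluationRunProperties` constants (their actual values are not shown in this hunk):

    import json
    import math

    def to_property_entries(name_map, segment_length=950, max_segments=10):
        name_map_string = json.dumps(name_map)
        num_segments = math.ceil(len(name_map_string) / segment_length)
        if num_segments > max_segments:
            return {"name_map_length": -1}  # too long: tell the service we gave up
        result = {"name_map_length": num_segments}
        for i in range(num_segments):
            result[f"name_map_{i}"] = name_map_string[i * segment_length : (i + 1) * segment_length]
        return result

    entries = to_property_entries({"outputs.relevance.relevance": "relevance"}, segment_length=20)
    print(entries["name_map_length"], entries["name_map_0"])  # -> 3 segments

The `DataLoaderFactory` at the end of the hunk follows the same spirit: pick a loader by file extension, falling back to JSONL for backward compatibility.
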
--- /dev/null
+++ b/azure/ai/evaluation/_evaluator_definition.py
@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance
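
A hypothetical round-trip through the serialization helpers above, assuming the private module path from the file list (`azure/ai/evaluation/_evaluator_definition.py`); the subclass, metric name, and values are illustrative only:

    from azure.ai.evaluation._evaluator_definition import (
        EvaluatorDefinition,
        EvaluatorMetric,
    )

    class PromptEvaluatorDefinition(EvaluatorDefinition):
        def __init__(self):
            super().__init__()
            self.type = "prompt"  # hypothetical type tag
            self.metrics = {
                "relevance": EvaluatorMetric(type="ordinal", min_value=1.0, max_value=5.0)
            }

    payload = PromptEvaluatorDefinition().to_dict()
    restored = PromptEvaluatorDefinition.from_dict(payload)
    assert restored.metrics["relevance"].max_value == 5.0
    # Note: from_dict re-runs __init__ and never reads "type" back from the payload,
    # so the type tag comes from the subclass, not the serialized data.
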