azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (277) hide show
  1. azure/ai/evaluation/__init__.py +85 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +147 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +87 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  155. azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  156. azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  157. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  158. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  159. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  160. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  161. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  162. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  163. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  165. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
  264. azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,20 @@
1
+ # ------------------------------------
2
+ # Copyright (c) Microsoft Corporation.
3
+ # Licensed under the MIT License.
4
+ # ------------------------------------
5
+ """Customize generated code here.
6
+
7
+ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
8
+ """
9
+ from typing import List
10
+
11
+ __all__: List[str] = [] # Add all objects you want publicly available to users at this package level
12
+
13
+
14
+ def patch_sdk():
15
+ """Do not remove from this file.
16
+
17
+ `patch_sdk` is a last resort escape hatch that allows you to do customizations
18
+ you can't accomplish using the techniques described in
19
+ https://aka.ms/azsdk/python/dpcodegen/python/customize
20
+ """
@@ -0,0 +1 @@
1
+ # Marker file for PEP 561.
@@ -1,17 +1,19 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
-
4
+ import os
5
+ import posixpath
5
6
  import re
6
7
  import math
7
8
  import threading
8
- from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
9
+ from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
9
10
 
10
11
  import nltk
11
- from typing_extensions import NotRequired, Required, TypeGuard
12
- from promptflow.core._errors import MissingRequiredPackage
12
+ from azure.storage.blob import ContainerClient
13
+ from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
14
+ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
13
15
  from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
14
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
16
+ from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
15
17
  from azure.ai.evaluation._model_configurations import (
16
18
  AzureAIProject,
17
19
  AzureOpenAIModelConfiguration,
@@ -19,34 +21,122 @@ from azure.ai.evaluation._model_configurations import (
19
21
  )
20
22
 
21
23
  from . import constants
24
+ from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
22
25
 
23
26
  _nltk_data_download_lock = threading.Lock()
24
27
 
25
28
  T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
26
29
 
27
30
 
28
- def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
31
+ def get_harm_severity_level(harm_score: Union[float, int], evaluator: Optional[str] = None) -> Union[str, float]:
29
32
  """Generate harm severity level based on harm score.
30
33
 
31
34
  :param harm_score: The harm score to be evaluated.
32
- :type harm_score: int
35
+ :type harm_score: Union[float, int]
36
+ :param evaluator: Optional evaluator name to determine scoring pattern. If not provided, uses default 0-7 scale.
37
+ :type evaluator: Optional[str]
33
38
  :return: The harm severity level. If harm score is None or numpy.nan, returns numpy.nan.
34
- :rtype: str
39
+ :rtype: Union[str, float]
35
40
  """
36
- HARM_SEVERITY_LEVEL_MAPPING = {
37
- constants.HarmSeverityLevel.VeryLow: [0, 1],
38
- constants.HarmSeverityLevel.Low: [2, 3],
39
- constants.HarmSeverityLevel.Medium: [4, 5],
40
- constants.HarmSeverityLevel.High: [6, 7],
41
- }
42
- if math.isnan(harm_score) or harm_score is None:
41
+ if harm_score is None or (isinstance(harm_score, float) and math.isnan(harm_score)):
42
+ return math.nan
43
+
44
+ # Get the scoring pattern for this evaluator
45
+ pattern = get_evaluator_scoring_pattern(evaluator)
46
+ config = SCORING_PATTERN_CONFIG.get(pattern)
47
+
48
+ if not config:
49
+ # Fallback to default 0-7 mapping
50
+ HARM_SEVERITY_LEVEL_MAPPING = {
51
+ constants.HarmSeverityLevel.VeryLow: [0, 1],
52
+ constants.HarmSeverityLevel.Low: [2, 3],
53
+ constants.HarmSeverityLevel.Medium: [4, 5],
54
+ constants.HarmSeverityLevel.High: [6, 7],
55
+ }
56
+ for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
57
+ if harm_score_range[0] <= harm_score <= harm_score_range[1]:
58
+ return harm_level.value
43
59
  return math.nan
44
- for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
45
- if harm_score_range[0] <= harm_score <= harm_score_range[1]:
60
+
61
+ # Use pattern-specific severity mapping
62
+ severity_mapping = config.get("severity_mapping", {})
63
+ for harm_level, score_range in severity_mapping.items():
64
+ if score_range[0] <= harm_score <= score_range[-1]:
46
65
  return harm_level.value
66
+
47
67
  return math.nan
48
68
 
49
69
 
70
+ def get_evaluator_scoring_pattern(evaluator: Optional[str]) -> EvaluatorScoringPattern:
71
+ """Get the scoring pattern for a given evaluator.
72
+
73
+ :param evaluator: The name of the evaluator.
74
+ :type evaluator: Optional[str]
75
+ :return: The scoring pattern for the evaluator, defaults to SCALE_0_7 if not found.
76
+ :rtype: EvaluatorScoringPattern
77
+ """
78
+ if evaluator is None:
79
+ return EvaluatorScoringPattern.SCALE_0_7
80
+
81
+ # Normalize evaluator name (remove underscores, lowercase)
82
+ normalized_name = evaluator.lower().replace("_", "")
83
+
84
+ # Try exact match first
85
+ if evaluator in EVALUATOR_SCORING_PATTERNS:
86
+ return EVALUATOR_SCORING_PATTERNS[evaluator]
87
+
88
+ # Try normalized match
89
+ for key, pattern in EVALUATOR_SCORING_PATTERNS.items():
90
+ if key.replace("_", "").lower() == normalized_name:
91
+ return pattern
92
+
93
+ # Default to 0-7 scale for unknown evaluators
94
+ return EvaluatorScoringPattern.SCALE_0_7
95
+
96
+
97
+ def get_default_threshold_for_evaluator(evaluator: str) -> int:
98
+ """Get the default attack success threshold for a given evaluator.
99
+
100
+ :param evaluator: The name of the evaluator.
101
+ :type evaluator: str
102
+ :return: The default threshold value.
103
+ :rtype: int
104
+ """
105
+ pattern = get_evaluator_scoring_pattern(evaluator)
106
+ config = SCORING_PATTERN_CONFIG.get(pattern, {})
107
+ return config.get("default_threshold", 3)
108
+
109
+
110
+ def convert_binary_to_numeric(value: Union[str, bool, int]) -> int:
111
+ """Convert binary evaluator outputs to numeric scores.
112
+
113
+ :param value: The binary value ("safe"/"unsafe", "true"/"false", True/False, 0/1).
114
+ :type value: Union[str, bool, int]
115
+ :return: 0 for safe/true, 1 for unsafe/false.
116
+ :rtype: int
117
+ """
118
+ if isinstance(value, bool):
119
+ return 0 if value else 1
120
+
121
+ if isinstance(value, int):
122
+ return value
123
+
124
+ if isinstance(value, str):
125
+ value_lower = value.lower().strip()
126
+ # For "safe"/"unsafe" pattern
127
+ if value_lower == "safe":
128
+ return 0
129
+ if value_lower == "unsafe":
130
+ return 1
131
+ # For "true"/"false" pattern
132
+ if value_lower == "true":
133
+ return 0
134
+ if value_lower == "false":
135
+ return 1
136
+
137
+ raise ValueError(f"Unable to convert value '{value}' to numeric score")
138
+
139
+
50
140
  def ensure_nltk_data_downloaded():
51
141
  """Download NLTK data packages if not already downloaded."""
52
142
  nltk_data = [
@@ -125,9 +215,24 @@ def construct_prompty_model_config(
125
215
  return prompty_model_config
126
216
 
127
217
 
218
+ def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
219
+ """Check if the Azure AI project is an OneDP project.
220
+
221
+ :param azure_ai_project: The scope of the Azure AI project.
222
+ :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
223
+ :return: True if the Azure AI project is an OneDP project, False otherwise.
224
+ :rtype: bool
225
+ """
226
+ return isinstance(azure_ai_project, str)
227
+
228
+
128
229
  def validate_azure_ai_project(o: object) -> AzureAIProject:
129
230
  fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
130
231
 
232
+ # TODO : Add regex check for malformed project uri
233
+ if is_onedp_project(o):
234
+ return o
235
+
131
236
  if not isinstance(o, dict):
132
237
  msg = "The 'azure_ai_project' parameter must be a dictionary."
133
238
  raise EvaluationException(
@@ -275,7 +380,27 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
275
380
  return cast(T_TypedDict, o)
276
381
 
277
382
 
278
- def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
383
+ def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
384
+ """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
385
+
386
+ :param score: The score to check.
387
+ :type score: Union[str, float]
388
+ :param min_score: The minimum score. Default is 1.
389
+ :type min_score: int
390
+ :param max_score: The maximum score. Default is 5.
391
+ :type max_score: int
392
+ :return: True if the score is valid, False otherwise.
393
+ :rtype: bool
394
+ """
395
+ try:
396
+ numeric_score = float(score)
397
+ except (ValueError, TypeError):
398
+ return False
399
+
400
+ return min_score <= numeric_score <= max_score
401
+
402
+
403
+ def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
279
404
  """Parse the output of prompt-based quality evaluators that return a score and reason.
280
405
 
281
406
  Current supported evaluators:
@@ -284,6 +409,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
284
409
  - Retrieval
285
410
  - Groundedness
286
411
  - Coherence
412
+ - ResponseCompleteness
413
+ - TaskAdherence
287
414
 
288
415
  :param llm_output: The output of the prompt-based quality evaluator.
289
416
  :type llm_output: str
@@ -294,7 +421,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
294
421
  reason = ""
295
422
  if llm_output:
296
423
  try:
297
- score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
424
+ score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
298
425
  reason_pattern = r"<S1>(.*?)</S1>"
299
426
  score_match = re.findall(score_pattern, llm_output, re.DOTALL)
300
427
  reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
@@ -366,7 +493,7 @@ def validate_conversation(conversation):
366
493
  if not isinstance(messages, list):
367
494
  raise_exception(
368
495
  "'messages' parameter must be a JSON-compatible list of chat messages",
369
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
496
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
370
497
  )
371
498
  expected_roles = {"user", "assistant", "system"}
372
499
  image_found = False
@@ -393,7 +520,7 @@ def validate_conversation(conversation):
393
520
  ):
394
521
  raise_exception(
395
522
  f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
396
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
523
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
397
524
  )
398
525
  if isinstance(message, AssistantMessage):
399
526
  assistant_message_count += 1
@@ -407,7 +534,7 @@ def validate_conversation(conversation):
407
534
  if message.get("role") not in expected_roles:
408
535
  raise_exception(
409
536
  f"Invalid role provided: {message.get('role')}. Message number: {num}",
410
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
537
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
411
538
  )
412
539
  if message.get("role") == "assistant":
413
540
  assistant_message_count += 1
@@ -417,7 +544,7 @@ def validate_conversation(conversation):
417
544
  if not isinstance(content, (str, list)):
418
545
  raise_exception(
419
546
  f"Content in each turn must be a string or array. Message number: {num}",
420
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
547
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
421
548
  )
422
549
  if isinstance(content, list):
423
550
  if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
@@ -425,21 +552,372 @@ def validate_conversation(conversation):
425
552
  if not image_found:
426
553
  raise_exception(
427
554
  "Message needs to have multi-modal input like images.",
428
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
555
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
429
556
  )
430
557
  if assistant_message_count == 0:
431
558
  raise_exception(
432
559
  "Assistant role required in one of the messages.",
433
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
560
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
434
561
  )
435
562
  if user_message_count == 0:
436
563
  raise_exception(
437
564
  "User role required in one of the messages.",
438
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
565
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
439
566
  )
440
567
  if assistant_message_count > 1:
441
568
  raise_exception(
442
569
  "Evaluators for multimodal conversations only support single turn. "
443
570
  "User and assistant role expected as the only role in each message.",
444
- ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
571
+ ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
572
+ )
573
+
574
+
575
+ def _extract_text_from_content(content):
576
+ text = []
577
+ for msg in content:
578
+ if "text" in msg:
579
+ text.append(msg["text"])
580
+ return text
581
+
582
+
583
+ def filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
584
+ """Filters the tool definitions to only include those that were actually used in the messages lists."""
585
+ try:
586
+ used_tool_names = set()
587
+ any_tools_used = False
588
+ for msgs in msgs_lists:
589
+ for msg in msgs:
590
+ if msg.get("role") == "assistant" and "content" in msg:
591
+ for content in msg.get("content", []):
592
+ if content.get("type") == "tool_call":
593
+ any_tools_used = True
594
+ if "tool_call" in content and "function" in content["tool_call"]:
595
+ used_tool_names.add(content["tool_call"]["function"])
596
+ elif "name" in content:
597
+ used_tool_names.add(content["name"])
598
+
599
+ filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
600
+ if any_tools_used and not filtered_tools:
601
+ if logger:
602
+ logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
603
+ filtered_tools = tool_definitions
604
+
605
+ return filtered_tools
606
+ except Exception as e:
607
+ if logger:
608
+ logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
609
+ return tool_definitions
610
+
611
+
612
+ def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
613
+ all_user_queries, all_agent_responses = [], []
614
+ cur_user_query, cur_agent_response = [], []
615
+ system_message = None
616
+
617
+ for msg in query:
618
+ role = msg.get("role")
619
+ if not role:
620
+ continue
621
+ if include_system_messages and role == "system":
622
+ system_message = msg.get("content", "")
623
+
624
+ elif role == "user" and "content" in msg:
625
+ if cur_agent_response:
626
+ formatted_agent_response = _get_agent_response(
627
+ cur_agent_response, include_tool_messages=include_tool_messages
628
+ )
629
+ all_agent_responses.append([formatted_agent_response])
630
+ cur_agent_response = []
631
+ text_in_msg = _extract_text_from_content(msg["content"])
632
+ if text_in_msg:
633
+ cur_user_query.append(text_in_msg)
634
+
635
+ elif role in ("assistant", "tool"):
636
+ if cur_user_query:
637
+ all_user_queries.append(cur_user_query)
638
+ cur_user_query = []
639
+ cur_agent_response.append(msg)
640
+
641
+ if cur_user_query:
642
+ all_user_queries.append(cur_user_query)
643
+ if cur_agent_response:
644
+ formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
645
+ all_agent_responses.append([formatted_agent_response])
646
+
647
+ if len(all_user_queries) != len(all_agent_responses) + 1:
648
+ raise EvaluationException(
649
+ message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
650
+ internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
651
+ target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
652
+ category=ErrorCategory.INVALID_VALUE,
653
+ blame=ErrorBlame.USER_ERROR,
654
+ )
655
+
656
+ result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
657
+ if include_system_messages and system_message:
658
+ result["system_message"] = system_message
659
+ return result
660
+
661
+
662
+ def _pretty_format_conversation_history(conversation_history):
663
+ """Formats the conversation history for better readability."""
664
+ formatted_history = ""
665
+ if conversation_history.get("system_message"):
666
+ formatted_history += "SYSTEM_PROMPT:\n"
667
+ formatted_history += " " + conversation_history["system_message"] + "\n\n"
668
+ for i, (user_query, agent_response) in enumerate(
669
+ zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
670
+ ):
671
+ formatted_history += f"User turn {i+1}:\n"
672
+ for msg in user_query:
673
+ if isinstance(msg, list):
674
+ for submsg in msg:
675
+ formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
676
+ else:
677
+ formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
678
+ formatted_history += "\n"
679
+ if agent_response:
680
+ formatted_history += f"Agent turn {i+1}:\n"
681
+ for msg in agent_response:
682
+ if isinstance(msg, list):
683
+ for submsg in msg:
684
+ formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
685
+ else:
686
+ formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
687
+ formatted_history += "\n"
688
+ return formatted_history
689
+
690
+
691
+ def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
692
+ """Reformats the conversation history to a more compact representation."""
693
+ try:
694
+ conversation_history = _get_conversation_history(
695
+ query,
696
+ include_system_messages=include_system_messages,
697
+ include_tool_messages=include_tool_messages,
698
+ )
699
+ return _pretty_format_conversation_history(conversation_history)
700
+ except Exception as e:
701
+ # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
702
+ # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
703
+ # From our tests the negative impact on IntentResolution is:
704
+ # Higher intra model variance (0.142 vs 0.046)
705
+ # Higher inter model variance (0.345 vs 0.607)
706
+ # Lower percentage of mode in Likert scale (73.4% vs 75.4%)
707
+ # Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
708
+ if logger:
709
+ logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
710
+ return query
711
+
712
+
713
+ def _get_agent_response(agent_response_msgs, include_tool_messages=False):
714
+ """Extracts formatted agent response including text, and optionally tool calls/results."""
715
+ agent_response_text = []
716
+ tool_results = {}
717
+
718
+ # First pass: collect tool results
719
+ if include_tool_messages:
720
+ for msg in agent_response_msgs:
721
+ if msg.get("role") == "tool" and "tool_call_id" in msg:
722
+ for content in msg.get("content", []):
723
+ if content.get("type") == "tool_result":
724
+ result = content.get("tool_result")
725
+ tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
726
+
727
+ # Second pass: parse assistant messages and tool calls
728
+ for msg in agent_response_msgs:
729
+ if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
730
+ text = _extract_text_from_content(msg["content"])
731
+ if text:
732
+ agent_response_text.extend(text)
733
+ if include_tool_messages:
734
+ for content in msg.get("content", []):
735
+ # Todo: Verify if this is the correct way to handle tool calls
736
+ if content.get("type") == "tool_call":
737
+ if "tool_call" in content and "function" in content.get("tool_call", {}):
738
+ tc = content.get("tool_call", {})
739
+ func_name = tc.get("function", {}).get("name", "")
740
+ args = tc.get("function", {}).get("arguments", {})
741
+ tool_call_id = tc.get("id")
742
+ else:
743
+ tool_call_id = content.get("tool_call_id")
744
+ func_name = content.get("name", "")
745
+ args = content.get("arguments", {})
746
+ args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
747
+ call_line = f"[TOOL_CALL] {func_name}({args_str})"
748
+ agent_response_text.append(call_line)
749
+ if tool_call_id in tool_results:
750
+ agent_response_text.append(tool_results[tool_call_id])
751
+
752
+ return agent_response_text
753
+
754
+
755
+ def reformat_agent_response(response, logger=None, include_tool_messages=False):
756
+ try:
757
+ if response is None or response == []:
758
+ return ""
759
+ agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
760
+ if agent_response == []:
761
+ # If no message could be extracted, likely the format changed, fallback to the original response in that case
762
+ if logger:
763
+ logger.warning(
764
+ f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
765
+ )
766
+ return response
767
+ return "\n".join(agent_response)
768
+ except:
769
+ # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
770
+ # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
771
+ if logger:
772
+ logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
773
+ return response
774
+
775
+
776
+ def reformat_tool_definitions(tool_definitions, logger=None):
777
+ try:
778
+ output_lines = ["TOOL_DEFINITIONS:"]
779
+ for tool in tool_definitions:
780
+ name = tool.get("name", "unnamed_tool")
781
+ desc = tool.get("description", "").strip()
782
+ params = tool.get("parameters", {}).get("properties", {})
783
+ param_names = ", ".join(params.keys()) if params else "no parameters"
784
+ output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
785
+ return "\n".join(output_lines)
786
+ except Exception as e:
787
+ # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
788
+ # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
789
+ if logger:
790
+ logger.warning(
791
+ f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
792
+ )
793
+ return tool_definitions
794
+
795
+
796
+ def simplify_messages(messages, drop_system=True, drop_tool_calls=False, logger=None):
797
+ """
798
+ Simplify a list of conversation messages by keeping only role and content.
799
+ Optionally filter out system messages and/or tool calls.
800
+
801
+ :param messages: List of message dicts (e.g., from query or response)
802
+ :param drop_system: If True, remove system role messages
803
+ :param drop_tool_calls: If True, remove tool_call items from assistant content
804
+ :return: New simplified list of messages
805
+ """
806
+ if isinstance(messages, str):
807
+ return messages
808
+ try:
809
+ # Validate input is a list
810
+ if not isinstance(messages, list):
811
+ return messages
812
+
813
+ simplified_msgs = []
814
+ for msg in messages:
815
+ # Ensure msg is a dict
816
+ if not isinstance(msg, dict):
817
+ simplified_msgs.append(msg)
818
+ continue
819
+
820
+ role = msg.get("role")
821
+ content = msg.get("content", [])
822
+
823
+ # Drop system message (if should)
824
+ if drop_system and role == "system":
825
+ continue
826
+
827
+ # Simplify user messages
828
+ if role == "user":
829
+ simplified_msg = {
830
+ "role": role,
831
+ "content": _extract_text_from_content(content),
832
+ }
833
+ simplified_msgs.append(simplified_msg)
834
+ continue
835
+
836
+ # Drop tool results (if should)
837
+ if drop_tool_calls and role == "tool":
838
+ continue
839
+
840
+ # Simplify assistant messages
841
+ if role == "assistant":
842
+ simplified_content = _extract_text_from_content(content)
843
+ # Check if message has content
844
+ if simplified_content:
845
+ simplified_msg = {"role": role, "content": simplified_content}
846
+ simplified_msgs.append(simplified_msg)
847
+ continue
848
+
849
+ # Drop tool calls (if should)
850
+ if drop_tool_calls and any(c.get("type") == "tool_call" for c in content if isinstance(c, dict)):
851
+ continue
852
+
853
+ # If we reach here, it means we want to keep the message
854
+ simplified_msgs.append(msg)
855
+
856
+ return simplified_msgs
857
+
858
+ except Exception as ex:
859
+ if logger:
860
+ logger.debug(f"Error simplifying messages: {str(ex)}. Returning original messages.")
861
+ return messages
862
+
863
+
864
+ def upload(path: str, container_client: ContainerClient, logger=None):
865
+ """Upload files or directories to Azure Blob Storage using a container client.
866
+
867
+ This function uploads a file or all files in a directory (recursively) to Azure Blob Storage.
868
+ When uploading a directory, the relative path structure is preserved in the blob container.
869
+
870
+ :param path: The local path to a file or directory to upload
871
+ :type path: str
872
+ :param container_client: The Azure Blob Container client to use for uploading
873
+ :type container_client: azure.storage.blob.ContainerClient
874
+ :param logger: Optional logger for debug output, defaults to None
875
+ :type logger: logging.Logger, optional
876
+ :raises EvaluationException: If the path doesn't exist or errors occur during upload
877
+ """
878
+
879
+ if not os.path.isdir(path) and not os.path.isfile(path):
880
+ raise EvaluationException(
881
+ message=f"Path '{path}' is not a directory or a file",
882
+ internal_message=f"Path '{path}' is not a directory or a file",
883
+ target=ErrorTarget.RAI_CLIENT,
884
+ category=ErrorCategory.INVALID_VALUE,
885
+ blame=ErrorBlame.SYSTEM_ERROR,
886
+ )
887
+
888
+ remote_paths = []
889
+ local_paths = []
890
+
891
+ if os.path.isdir(path):
892
+ for root, _, filenames in os.walk(path):
893
+ upload_path = ""
894
+ if root != path:
895
+ rel_path = os.path.relpath(root, path)
896
+ upload_path = posixpath.join(rel_path)
897
+ for f in filenames:
898
+ remote_file_path = posixpath.join(upload_path, f)
899
+ remote_paths.append(remote_file_path)
900
+ local_file_path = os.path.join(root, f)
901
+ local_paths.append(local_file_path)
902
+
903
+ if os.path.isfile(path):
904
+ remote_paths = [os.path.basename(path)]
905
+ local_paths = [path]
906
+
907
+ try:
908
+ # Open the file in binary read mode
909
+ for local, remote in zip(local_paths, remote_paths):
910
+ with open(local, "rb") as data:
911
+ # Upload the file to Azure Blob Storage
912
+ container_client.upload_blob(data=data, name=remote)
913
+ if logger:
914
+ logger.debug(f"File '{local}' uploaded successfully")
915
+
916
+ except Exception as e:
917
+ raise EvaluationException(
918
+ message=f"Error uploading file: {e}",
919
+ internal_message=f"Error uploading file: {e}",
920
+ target=ErrorTarget.RAI_CLIENT,
921
+ category=ErrorCategory.UPLOAD_ERROR,
922
+ blame=ErrorBlame.SYSTEM_ERROR,
445
923
  )