azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1286 -739
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  114. azure/ai/evaluation/red_team/_utils/constants.py +2 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
  132. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -718,6 +718,7 @@ class AIAgentConverter:
718
718
 
719
719
  return AIAgentConverter._convert_from_conversation(data, run_id)
720
720
 
721
+
721
722
  @experimental
722
723
  class AIAgentDataRetriever:
723
724
  # Maximum items to fetch in a single AI Services API call (imposed by the service).
@@ -748,6 +749,7 @@ class AIAgentDataRetriever:
748
749
  def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
749
750
  pass
750
751
 
752
+
751
753
  @experimental
752
754
  class LegacyAgentDataRetriever(AIAgentDataRetriever):
753
755
 
@@ -768,7 +770,8 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
768
770
  after = None
769
771
  while has_more:
770
772
  messages = self.project_client.agents.list_messages(
771
- thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after)
773
+ thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
774
+ )
772
775
  has_more = messages.has_more
773
776
  after = messages.last_id
774
777
  if messages.data:
@@ -812,6 +815,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
812
815
  def _get_run(self, thread_id: str, run_id: str):
813
816
  return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
814
817
 
818
+
815
819
  @experimental
816
820
  class FDPAgentDataRetriever(AIAgentDataRetriever):
817
821
 
@@ -833,16 +837,13 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
833
837
 
834
838
  def _list_run_steps_chronological(self, thread_id: str, run_id: str):
835
839
 
836
- return self.project_client.agents.run_steps.list(
837
- thread_id=thread_id,
838
- run_id=run_id,
839
- limit=self._AI_SERVICES_API_MAX_LIMIT,
840
- order="asc"
841
- )
840
+ return self.project_client.agents.run_steps.list(
841
+ thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
842
+ )
842
843
 
843
844
  def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
844
845
  runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
845
846
  return [run.id for run in runs]
846
847
 
847
848
  def _get_run(self, thread_id: str, run_id: str):
848
- return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
849
+ return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
@@ -20,6 +20,7 @@ _SYSTEM = "system"
20
20
  _USER = "user"
21
21
  _AGENT = "assistant"
22
22
  _TOOL = "tool"
23
+ _DEVELOPER = "developer" # part of the semantic kernel
23
24
 
24
25
  # Constant definitions for what tool details include.
25
26
  _TOOL_CALL = "tool_call"
@@ -81,6 +82,7 @@ _BUILT_IN_PARAMS = {
81
82
  },
82
83
  }
83
84
 
85
+
84
86
  class Message(BaseModel):
85
87
  """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
86
88
  to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
@@ -123,6 +125,17 @@ class UserMessage(Message):
123
125
  role: str = _USER
124
126
 
125
127
 
128
class SKDeveloperMessage(Message):
    """A conversation message sent under the ``developer`` role.

    Used when converting Semantic Kernel (SK) agent conversations.

    :param role: The role of the message sender; defaults to the ``_DEVELOPER`` constant.
    :type role: str
    """

    role: str = _DEVELOPER
137
+
138
+
126
139
  class ToolMessage(Message):
127
140
  """Represents a tool message in a conversation with agents, assistants, and tools.
128
141
 
@@ -139,6 +152,19 @@ class ToolMessage(Message):
139
152
  tool_call_id: Optional[str] = None
140
153
 
141
154
 
155
class SKToolMessage(Message):
    """A tool message produced while converting a Semantic Kernel (SK) agent conversation.

    :param role: The role of the message sender; defaults to the ``_TOOL`` constant.
    :type role: str
    :param tool_call_id: The ID of the tool call this message belongs to, when one is known.
    :type tool_call_id: Optional[str]
    """

    role: str = _TOOL
    tool_call_id: Optional[str] = None
166
+
167
+
142
168
  class AssistantMessage(Message):
143
169
  """Represents an assistant message.
144
170
 
@@ -152,6 +178,26 @@ class AssistantMessage(Message):
152
178
  role: str = _AGENT
153
179
 
154
180
 
181
class SKAssistantMessage(Message):
    """Represents an assistant message in the context of a Semantic Kernel (SK) agent.

    :param role: The role of the message sender; defaults to the ``_AGENT`` constant.
    :type role: str
    """

    # NOTE: this class used to be declared twice, verbatim, back to back. The second
    # declaration only rebound the name to an identical class, so one declaration suffices.
    role: str = _AGENT
199
+
200
+
155
201
  class ToolDefinition(BaseModel):
156
202
  """Represents a tool definition that will be used in the agent.
157
203
 
@@ -0,0 +1,495 @@
1
+ import json
2
+ from datetime import datetime, timezone
3
+ from typing import Any, Dict, List, Tuple, Optional
4
+
5
+ from azure.ai.evaluation._common._experimental import experimental
6
+
7
+ from semantic_kernel.contents import (
8
+ AuthorRole,
9
+ TextContent,
10
+ FunctionCallContent,
11
+ FunctionResultContent,
12
+ )
13
+ from semantic_kernel.contents.chat_message_content import ChatMessageContent
14
+
15
+ from semantic_kernel.agents import (
16
+ ChatCompletionAgent,
17
+ ChatHistoryAgentThread,
18
+ )
19
+
20
+ from ._models import (
21
+ Message,
22
+ SystemMessage,
23
+ UserMessage,
24
+ SKToolMessage,
25
+ SKAssistantMessage,
26
+ ToolDefinition,
27
+ ToolCall,
28
+ EvaluatorData,
29
+ SKDeveloperMessage,
30
+ )
31
+
32
+
33
@experimental
class SKAgentConverter:
    """Convert Semantic Kernel (SK) agent conversations into evaluator input data.

    The messages of a ``ChatHistoryAgentThread`` are split into turns. Each turn is expressed as a
    ``(query, response)`` pair of converter ``Message`` models and is serialized together with the
    ``ToolDefinition`` list derived from the agent's kernel plugins.
    """

    def __init__(self):
        """Check that the ``semantic_kernel`` package can be imported.

        :raises ImportError: If the ``semantic_kernel`` package is not installed.
        """
        try:
            import semantic_kernel  # noqa: F401  # pylint: disable=unused-import
        except ImportError as e:
            raise ImportError(
                "semantic_kernel package is not installed. Please install it to use SKAgentConverter."
            ) from e

    @staticmethod
    def _transform_tool_definitions(
        tool_list: List[Dict[str, Any]],
    ) -> List[ToolDefinition]:
        """Convert verbose tool definition dicts into ``ToolDefinition`` models.

        Every input dict must contain ``fully_qualified_name``; ``description`` and ``parameters``
        are optional. Every parameter entry must contain ``type_``; its ``name`` and
        ``description`` are read with ``dict.get``.

        :param tool_list: List of tool definitions to transform.
        :type tool_list: List[Dict[str, Any]]
        :return: Transformed list of tool definitions.
        :rtype: List[ToolDefinition]
        :raises KeyError: If a tool lacks ``fully_qualified_name`` or a parameter lacks ``type_``.
        """
        # TODO: Add required and default values when also supported by Foundry's converter

        final_tools: List[ToolDefinition] = []

        for tool in tool_list:
            filtered_tool: Dict[str, Any] = {
                "name": tool["fully_qualified_name"],
                "description": tool.get("description") or "No description",
                "type": "function",  # TODO: hardcoded for now.
                "parameters": {
                    "type": "object",  # Is this always the case?
                    "properties": {},  # Filled in below
                },
            }
            properties = filtered_tool["parameters"]["properties"]

            # ``or []`` also covers a "parameters" key that is present but set to None.
            for param in tool.get("parameters") or []:
                properties[param.get("name")] = {
                    "type": param["type_"],
                    "description": param.get("description") or "No description",
                }

            final_tools.append(ToolDefinition(**filtered_tool))

        return final_tools

    @staticmethod
    def _get_tool_definitions(agent: ChatCompletionAgent) -> List[Dict[str, Any]]:
        """Collect the metadata of every function exposed by the agent's kernel plugins.

        :param agent: The ChatCompletionAgent from which to retrieve tool definitions.
        :type agent: ChatCompletionAgent
        :return: One dict per function: its dumped metadata plus a ``fully_qualified_name`` key.
        :rtype: List[Dict[str, Any]]
        """
        functions: List[Dict[str, Any]] = []
        for plugin in agent.kernel.plugins.values():
            for function in plugin.get_functions_metadata():
                # Serialize the metadata to a dictionary and add the qualified name, which
                # _transform_tool_definitions uses as the tool name.
                function_dict = function.model_dump()
                function_dict["fully_qualified_name"] = function.fully_qualified_name
                functions.append(function_dict)

        return functions

    @staticmethod
    def _extract_function_tool_definitions(
        agent: ChatCompletionAgent,
    ) -> List[ToolDefinition]:
        """Get the agent's tool definitions and transform them into ``ToolDefinition`` models.

        :param agent: The ChatCompletionAgent from which to retrieve tool definitions.
        :type agent: ChatCompletionAgent
        :return: The transformed tool definitions.
        :rtype: List[ToolDefinition]
        """
        tool_definitions = SKAgentConverter._get_tool_definitions(agent)
        return SKAgentConverter._transform_tool_definitions(tool_definitions)

    @staticmethod
    def _is_output_role(role):
        """Return True when ``role`` is the assistant or tool role, i.e. agent output."""
        return role in (AuthorRole.ASSISTANT, AuthorRole.TOOL)

    @staticmethod
    async def _get_messages_from_thread(
        thread: ChatHistoryAgentThread,
    ) -> List[ChatMessageContent]:
        """Read every message of a thread into a list.

        :param thread: The ChatHistoryAgentThread to get messages from.
        :type thread: ChatHistoryAgentThread
        :return: A list of ChatMessageContent objects.
        :rtype: List[ChatMessageContent]
        """
        return [msg async for msg in thread.get_messages()]

    @staticmethod
    async def _get_messages_from_thread_with_agent(
        thread: ChatHistoryAgentThread,
        agent: Optional[ChatCompletionAgent] = None,
    ) -> List[ChatMessageContent]:
        """Read a thread's messages, prefixed by the agent instructions as a system message.

        The system message is only added when ``agent`` is given and has non-empty instructions.

        :param thread: The ChatHistoryAgentThread to get messages from.
        :type thread: ChatHistoryAgentThread
        :param agent: The ChatCompletionAgent whose instructions should be included, if any.
        :type agent: Optional[ChatCompletionAgent]
        :return: A list of ChatMessageContent objects.
        :rtype: List[ChatMessageContent]
        """
        messages: List[ChatMessageContent] = []

        if agent and agent.instructions:
            messages.append(
                ChatMessageContent(
                    role=AuthorRole.SYSTEM,
                    items=[TextContent(text=agent.instructions)],
                )
            )

        messages.extend(await SKAgentConverter._get_messages_from_thread(thread))
        return messages

    @staticmethod
    async def _convert_thread_to_eval_schema(
        thread: ChatHistoryAgentThread,
        turn_index: int,
        agent: Optional[ChatCompletionAgent] = None,
    ) -> Tuple[List[Message], List[Message]]:
        """Convert one turn of a thread to the evaluation schema.

        :param thread: The ChatHistoryAgentThread containing the conversation history.
        :type thread: ChatHistoryAgentThread
        :param turn_index: The index of the turn in the conversation.
        :type turn_index: int
        :param agent: The ChatCompletionAgent being evaluated.
        :type agent: Optional[ChatCompletionAgent]
        :return: The ``(query, response)`` message lists of the requested turn.
        :rtype: Tuple[List[Message], List[Message]]
        :raises ValueError: If the conversation has no turn at ``turn_index``.
        """
        messages: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread_with_agent(
            thread=thread,
            agent=agent,
        )

        # Turn lookup and range validation live in one place.
        return SKAgentConverter._convert_messages_to_schema_new(messages, turn_index)

    @staticmethod
    def _extract_turns_from_messages(
        messages: List[ChatMessageContent],
        turn_index_to_stop: Optional[int] = None,
    ) -> List[Tuple[List[Message], List[Message]]]:
        """Group a flat message list into ``(query, response)`` turns.

        Consecutive messages are collected into alternating input blocks (any role other than
        assistant/tool) and output blocks (assistant/tool). Each output block closes a turn whose
        query is everything that preceded it: earlier queries, earlier responses and the input
        block just before it.

        :param messages: The conversation messages in order.
        :type messages: List[ChatMessageContent]
        :param turn_index_to_stop: When given, stop scanning once the turn with this index has
            been extracted.
        :type turn_index_to_stop: Optional[int]
        :return: The extracted turns in conversation order.
        :rtype: List[Tuple[List[Message], List[Message]]]
        """
        turns: List[Tuple[List[Message], List[Message]]] = []
        query: List[Message] = []

        queued_items: List[Message] = []
        is_queued_output = None  # None until the first message fixes the initial block kind

        for msg in messages:
            curr_items = SKAgentConverter._process_message_items(msg)
            curr_is_output = SKAgentConverter._is_output_role(msg.role)

            # First message, or still inside the same input/output block: keep queueing.
            # If the chat starts with an assistant/tool message, that block becomes its own turn.
            if is_queued_output is None or is_queued_output == curr_is_output:
                queued_items.extend(curr_items)
                is_queued_output = curr_is_output
                continue

            # Transition from input -> output: the queued input joins the running query.
            if curr_is_output:
                query.extend(queued_items)
                queued_items = curr_items
                is_queued_output = True
                continue

            # Transition from output -> input: end of a turn.
            response = list(queued_items)
            turns.append((query, response))
            # The next turn's query is the whole conversation so far. A new list is built so the
            # query stored in the turn above is not mutated later.
            query = query + response
            if turn_index_to_stop is not None and len(turns) > turn_index_to_stop:
                # The response was just flushed; clear the queue so the final flush below
                # cannot append the same response again as a bogus extra turn.
                queued_items = []
                break
            queued_items = curr_items
            is_queued_output = False

        # Flush a trailing assistant/tool block as the last turn.
        if queued_items and is_queued_output:
            turns.append((query, list(queued_items)))

        return turns

    @staticmethod
    def _convert_messages_to_schema_new(
        messages: List[ChatMessageContent], turn_index: int
    ) -> Tuple[List[Message], List[Message]]:
        """Convert messages to the ``(query, response)`` schema for a specific turn.

        :param messages: The conversation messages in order.
        :type messages: List[ChatMessageContent]
        :param turn_index: The index of the turn to return.
        :type turn_index: int
        :return: The ``(query, response)`` message lists of the requested turn.
        :rtype: Tuple[List[Message], List[Message]]
        :raises ValueError: If the conversation has no turn at ``turn_index``.
        """
        turns = SKAgentConverter._extract_turns_from_messages(messages, turn_index_to_stop=turn_index)
        if turn_index >= len(turns):
            raise ValueError(f"Turn {turn_index} not found. Only {len(turns)} turns exist.")
        return turns[turn_index]

    @staticmethod
    def _safe_json_loads(value: Any) -> Any:
        """Parse ``value`` as JSON when it is a string; return it unchanged otherwise.

        :param value: The value to parse.
        :type value: Any
        :return: The decoded object, or ``value`` itself if it is not a string or not valid JSON.
        :rtype: Any
        """
        if not isinstance(value, str):
            return value
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return value

    @staticmethod
    def _process_message_items(message: ChatMessageContent) -> List[Message]:
        """Convert every item of a message into a converter ``Message`` model.

        One model is produced per item. Text items become text content (a plain string for
        system messages, to match the other converters), function calls become ``tool_call``
        content and function results become ``tool_result`` content.

        :param message: The message whose items should be converted.
        :type message: ChatMessageContent
        :return: One converted message per item of ``message``.
        :rtype: List[Message]
        :raises TypeError: If an item is not text, a function call or a function result.
        :raises ValueError: If the message role is not one of the supported roles.
        """
        converted_messages: List[Message] = []
        for item in message.items:
            message_dict: Dict[str, Any] = {
                "role": message.role.value,
                "content": [],  # filled in below
            }
            if "created" in message.metadata:
                message_dict["createdAt"] = SKAgentConverter._convert_timestamp_to_iso(message.metadata["created"])

            if isinstance(item, TextContent):
                item_text = item.to_dict()["text"]
                if message.role == AuthorRole.SYSTEM:  # to match other converters
                    message_dict["content"] = item_text
                else:
                    message_dict["content"] = [{"type": "text", "text": item_text}]

            elif isinstance(item, FunctionCallContent):
                item_dict = item.to_dict()
                item_func = item_dict["function"]
                arguments = SKAgentConverter._safe_json_loads(item_func["arguments"])

                message_dict["content"].append(
                    {
                        "type": "tool_call",
                        "tool_call_id": item_dict.get("id", None),
                        "name": item_func["name"],
                        "arguments": arguments,
                    }
                )

            elif isinstance(item, FunctionResultContent):
                item_dict = item.to_dict()
                message_dict["tool_call_id"] = item_dict.get("tool_call_id", None)
                item_content = SKAgentConverter._safe_json_loads(item_dict["content"])

                message_dict["content"].append(
                    {
                        "type": "tool_result",
                        "tool_result": item_content,
                    }
                )

            else:
                # TypeError is still an Exception subclass, so broad handlers keep working.
                raise TypeError(f"Unexpected item type: {type(item)} in message: {message}")

            if message.role == AuthorRole.SYSTEM:
                convert_message = SystemMessage(**message_dict)
            elif message.role == AuthorRole.USER:
                convert_message = UserMessage(**message_dict)
            elif message.role == AuthorRole.DEVELOPER:
                convert_message = SKDeveloperMessage(**message_dict)
            elif message.role == AuthorRole.ASSISTANT:
                convert_message = SKAssistantMessage(**message_dict)
            elif message.role == AuthorRole.TOOL:
                convert_message = SKToolMessage(**message_dict)
            else:
                raise ValueError(f"Unknown role: {message.role}")

            converted_messages.append(convert_message)

        return converted_messages

    @staticmethod
    def is_turn_complete(message: ChatMessageContent) -> bool:
        """Report whether a message contains at least one text item.

        A message with text content is treated as completing a turn.

        :param message: The message object to check.
        :type message: ChatMessageContent
        :return: True if any item of the message is a ``TextContent``, False otherwise.
        :rtype: bool
        """
        return any(isinstance(item, TextContent) for item in message.items)

    @staticmethod
    def _convert_timestamp_to_iso(timestamp: float) -> str:
        """Format a POSIX timestamp as a UTC ISO 8601 string with a ``Z`` suffix.

        :param timestamp: The timestamp to convert.
        :type timestamp: float
        :return: The timestamp in ISO format.
        :rtype: str
        """
        created_dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
        return created_dt.isoformat().replace("+00:00", "Z")

    async def convert(
        self,
        thread: ChatHistoryAgentThread,
        agent: ChatCompletionAgent,
        turn_index: int,
    ) -> dict:
        """Convert one turn of an SK chat completion agent run to a format suitable for evaluation.

        :param thread: The ChatHistoryAgentThread containing the conversation history.
        :type thread: ChatHistoryAgentThread
        :param agent: The ChatCompletionAgent being evaluated.
        :type agent: ChatCompletionAgent
        :param turn_index: The index of the turn in the conversation.
        :type turn_index: int
        :return: The converted data in dictionary format.
        :rtype: dict
        :raises ValueError: If ``thread`` is falsy or the conversation has no turn at ``turn_index``.
        """
        tool_definitions: List[ToolDefinition] = SKAgentConverter._extract_function_tool_definitions(agent)

        if not thread:
            raise ValueError("Thread cannot be None")

        query, response = await SKAgentConverter._convert_thread_to_eval_schema(
            thread=thread,
            turn_index=turn_index,
            agent=agent,
        )

        result = EvaluatorData(
            query=query,
            response=response,
            tool_definitions=tool_definitions,
        )

        # Round-trip through JSON so the caller receives plain dicts and lists.
        return json.loads(result.to_json())

    async def prepare_evaluation_data(
        self,
        threads: List[ChatHistoryAgentThread],
        agent: ChatCompletionAgent,
        filename: Optional[str] = None,
    ) -> List[dict]:
        """Prepare evaluation data for a list of threads and optionally write it to a file.

        :param threads: List of ChatHistoryAgentThread objects. A single thread is also accepted
            and is treated as a one-element list.
        :type threads: List[ChatHistoryAgentThread]
        :param agent: The ChatCompletionAgent being evaluated.
        :type agent: ChatCompletionAgent
        :param filename: Optional file path to save evaluation data as JSONL.
        :type filename: Optional[str]
        :return: List of evaluation data dictionaries, one per turn across all threads.
        :rtype: List[dict]
        """
        if isinstance(threads, ChatHistoryAgentThread):
            threads = [threads]

        all_eval_data: List[dict] = []
        for thread in threads:
            all_eval_data.extend(await self._prepare_single_thread_evaluation_data(thread, agent))

        if filename:
            # JSONL: one JSON document per line.
            with open(filename, "w", encoding="utf-8") as f:
                f.writelines(json.dumps(item) + "\n" for item in all_eval_data)

        return all_eval_data

    async def _prepare_single_thread_evaluation_data(
        self,
        thread: ChatHistoryAgentThread,
        agent: ChatCompletionAgent,
    ) -> List[dict]:
        """Prepare evaluation data for every turn of a single thread.

        :param thread: A ChatHistoryAgentThread object.
        :type thread: ChatHistoryAgentThread
        :param agent: The ChatCompletionAgent being evaluated.
        :type agent: ChatCompletionAgent
        :return: A list of evaluation data dictionaries for the thread, one per turn.
        :rtype: List[dict]
        :raises ValueError: If ``thread`` is falsy.
        """
        tool_definitions: List[ToolDefinition] = self._extract_function_tool_definitions(agent)

        if not thread:
            raise ValueError("Thread cannot be None")

        messages: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread_with_agent(
            thread=thread,
            agent=agent,
        )

        thread_eval_data: List[dict] = []
        for query, response in SKAgentConverter._extract_turns_from_messages(messages):
            turn_eval_data = EvaluatorData(
                query=query,
                response=response,
                tool_definitions=tool_definitions,
            )
            thread_eval_data.append(json.loads(turn_eval_data.to_json()))

        return thread_eval_data

    @staticmethod
    async def _get_thread_turn_indices(thread: ChatHistoryAgentThread) -> List[int]:
        """Determine all turn indices available in a thread.

        :param thread: The ChatHistoryAgentThread to analyze.
        :type thread: ChatHistoryAgentThread
        :return: A list of valid turn indices (0-based); empty if the thread has no messages.
        :rtype: List[int]
        """
        messages: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread(thread)
        if not messages:
            return []

        # Extract the turns once and index them directly; no second extraction pass is needed.
        turns = SKAgentConverter._extract_turns_from_messages(messages)
        return list(range(len(turns)))

    @staticmethod
    def _get_turn_indices(messages: List[ChatMessageContent]) -> List[int]:
        """Return the valid (0-based) turn indices for a message list.

        :param messages: The conversation messages in order.
        :type messages: List[ChatMessageContent]
        :return: ``[0, 1, ..., number_of_turns - 1]``.
        :rtype: List[int]
        """
        return list(range(len(SKAgentConverter._extract_turns_from_messages(messages))))
@@ -38,7 +38,7 @@ from azure.ai.evaluation import (
38
38
  TaskAdherenceEvaluator,
39
39
  ToolCallAccuracyEvaluator,
40
40
  UngroundedAttributesEvaluator,
41
- ViolenceEvaluator
41
+ ViolenceEvaluator,
42
42
  )
43
43
 
44
44
  EVAL_CLASS_MAP = {
@@ -70,4 +70,4 @@ EVAL_CLASS_MAP = {
70
70
  ToolCallAccuracyEvaluator: "tool_call_accuracy",
71
71
  UngroundedAttributesEvaluator: "ungrounded_attributes",
72
72
  ViolenceEvaluator: "violence",
73
- }
73
+ }
@@ -26,8 +26,8 @@ class RunSubmitterClient:
26
26
  def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
27
27
  self._config = config or BatchEngineConfig(LOGGER, use_async=True)
28
28
  self._thread_pool = ThreadPoolExecutorWithContext(
29
- thread_name_prefix="evaluators_thread",
30
- max_workers=self._config.max_concurrency)
29
+ thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
30
+ )
31
31
 
32
32
  def run(
33
33
  self,
@@ -67,7 +67,7 @@ class RunSubmitterClient:
67
67
  created_on=kwargs.pop("created_on", None),
68
68
  storage_creator=kwargs.pop("storage_creator", None),
69
69
  **kwargs,
70
- )
70
+ ),
71
71
  )
72
72
 
73
73
  return run_future
@@ -89,7 +89,7 @@ class RunSubmitterClient:
89
89
  # Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
90
90
  # (i.e. a column view of the data)
91
91
  _update("inputs", run.inputs)
92
- _update("inputs", [{ LINE_NUMBER: i } for i in range(len(run.inputs)) ])
92
+ _update("inputs", [{LINE_NUMBER: i} for i in range(len(run.inputs))])
93
93
  _update("outputs", run.outputs)
94
94
 
95
95
  df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
@@ -21,7 +21,7 @@ from azure.ai.evaluation._constants import (
21
21
  PF_DISABLE_TRACING,
22
22
  )
23
23
 
24
- from ..._user_agent import USER_AGENT
24
+ from ..._user_agent import UserAgentSingleton
25
25
  from .._utils import set_event_loop_policy
26
26
  from .batch_clients import BatchClient
27
27
  from ._run_submitter_client import RunSubmitterClient
@@ -50,7 +50,7 @@ class EvalRunContext:
50
50
  self._original_cwd = os.getcwd()
51
51
 
52
52
  if isinstance(self.client, CodeClient):
53
- ClientUserAgentUtil.append_user_agent(USER_AGENT)
53
+ ClientUserAgentUtil.append_user_agent(UserAgentSingleton().value)
54
54
  inject_openai_api()
55
55
 
56
56
  if isinstance(self.client, ProxyClient):