azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1622 -765
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +6 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
|
@@ -718,6 +718,7 @@ class AIAgentConverter:
|
|
|
718
718
|
|
|
719
719
|
return AIAgentConverter._convert_from_conversation(data, run_id)
|
|
720
720
|
|
|
721
|
+
|
|
721
722
|
@experimental
|
|
722
723
|
class AIAgentDataRetriever:
|
|
723
724
|
# Maximum items to fetch in a single AI Services API call (imposed by the service).
|
|
@@ -748,6 +749,7 @@ class AIAgentDataRetriever:
|
|
|
748
749
|
def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
|
|
749
750
|
pass
|
|
750
751
|
|
|
752
|
+
|
|
751
753
|
@experimental
|
|
752
754
|
class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
753
755
|
|
|
@@ -768,7 +770,8 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
|
768
770
|
after = None
|
|
769
771
|
while has_more:
|
|
770
772
|
messages = self.project_client.agents.list_messages(
|
|
771
|
-
|
|
773
|
+
thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
|
|
774
|
+
)
|
|
772
775
|
has_more = messages.has_more
|
|
773
776
|
after = messages.last_id
|
|
774
777
|
if messages.data:
|
|
@@ -812,6 +815,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
|
|
|
812
815
|
def _get_run(self, thread_id: str, run_id: str):
|
|
813
816
|
return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
|
|
814
817
|
|
|
818
|
+
|
|
815
819
|
@experimental
|
|
816
820
|
class FDPAgentDataRetriever(AIAgentDataRetriever):
|
|
817
821
|
|
|
@@ -833,16 +837,13 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
|
|
|
833
837
|
|
|
834
838
|
def _list_run_steps_chronological(self, thread_id: str, run_id: str):
|
|
835
839
|
|
|
836
|
-
return
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
limit=self._AI_SERVICES_API_MAX_LIMIT,
|
|
840
|
-
order="asc"
|
|
841
|
-
)
|
|
840
|
+
return self.project_client.agents.run_steps.list(
|
|
841
|
+
thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
|
|
842
|
+
)
|
|
842
843
|
|
|
843
844
|
def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
|
|
844
845
|
runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
|
|
845
846
|
return [run.id for run in runs]
|
|
846
847
|
|
|
847
848
|
def _get_run(self, thread_id: str, run_id: str):
|
|
848
|
-
return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
|
|
849
|
+
return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
|
|
@@ -20,6 +20,7 @@ _SYSTEM = "system"
|
|
|
20
20
|
_USER = "user"
|
|
21
21
|
_AGENT = "assistant"
|
|
22
22
|
_TOOL = "tool"
|
|
23
|
+
_DEVELOPER = "developer" # part of the semantic kernel
|
|
23
24
|
|
|
24
25
|
# Constant definitions for what tool details include.
|
|
25
26
|
_TOOL_CALL = "tool_call"
|
|
@@ -81,6 +82,7 @@ _BUILT_IN_PARAMS = {
|
|
|
81
82
|
},
|
|
82
83
|
}
|
|
83
84
|
|
|
85
|
+
|
|
84
86
|
class Message(BaseModel):
|
|
85
87
|
"""Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
|
|
86
88
|
to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
|
|
@@ -123,6 +125,17 @@ class UserMessage(Message):
|
|
|
123
125
|
role: str = _USER
|
|
124
126
|
|
|
125
127
|
|
|
128
|
+
class SKDeveloperMessage(Message):
|
|
129
|
+
"""Represents a developer message in a conversation with agents, assistants, and tools.
|
|
130
|
+
This is used in the context of Semantic Kernel (SK) agents.
|
|
131
|
+
|
|
132
|
+
:param role: The role of the message sender, which is always 'developer'.
|
|
133
|
+
:type role: str
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
role: str = _DEVELOPER
|
|
137
|
+
|
|
138
|
+
|
|
126
139
|
class ToolMessage(Message):
|
|
127
140
|
"""Represents a tool message in a conversation with agents, assistants, and tools.
|
|
128
141
|
|
|
@@ -139,6 +152,19 @@ class ToolMessage(Message):
|
|
|
139
152
|
tool_call_id: Optional[str] = None
|
|
140
153
|
|
|
141
154
|
|
|
155
|
+
class SKToolMessage(Message):
|
|
156
|
+
"""Represents a tool message in the context of a Semantic Kernel (SK) agent.
|
|
157
|
+
|
|
158
|
+
:param role: The role of the message sender, which is always 'tool'.
|
|
159
|
+
:type role: str
|
|
160
|
+
:param tool_call_id: The ID of the tool call associated with the message. Optional.
|
|
161
|
+
:type tool_call_id: Optional[str]
|
|
162
|
+
"""
|
|
163
|
+
|
|
164
|
+
role: str = _TOOL
|
|
165
|
+
tool_call_id: Optional[str] = None
|
|
166
|
+
|
|
167
|
+
|
|
142
168
|
class AssistantMessage(Message):
|
|
143
169
|
"""Represents an assistant message.
|
|
144
170
|
|
|
@@ -152,6 +178,26 @@ class AssistantMessage(Message):
|
|
|
152
178
|
role: str = _AGENT
|
|
153
179
|
|
|
154
180
|
|
|
181
|
+
class SKAssistantMessage(Message):
|
|
182
|
+
"""Represents an assistant message in the context of a Semantic Kernel (SK) agent.
|
|
183
|
+
|
|
184
|
+
:param role: The role of the message sender, which is always 'assistant'.
|
|
185
|
+
:type role: str
|
|
186
|
+
"""
|
|
187
|
+
|
|
188
|
+
role: str = _AGENT
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class SKAssistantMessage(Message):
|
|
192
|
+
"""Represents an assistant message in the context of a Semantic Kernel (SK) agent.
|
|
193
|
+
|
|
194
|
+
:param role: The role of the message sender, which is always 'assistant'.
|
|
195
|
+
:type role: str
|
|
196
|
+
"""
|
|
197
|
+
|
|
198
|
+
role: str = _AGENT
|
|
199
|
+
|
|
200
|
+
|
|
155
201
|
class ToolDefinition(BaseModel):
|
|
156
202
|
"""Represents a tool definition that will be used in the agent.
|
|
157
203
|
|
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from typing import Any, Dict, List, Tuple, Optional
|
|
4
|
+
|
|
5
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
6
|
+
|
|
7
|
+
from semantic_kernel.contents import (
|
|
8
|
+
AuthorRole,
|
|
9
|
+
TextContent,
|
|
10
|
+
FunctionCallContent,
|
|
11
|
+
FunctionResultContent,
|
|
12
|
+
)
|
|
13
|
+
from semantic_kernel.contents.chat_message_content import ChatMessageContent
|
|
14
|
+
|
|
15
|
+
from semantic_kernel.agents import (
|
|
16
|
+
ChatCompletionAgent,
|
|
17
|
+
ChatHistoryAgentThread,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from ._models import (
|
|
21
|
+
Message,
|
|
22
|
+
SystemMessage,
|
|
23
|
+
UserMessage,
|
|
24
|
+
SKToolMessage,
|
|
25
|
+
SKAssistantMessage,
|
|
26
|
+
ToolDefinition,
|
|
27
|
+
ToolCall,
|
|
28
|
+
EvaluatorData,
|
|
29
|
+
SKDeveloperMessage,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@experimental
|
|
34
|
+
class SKAgentConverter:
|
|
35
|
+
"""
|
|
36
|
+
A converter for SK agent data.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(self):
|
|
40
|
+
try:
|
|
41
|
+
import semantic_kernel as sk
|
|
42
|
+
except ImportError as e:
|
|
43
|
+
raise ImportError(
|
|
44
|
+
"semantic_kernel package is not installed. Please install it to use SKAgentConverter."
|
|
45
|
+
) from e
|
|
46
|
+
|
|
47
|
+
@staticmethod
|
|
48
|
+
def _transform_tool_definitions(
|
|
49
|
+
tool_list: List[Dict[str, Any]],
|
|
50
|
+
) -> List[ToolDefinition]:
|
|
51
|
+
"""
|
|
52
|
+
Convert verbose tool definition dicts into the `ToolDefinition`s
|
|
53
|
+
:param tool_list: List of tool definitions to transform.
|
|
54
|
+
:type tool_list: List[Dict[str, Any]]
|
|
55
|
+
:return: Transformed list of tool definitions.
|
|
56
|
+
:rtype: List[Dict[str, Any]]
|
|
57
|
+
"""
|
|
58
|
+
# TODO: Add required and default values when also supported by Foundry's converter
|
|
59
|
+
|
|
60
|
+
final_tools: List[ToolDefinition] = []
|
|
61
|
+
|
|
62
|
+
for tool in tool_list:
|
|
63
|
+
filtered_tool = {
|
|
64
|
+
"name": tool["fully_qualified_name"],
|
|
65
|
+
"description": tool.get("description") or "No description",
|
|
66
|
+
"type": "function", # TODO: hardcoded for now.
|
|
67
|
+
"parameters": {
|
|
68
|
+
"type": "object", # Is this always the case?
|
|
69
|
+
"properties": {}, # Will be filled in below
|
|
70
|
+
},
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
for param in tool.get("parameters", []):
|
|
74
|
+
param_name = param.get("name")
|
|
75
|
+
filtered_tool["parameters"]["properties"][param_name] = {
|
|
76
|
+
"type": param["type_"],
|
|
77
|
+
"description": param.get("description") or "No description",
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
final_tools.append(ToolDefinition(**filtered_tool))
|
|
81
|
+
|
|
82
|
+
return final_tools
|
|
83
|
+
|
|
84
|
+
@staticmethod
|
|
85
|
+
def _get_tool_definitions(agent: ChatCompletionAgent) -> list:
|
|
86
|
+
"""
|
|
87
|
+
Get tool definitions from the agent's plugins.
|
|
88
|
+
:param agent: The ChatCompletionAgent from which to retrieve tool definitions.
|
|
89
|
+
:type agent: ChatCompletionAgent
|
|
90
|
+
:return: A list of tool definitions.
|
|
91
|
+
:rtype: list
|
|
92
|
+
"""
|
|
93
|
+
functions = []
|
|
94
|
+
for plugin in agent.kernel.plugins:
|
|
95
|
+
functions_metadata = agent.kernel.plugins[plugin].get_functions_metadata()
|
|
96
|
+
for function in functions_metadata:
|
|
97
|
+
# Serialize metadata to a dictionary
|
|
98
|
+
function_dict = function.model_dump()
|
|
99
|
+
function_dict["fully_qualified_name"] = function.fully_qualified_name
|
|
100
|
+
# function_dict["type"] = "tool_call"
|
|
101
|
+
functions.append(function_dict)
|
|
102
|
+
|
|
103
|
+
return functions
|
|
104
|
+
|
|
105
|
+
@staticmethod
|
|
106
|
+
def _extract_function_tool_definitions(
|
|
107
|
+
agent: ChatCompletionAgent,
|
|
108
|
+
) -> List[ToolDefinition]:
|
|
109
|
+
"""Get and transform tool definitions from the agent."""
|
|
110
|
+
tool_definitions = SKAgentConverter._get_tool_definitions(agent)
|
|
111
|
+
return SKAgentConverter._transform_tool_definitions(tool_definitions)
|
|
112
|
+
|
|
113
|
+
@staticmethod
|
|
114
|
+
def _is_output_role(role):
|
|
115
|
+
return role in (AuthorRole.ASSISTANT, AuthorRole.TOOL)
|
|
116
|
+
|
|
117
|
+
@staticmethod
|
|
118
|
+
async def _get_messages_from_thread(
|
|
119
|
+
thread: ChatHistoryAgentThread,
|
|
120
|
+
) -> List[ChatMessageContent]:
|
|
121
|
+
"""
|
|
122
|
+
Get messages from a thread.
|
|
123
|
+
:param thread: The ChatHistoryAgentThread to get messages from.
|
|
124
|
+
:type thread: ChatHistoryAgentThread
|
|
125
|
+
:return: A list of ChatMessageContent objects.
|
|
126
|
+
:rtype: List[ChatMessageContent]
|
|
127
|
+
"""
|
|
128
|
+
return [msg async for msg in thread.get_messages()]
|
|
129
|
+
|
|
130
|
+
@staticmethod
|
|
131
|
+
async def _get_messages_from_thread_with_agent(
|
|
132
|
+
thread: ChatHistoryAgentThread,
|
|
133
|
+
agent: ChatCompletionAgent = None,
|
|
134
|
+
) -> List[ChatMessageContent]:
|
|
135
|
+
"""
|
|
136
|
+
Get messages from a thread with agent instructions included as a system message if available.
|
|
137
|
+
:param thread: The ChatHistoryAgentThread to get messages from.
|
|
138
|
+
:type thread: ChatHistoryAgentThread
|
|
139
|
+
:param agent: The ChatCompletionAgent to use.
|
|
140
|
+
:type agent: ChatCompletionAgent
|
|
141
|
+
:return: A list of ChatMessageContent objects.
|
|
142
|
+
:rtype: List[ChatMessageContent]
|
|
143
|
+
"""
|
|
144
|
+
messages: List[ChatMessageContent] = []
|
|
145
|
+
|
|
146
|
+
# If agent is provided, with instructions, add it as a system message
|
|
147
|
+
if agent and agent.instructions:
|
|
148
|
+
messages.append(
|
|
149
|
+
ChatMessageContent(
|
|
150
|
+
role=AuthorRole.SYSTEM,
|
|
151
|
+
items=[TextContent(text=agent.instructions)],
|
|
152
|
+
)
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
thread_messages = await SKAgentConverter._get_messages_from_thread(thread)
|
|
156
|
+
messages.extend(thread_messages)
|
|
157
|
+
|
|
158
|
+
return messages
|
|
159
|
+
|
|
160
|
+
@staticmethod
|
|
161
|
+
async def _convert_thread_to_eval_schema(
|
|
162
|
+
thread: ChatHistoryAgentThread,
|
|
163
|
+
turn_index: int,
|
|
164
|
+
agent: ChatCompletionAgent = None,
|
|
165
|
+
):
|
|
166
|
+
"""
|
|
167
|
+
Convert a thread to the evaluation schema.
|
|
168
|
+
:param thread: The ChatHistoryAgentThread containing the conversation history.
|
|
169
|
+
:type thread: ChatHistoryAgentThread
|
|
170
|
+
:param turn_index: The index of the turn in the conversation.
|
|
171
|
+
:type turn_index: int
|
|
172
|
+
:param agent: The ChatCompletionAgent being evaluated.
|
|
173
|
+
:type agent: ChatCompletionAgent
|
|
174
|
+
:return: A dictionary containing the converted data.
|
|
175
|
+
:rtype: dict
|
|
176
|
+
"""
|
|
177
|
+
|
|
178
|
+
messages: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread_with_agent(
|
|
179
|
+
thread=thread,
|
|
180
|
+
agent=agent,
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
turns = SKAgentConverter._extract_turns_from_messages(messages, turn_index_to_stop=turn_index)
|
|
184
|
+
|
|
185
|
+
if turn_index >= len(turns):
|
|
186
|
+
raise ValueError(f"Turn {turn_index} not found. Only {len(turns)} turns exist.")
|
|
187
|
+
|
|
188
|
+
return turns[turn_index]
|
|
189
|
+
|
|
190
|
+
@staticmethod
|
|
191
|
+
def _extract_turns_from_messages(
|
|
192
|
+
messages: List[ChatMessageContent],
|
|
193
|
+
turn_index_to_stop: Optional[int] = None,
|
|
194
|
+
) -> List[Tuple[List[Message], List[Message]]]:
|
|
195
|
+
turns = []
|
|
196
|
+
query: List[Message] = []
|
|
197
|
+
response: List[Message] = []
|
|
198
|
+
|
|
199
|
+
queued_items = []
|
|
200
|
+
is_queued_output = None
|
|
201
|
+
|
|
202
|
+
for msg in messages:
|
|
203
|
+
curr_items = SKAgentConverter._process_message_items(msg)
|
|
204
|
+
curr_is_output = SKAgentConverter._is_output_role(msg.role)
|
|
205
|
+
|
|
206
|
+
# Handle the first message to initialize the output/input mode
|
|
207
|
+
if is_queued_output is None:
|
|
208
|
+
queued_items.extend(curr_items)
|
|
209
|
+
is_queued_output = curr_is_output
|
|
210
|
+
continue # This means if chat starts with an assistant/tool message, it's a separate turn
|
|
211
|
+
|
|
212
|
+
# Same group: still within the same input/output block
|
|
213
|
+
if is_queued_output == curr_is_output:
|
|
214
|
+
queued_items.extend(curr_items)
|
|
215
|
+
continue
|
|
216
|
+
|
|
217
|
+
# Transition from input → output
|
|
218
|
+
if not is_queued_output and curr_is_output:
|
|
219
|
+
if queued_items:
|
|
220
|
+
query.extend(queued_items)
|
|
221
|
+
queued_items = curr_items
|
|
222
|
+
is_queued_output = True
|
|
223
|
+
continue
|
|
224
|
+
|
|
225
|
+
# Transition from output → input = End of a turn
|
|
226
|
+
if is_queued_output and not curr_is_output:
|
|
227
|
+
# Transition from output to input: end of turn
|
|
228
|
+
response = list(queued_items)
|
|
229
|
+
turns.append((query, response))
|
|
230
|
+
# New turn's query would be the whole previous
|
|
231
|
+
query = list(query) + response
|
|
232
|
+
if turn_index_to_stop is not None and len(turns) > turn_index_to_stop:
|
|
233
|
+
break
|
|
234
|
+
queued_items = curr_items
|
|
235
|
+
is_queued_output = False
|
|
236
|
+
|
|
237
|
+
# Handle if final message(s) are assistant/tool messages
|
|
238
|
+
if queued_items and is_queued_output:
|
|
239
|
+
response = list(queued_items)
|
|
240
|
+
turns.append((query, response))
|
|
241
|
+
|
|
242
|
+
return turns
|
|
243
|
+
|
|
244
|
+
@staticmethod
|
|
245
|
+
def _convert_messages_to_schema_new(
|
|
246
|
+
messages: List[ChatMessageContent], turn_index: int
|
|
247
|
+
) -> Tuple[List[Message], List[Message]]:
|
|
248
|
+
"""
|
|
249
|
+
Converts messages to schema for a specific turn.
|
|
250
|
+
"""
|
|
251
|
+
turns = SKAgentConverter._extract_turns_from_messages(messages, turn_index_to_stop=turn_index)
|
|
252
|
+
if turn_index >= len(turns):
|
|
253
|
+
raise ValueError(f"Turn {turn_index} not found. Only {len(turns)} turns exist.")
|
|
254
|
+
return turns[turn_index]
|
|
255
|
+
|
|
256
|
+
@staticmethod
|
|
257
|
+
def _safe_json_loads(value: Any) -> Any:
|
|
258
|
+
"""Safely parse a JSON string into a Python object, return original if parsing fails."""
|
|
259
|
+
if isinstance(value, str):
|
|
260
|
+
try:
|
|
261
|
+
return json.loads(value)
|
|
262
|
+
except json.JSONDecodeError:
|
|
263
|
+
return value
|
|
264
|
+
return value
|
|
265
|
+
|
|
266
|
+
@staticmethod
|
|
267
|
+
def _process_message_items(message: ChatMessageContent) -> List[Message]:
|
|
268
|
+
"""
|
|
269
|
+
Processes the items in a message and converts them to the specified schema.
|
|
270
|
+
Args:
|
|
271
|
+
message (Any): The message object to process.
|
|
272
|
+
Returns:
|
|
273
|
+
List[Dict[str, Any]]: A list of dictionaries representing the message items in the specified schema.
|
|
274
|
+
"""
|
|
275
|
+
converted_messages = []
|
|
276
|
+
for item in message.items:
|
|
277
|
+
message_dict = {
|
|
278
|
+
"role": message.role.value,
|
|
279
|
+
"content": [], # will be filled in later
|
|
280
|
+
}
|
|
281
|
+
if "created" in message.metadata:
|
|
282
|
+
message_dict["createdAt"] = SKAgentConverter._convert_timestamp_to_iso(message.metadata["created"])
|
|
283
|
+
if isinstance(item, TextContent):
|
|
284
|
+
item_text = item.to_dict()["text"]
|
|
285
|
+
if message.role == AuthorRole.SYSTEM: # to match other converters
|
|
286
|
+
message_dict["content"] = item_text
|
|
287
|
+
else:
|
|
288
|
+
message_dict["content"] = [{"type": "text", "text": item_text}]
|
|
289
|
+
|
|
290
|
+
elif isinstance(item, FunctionCallContent):
|
|
291
|
+
item_dict = item.to_dict()
|
|
292
|
+
item_func = item_dict["function"]
|
|
293
|
+
arguments = SKAgentConverter._safe_json_loads(item_func["arguments"])
|
|
294
|
+
|
|
295
|
+
message_dict["content"].append(
|
|
296
|
+
{
|
|
297
|
+
"type": "tool_call",
|
|
298
|
+
"tool_call_id": item_dict.get("id", None),
|
|
299
|
+
"name": item_func["name"],
|
|
300
|
+
"arguments": arguments,
|
|
301
|
+
}
|
|
302
|
+
)
|
|
303
|
+
elif isinstance(item, FunctionResultContent):
|
|
304
|
+
item_dict = item.to_dict()
|
|
305
|
+
message_dict["tool_call_id"] = item_dict.get("tool_call_id", None)
|
|
306
|
+
|
|
307
|
+
item_content = SKAgentConverter._safe_json_loads(item_dict["content"])
|
|
308
|
+
|
|
309
|
+
message_dict["content"].append(
|
|
310
|
+
{
|
|
311
|
+
"type": "tool_result",
|
|
312
|
+
"tool_result": item_content,
|
|
313
|
+
}
|
|
314
|
+
)
|
|
315
|
+
else:
|
|
316
|
+
raise Exception(f"Unexpected item type: {type(item)} in message: {message}")
|
|
317
|
+
|
|
318
|
+
if message.role == AuthorRole.SYSTEM:
|
|
319
|
+
convert_message = SystemMessage(**message_dict)
|
|
320
|
+
elif message.role == AuthorRole.USER:
|
|
321
|
+
convert_message = UserMessage(**message_dict)
|
|
322
|
+
elif message.role == AuthorRole.DEVELOPER:
|
|
323
|
+
convert_message = SKDeveloperMessage(**message_dict)
|
|
324
|
+
elif message.role == AuthorRole.ASSISTANT:
|
|
325
|
+
convert_message = SKAssistantMessage(**message_dict)
|
|
326
|
+
elif message.role == AuthorRole.TOOL:
|
|
327
|
+
convert_message = SKToolMessage(**message_dict)
|
|
328
|
+
else:
|
|
329
|
+
raise ValueError(f"Unknown role: {message.role}")
|
|
330
|
+
|
|
331
|
+
converted_messages.append(convert_message)
|
|
332
|
+
return converted_messages
|
|
333
|
+
|
|
334
|
+
@staticmethod
|
|
335
|
+
def is_turn_complete(message: ChatMessageContent) -> bool:
|
|
336
|
+
"""
|
|
337
|
+
Determines if a message completes a turn (assistant provides a response).
|
|
338
|
+
:param message: The message object to check.
|
|
339
|
+
:type message: ChatMessageContent
|
|
340
|
+
:return: True if the message completes a turn, False otherwise.
|
|
341
|
+
:rtype: bool
|
|
342
|
+
"""
|
|
343
|
+
return any(isinstance(item, TextContent) for item in message.items)
|
|
344
|
+
|
|
345
|
+
@staticmethod
|
|
346
|
+
def _convert_timestamp_to_iso(timestamp: float) -> str:
|
|
347
|
+
"""
|
|
348
|
+
Converts a timestamp to ISO format.
|
|
349
|
+
:param timestamp: The timestamp to convert.
|
|
350
|
+
:type timestamp: float
|
|
351
|
+
:return: The timestamp in ISO format.
|
|
352
|
+
:rtype: str
|
|
353
|
+
"""
|
|
354
|
+
created_dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
|
|
355
|
+
return created_dt.isoformat().replace("+00:00", "Z")
|
|
356
|
+
|
|
357
|
+
async def convert(
|
|
358
|
+
self,
|
|
359
|
+
thread: ChatHistoryAgentThread,
|
|
360
|
+
agent: ChatCompletionAgent,
|
|
361
|
+
turn_index: int,
|
|
362
|
+
) -> dict:
|
|
363
|
+
"""Convert the sdk chat completion agent run to a format suitable for evaluation.
|
|
364
|
+
|
|
365
|
+
:param thread: The ChatHistoryAgentThread containing the conversation history.
|
|
366
|
+
:type thread: ChatHistoryAgentThread
|
|
367
|
+
:param agent: The ChatCompletionAgent being evaluated.
|
|
368
|
+
:type agent: ChatCompletionAgent
|
|
369
|
+
:param turn_index: The index of the turn in the conversation.
|
|
370
|
+
:type turn_index: int
|
|
371
|
+
:return: The converted data in dictionary format.
|
|
372
|
+
:rtype: dict
|
|
373
|
+
"""
|
|
374
|
+
|
|
375
|
+
tool_definitions: List[ToolDefinition] = SKAgentConverter._extract_function_tool_definitions(agent)
|
|
376
|
+
|
|
377
|
+
if not thread:
|
|
378
|
+
raise ValueError("Thread cannot be None")
|
|
379
|
+
|
|
380
|
+
query, response = await SKAgentConverter._convert_thread_to_eval_schema(
|
|
381
|
+
thread=thread,
|
|
382
|
+
turn_index=turn_index,
|
|
383
|
+
agent=agent,
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
result = EvaluatorData(
|
|
387
|
+
query=query,
|
|
388
|
+
response=response,
|
|
389
|
+
tool_definitions=tool_definitions,
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
return json.loads(result.to_json())
|
|
393
|
+
|
|
394
|
+
async def prepare_evaluation_data(
|
|
395
|
+
self,
|
|
396
|
+
threads: List[ChatHistoryAgentThread],
|
|
397
|
+
agent: ChatCompletionAgent,
|
|
398
|
+
filename: Optional[str] = None,
|
|
399
|
+
) -> List[dict]:
|
|
400
|
+
"""
|
|
401
|
+
Prepares evaluation data for a list of threads and optionally writes it to a file.
|
|
402
|
+
|
|
403
|
+
:param threads: List of ChatHistoryAgentThread objects.
|
|
404
|
+
:type threads: List[ChatHistoryAgentThread]
|
|
405
|
+
:param agent: The ChatCompletionAgent being evaluated.
|
|
406
|
+
:type agent: ChatCompletionAgent
|
|
407
|
+
:param filename: Optional file path to save evaluation data as JSONL.
|
|
408
|
+
:type filename: Optional[str]
|
|
409
|
+
:return: List of evaluation data dictionaries.
|
|
410
|
+
:rtype: List[dict]
|
|
411
|
+
"""
|
|
412
|
+
|
|
413
|
+
if isinstance(threads, ChatHistoryAgentThread):
|
|
414
|
+
threads = [threads]
|
|
415
|
+
|
|
416
|
+
all_eval_data: List[dict] = []
|
|
417
|
+
|
|
418
|
+
for thread in threads:
|
|
419
|
+
thread_data = await self._prepare_single_thread_evaluation_data(thread, agent)
|
|
420
|
+
all_eval_data.extend(thread_data)
|
|
421
|
+
|
|
422
|
+
if filename:
|
|
423
|
+
with open(filename, "w", encoding="utf-8") as f:
|
|
424
|
+
for item in all_eval_data:
|
|
425
|
+
f.write(json.dumps(item) + "\n")
|
|
426
|
+
|
|
427
|
+
return all_eval_data
|
|
428
|
+
|
|
429
|
+
async def _prepare_single_thread_evaluation_data(
    self,
    thread: ChatHistoryAgentThread,
    agent: ChatCompletionAgent,
) -> List[dict]:
    """
    Builds one evaluation record per turn of a single thread.

    :param thread: The ChatHistoryAgentThread whose messages are converted.
    :type thread: ChatHistoryAgentThread
    :param agent: The ChatCompletionAgent being evaluated.
    :type agent: ChatCompletionAgent
    :return: One dictionary per (query, response) turn, each carrying the agent's
        tool definitions.
    :rtype: List[dict]
    :raises ValueError: If ``thread`` is falsy.
    """
    # Tool definitions are resolved before the thread is validated and the same
    # list is attached to every turn of the thread.
    agent_tools: List[ToolDefinition] = self._extract_function_tool_definitions(agent)

    if not thread:
        raise ValueError("Thread cannot be None")

    history: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread_with_agent(
        thread=thread,
        agent=agent,
    )

    # Round-trip each record through its JSON form so the result is plain dict data.
    return [
        json.loads(
            EvaluatorData(
                query=turn_query,
                response=turn_response,
                tool_definitions=agent_tools,
            ).to_json()
        )
        for turn_query, turn_response in SKAgentConverter._extract_turns_from_messages(history)
    ]
|
|
468
|
+
|
|
469
|
+
@staticmethod
async def _get_thread_turn_indices(thread: ChatHistoryAgentThread) -> List[int]:
    """
    Determines all complete turn indices in a thread.

    :param thread: The ChatHistoryAgentThread to analyze.
    :type thread: ChatHistoryAgentThread
    :return: A list of valid turn indices (0-based); empty when the thread has
        no messages or no turns.
    :rtype: List[int]
    """
    messages: List[ChatMessageContent] = await SKAgentConverter._get_messages_from_thread(thread)
    if not messages:
        return []

    # _get_turn_indices extracts the turns itself and yields [] when there are
    # none, so extracting them here as well would only repeat the same work.
    return SKAgentConverter._get_turn_indices(messages)
|
|
489
|
+
|
|
490
|
+
@staticmethod
def _get_turn_indices(messages: List[ChatMessageContent]) -> List[int]:
    """
    Returns the 0-based indices of the turns found in ``messages``.

    :param messages: The messages to split into turns.
    :type messages: List[ChatMessageContent]
    :return: ``[0, 1, ..., n - 1]`` where ``n`` is the number of extracted turns.
    :rtype: List[int]
    """
    turn_count = len(SKAgentConverter._extract_turns_from_messages(messages))
    return list(range(turn_count))
|
|
@@ -38,7 +38,7 @@ from azure.ai.evaluation import (
|
|
|
38
38
|
TaskAdherenceEvaluator,
|
|
39
39
|
ToolCallAccuracyEvaluator,
|
|
40
40
|
UngroundedAttributesEvaluator,
|
|
41
|
-
ViolenceEvaluator
|
|
41
|
+
ViolenceEvaluator,
|
|
42
42
|
)
|
|
43
43
|
|
|
44
44
|
EVAL_CLASS_MAP = {
|
|
@@ -70,4 +70,4 @@ EVAL_CLASS_MAP = {
|
|
|
70
70
|
ToolCallAccuracyEvaluator: "tool_call_accuracy",
|
|
71
71
|
UngroundedAttributesEvaluator: "ungrounded_attributes",
|
|
72
72
|
ViolenceEvaluator: "violence",
|
|
73
|
-
}
|
|
73
|
+
}
|
|
@@ -26,8 +26,8 @@ class RunSubmitterClient:
|
|
|
26
26
|
def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
|
|
27
27
|
self._config = config or BatchEngineConfig(LOGGER, use_async=True)
|
|
28
28
|
self._thread_pool = ThreadPoolExecutorWithContext(
|
|
29
|
-
thread_name_prefix="evaluators_thread",
|
|
30
|
-
|
|
29
|
+
thread_name_prefix="evaluators_thread", max_workers=self._config.max_concurrency
|
|
30
|
+
)
|
|
31
31
|
|
|
32
32
|
def run(
|
|
33
33
|
self,
|
|
@@ -67,7 +67,7 @@ class RunSubmitterClient:
|
|
|
67
67
|
created_on=kwargs.pop("created_on", None),
|
|
68
68
|
storage_creator=kwargs.pop("storage_creator", None),
|
|
69
69
|
**kwargs,
|
|
70
|
-
)
|
|
70
|
+
),
|
|
71
71
|
)
|
|
72
72
|
|
|
73
73
|
return run_future
|
|
@@ -89,7 +89,7 @@ class RunSubmitterClient:
|
|
|
89
89
|
# Go from a list of dictionaries (i.e. a row view of the data) to a dictionary of lists
|
|
90
90
|
# (i.e. a column view of the data)
|
|
91
91
|
_update("inputs", run.inputs)
|
|
92
|
-
_update("inputs", [{
|
|
92
|
+
_update("inputs", [{LINE_NUMBER: i} for i in range(len(run.inputs))])
|
|
93
93
|
_update("outputs", run.outputs)
|
|
94
94
|
|
|
95
95
|
df = pd.DataFrame(data).reindex(columns=[k for k in data.keys()])
|
|
@@ -21,7 +21,7 @@ from azure.ai.evaluation._constants import (
|
|
|
21
21
|
PF_DISABLE_TRACING,
|
|
22
22
|
)
|
|
23
23
|
|
|
24
|
-
from ..._user_agent import
|
|
24
|
+
from ..._user_agent import UserAgentSingleton
|
|
25
25
|
from .._utils import set_event_loop_policy
|
|
26
26
|
from .batch_clients import BatchClient
|
|
27
27
|
from ._run_submitter_client import RunSubmitterClient
|
|
@@ -50,7 +50,7 @@ class EvalRunContext:
|
|
|
50
50
|
self._original_cwd = os.getcwd()
|
|
51
51
|
|
|
52
52
|
if isinstance(self.client, CodeClient):
|
|
53
|
-
ClientUserAgentUtil.append_user_agent(
|
|
53
|
+
ClientUserAgentUtil.append_user_agent(UserAgentSingleton().value)
|
|
54
54
|
inject_openai_api()
|
|
55
55
|
|
|
56
56
|
if isinstance(self.client, ProxyClient):
|