azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +27 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +39 -5
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +23 -3
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_red_team.py +1887 -0
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/red_team/_utils/constants.py +65 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
- azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from azure.ai.projects.models import RunStepFunctionToolCall
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional, Union
|
|
9
|
+
|
|
10
|
+
# Message role constants: the values used for the ``role`` field of the message models below.
_SYSTEM = "system"
_USER = "user"
_AGENT = "assistant"
_TOOL = "tool"

# Constant definitions for what tool details include.
# _TOOL_CALL / _TOOL_RESULT are the "type" values of the content objects built for tool messages;
# _FUNCTION is the attribute name probed on tool call details to detect custom function calls.
_TOOL_CALL = "tool_call"
_TOOL_RESULT = "tool_result"
_FUNCTION = "function"

# This is returned by AI services in the API to filter against tool invocations.
_TOOL_CALLS = "tool_calls"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Message(BaseModel):
    """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
    to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
    the standard pydantic models provided by OpenAI.

    :param createdAt: The timestamp when the message was created, given as a datetime or an integer. Optional.
    :type createdAt: Optional[Union[datetime.datetime, int]]
    :param run_id: The ID of the run associated with the message. Optional.
    :type run_id: Optional[str]
    :param tool_call_id: The ID of the tool call associated with the message. Optional.
    :type tool_call_id: Optional[str]
    :param role: The role of the message sender (e.g., system, user, tool, assistant).
    :type role: str
    :param content: The content of the message, which can be a string or a list of dictionaries.
    :type content: Union[str, List[dict]]
    """

    # NOTE: keep the declaration order; it is the order in which fields are serialized.
    createdAt: Optional[Union[datetime.datetime, int]] = None  # SystemMessage wouldn't have this
    run_id: Optional[str] = None
    tool_call_id: Optional[str] = None  # see ToolMessage
    role: str
    content: Union[str, List[dict]]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class SystemMessage(Message):
    """Represents a system message in a conversation with agents, assistants, and tools.

    :param role: The role of the message sender; defaults to 'system'.
    :type role: str
    """

    # Overrides the required base-class field with a default value.
    role: str = _SYSTEM
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class UserMessage(Message):
    """Represents a user message in a conversation with agents, assistants, and tools.

    :param role: The role of the message sender; defaults to 'user'.
    :type role: str
    """

    # Overrides the required base-class field with a default value.
    role: str = _USER
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ToolMessage(Message):
    """Represents a tool message in a conversation with agents, assistants, and tools.

    :param run_id: The ID of the run associated with the message. Required for tool messages.
    :type run_id: str
    :param role: The role of the message sender; defaults to 'tool'.
    :type role: str
    :param tool_call_id: The ID of the tool call associated with the message. Optional.
    :type tool_call_id: Optional[str]
    """

    run_id: str  # required here, whereas Message declares it as optional
    role: str = _TOOL
    tool_call_id: Optional[str] = None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class AssistantMessage(Message):
    """Represents an assistant message.

    :param run_id: The ID of the run associated with the message. Required for assistant messages.
    :type run_id: str
    :param role: The role of the message sender; defaults to 'assistant'.
    :type role: str
    """

    run_id: str  # required here, whereas Message declares it as optional
    role: str = _AGENT
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class ToolDefinition(BaseModel):
    """Represents a tool definition that will be used in the agent.

    :param name: The name of the tool.
    :type name: str
    :param description: A description of the tool. Optional.
    :type description: Optional[str]
    :param parameters: The parameters required by the tool.
    :type parameters: dict
    """

    name: str
    description: Optional[str] = None
    parameters: dict
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ToolCall:
    """Intermediate holder for a single tool call, used while converting a run into messages.

    :param created: The timestamp when the tool call was created.
    :type created: datetime.datetime
    :param completed: The timestamp when the tool call was completed.
    :type completed: datetime.datetime
    :param details: The details of the tool call.
    :type details: RunStepFunctionToolCall
    """

    def __init__(self, created: datetime.datetime, completed: datetime.datetime, details: RunStepFunctionToolCall):
        # Plain attribute storage: the values are kept as given, without validation or copying.
        self.created, self.completed, self.details = created, completed, details
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class EvaluatorData(BaseModel):
    """Represents the result of a conversion.

    :param query: A list of messages representing the system message, chat history, and user query.
    :type query: List[Message]
    :param response: A list of messages representing the assistant's response, including tool calls and results.
    :type response: List[Message]
    :param tool_definitions: A list of tool definitions used in the agent.
    :type tool_definitions: List[ToolDefinition]
    """

    query: List[Message]
    response: List[Message]
    tool_definitions: List[ToolDefinition]

    def to_json(self) -> str:
        """Converts the result to a JSON string.

        Fields whose value is ``None`` are left out of the output.

        :return: The JSON representation of the result.
        :rtype: str
        """
        # exclude={} excludes nothing; exclude_none drops fields that are None (e.g. an unset run_id).
        return self.model_dump_json(exclude={}, exclude_none=True)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
    """
    Breaks a tool call into a list of messages, including the tool call and its result.

    The first message is an assistant message describing the call. When an output can be retrieved, a
    second tool message carrying the result follows. Built-in tool types that are not recognized are
    skipped and yield an empty list; a built-in call whose output cannot be read yields only the
    assistant message.

    :param tool_call: The tool call to be broken into messages.
    :type tool_call: ToolCall
    :param run_id: The ID of the run associated with the messages.
    :type run_id: str
    :return: A list of messages representing the tool call and its result.
    :rtype: List[Message]
    """
    # We will use this as our accumulator.
    messages: List[Message] = []

    # Custom function calls expose a "function" attribute; built-in tools do not.
    is_function_call = hasattr(tool_call.details, _FUNCTION)

    # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding
    # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at
    # all in most of the cases, and bing would only show the API URL, without arguments or results.
    # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
    # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
    if is_function_call:
        # This is the internals of the content object that will be included with the tool call.
        tool_call_id = tool_call.details.id
        content_tool_call = {
            "type": _TOOL_CALL,
            "tool_call_id": tool_call_id,
            "name": tool_call.details.function.name,
            "arguments": safe_loads(tool_call.details.function.arguments),
        }
    else:
        # Treat built-in tools separately. Object models may be unique so handle each case separately.
        # Just converting to dicts here rather than custom serializers for simplicity for now.
        # Don't fail if we run into a newly seen tool, just skip.
        if tool_call.details["type"] == "code_interpreter":
            arguments = {"input": tool_call.details.code_interpreter.input}
        elif tool_call.details["type"] == "bing_grounding":
            arguments = {"requesturl": tool_call.details["bing_grounding"]["requesturl"]}
        elif tool_call.details["type"] == "file_search":
            options = tool_call.details["file_search"]["ranking_options"]
            arguments = {
                "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
            }
        else:
            # unsupported tool type, skip
            return messages
        try:
            tool_call_id = tool_call.details.id
            content_tool_call = {
                "type": _TOOL_CALL,
                "tool_call_id": tool_call_id,
                "name": tool_call.details.type,
                "arguments": arguments,
            }
        except Exception:  # pylint: disable=broad-except
            # Best effort: skip details we cannot read, but let KeyboardInterrupt/SystemExit propagate.
            return messages

    # We format it into an assistant message, where the content is a singleton list of the content object.
    # It should be a tool message, since this is the call, but the given schema treats this message as
    # assistant's action of calling the tool.
    messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))

    if is_function_call:
        output = safe_loads(tool_call.details.function.output)
    else:
        try:
            # Some built-ins may have output, others may not.
            # Try to retrieve it, but if we don't find anything, skip adding the message.
            # Just manually converting to dicts for easy serialization for now rather than custom serializers.
            if tool_call.details.type == "code_interpreter":
                output = tool_call.details.code_interpreter.outputs
            elif tool_call.details.type == "bing_grounding":
                return messages  # not supported yet from bing grounding tool
            elif tool_call.details.type == "file_search":
                output = [
                    {
                        "file_id": result.file_id,
                        "file_name": result.file_name,
                        "score": result.score,
                        "content": result.content,
                    }
                    for result in tool_call.details.file_search.results
                ]
            else:
                # No known output shape for this type; never leave `output` unbound.
                return messages
        except Exception:  # pylint: disable=broad-except
            # Best effort: keep the call message and omit a result we cannot read.
            return messages

    # Now, onto the tool result, which only includes the result of the function call.
    content_tool_call_result = {"type": _TOOL_RESULT, _TOOL_RESULT: output}

    # Since this is a tool's action of returning, we put it as a tool message.
    messages.append(
        ToolMessage(
            run_id=run_id,
            tool_call_id=tool_call_id,
            content=[to_dict(content_tool_call_result)],
            createdAt=tool_call.completed,
        )
    )
    return messages
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def to_dict(obj) -> dict:
    """
    Builds a plain-JSON copy of an object by serializing it and parsing the result back.

    :param obj: The object to be converted; it must be serializable by ``json.dumps``.
    :type obj: Any
    :return: The value obtained by round-tripping the object through JSON.
    :rtype: dict
    """
    serialized = json.dumps(obj)
    return json.loads(serialized)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def safe_loads(data: str) -> Union[dict, str]:
    """
    Safely loads a JSON string into a Python object or returns the original value if loading fails.

    Loading fails, and the input is returned unchanged, when the text is not valid JSON or when the
    input is not a string at all (for example ``None`` for a tool call that has no output).

    :param data: The JSON string to be loaded.
    :type data: str
    :return: The parsed JSON value, or the original input when it cannot be parsed.
    :rtype: Union[dict, str]
    """
    try:
        return json.loads(data)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: malformed JSON text. TypeError: input is not str/bytes/bytearray.
        return data
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def convert_message(msg: dict) -> Message:
    """
    Builds the Message subclass that matches the ``role`` entry of a message dictionary.

    :param msg: The message dictionary.
    :type msg: dict
    :return: The Message object.
    :rtype: Message
    :raises ValueError: If the role is not one of system, user, assistant, or tool.
    """
    sender = msg["role"]

    if sender == "system":
        # System messages carry only their content, coerced to a string.
        return SystemMessage(content=str(msg["content"]))

    if sender == "user":
        return UserMessage(content=msg["content"], createdAt=msg["createdAt"])

    if sender == "assistant":
        return AssistantMessage(run_id=str(msg["run_id"]), content=msg["content"], createdAt=msg["createdAt"])

    if sender == "tool":
        return ToolMessage(
            run_id=str(msg["run_id"]),
            tool_call_id=str(msg["tool_call_id"]),
            content=msg["content"],
            createdAt=msg["createdAt"],
        )

    raise ValueError(f"Unknown role: {sender}")
|
|
@@ -3,8 +3,15 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
from .eval_run_context import EvalRunContext
|
|
5
5
|
from .code_client import CodeClient
|
|
6
|
-
from .proxy_client import ProxyClient
|
|
6
|
+
from .proxy_client import ProxyClient, ProxyRun
|
|
7
|
+
from ._run_submitter_client import RunSubmitterClient
|
|
7
8
|
from .target_run_context import TargetRunContext
|
|
8
|
-
from .proxy_client import ProxyRun
|
|
9
9
|
|
|
10
|
-
__all__ = [
|
|
10
|
+
__all__ = [
|
|
11
|
+
"CodeClient",
|
|
12
|
+
"ProxyClient",
|
|
13
|
+
"EvalRunContext",
|
|
14
|
+
"TargetRunContext",
|
|
15
|
+
"ProxyRun",
|
|
16
|
+
"RunSubmitterClient",
|
|
17
|
+
]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import sys
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
10
|
+
from os import PathLike
|
|
11
|
+
from typing import Any, Callable, Dict, Final, List, Mapping, Optional, Sequence, Union, cast
|
|
12
|
+
|
|
13
|
+
from .batch_clients import BatchClientRun, HasAsyncCallable
|
|
14
|
+
from ..._legacy._batch_engine._run_submitter import RunSubmitter
|
|
15
|
+
from ..._legacy._batch_engine._config import BatchEngineConfig
|
|
16
|
+
from ..._legacy._batch_engine._run import Run
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
LOGGER = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RunSubmitterClient:
    """Batch client that submits evaluator runs to a ``RunSubmitter`` on a thread pool."""

    def __init__(self, config: Optional[BatchEngineConfig] = None) -> None:
        """Store the engine configuration (building one when none is given) and create the thread pool.

        :param config: The batch engine configuration to use. Optional.
        :type config: Optional[BatchEngineConfig]
        """
        if not config:
            config = BatchEngineConfig(LOGGER, use_async=True)
        self._config = config
        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

    def run(
        self,
        flow: Callable,
        data: Union[str, PathLike, pd.DataFrame],
        column_mapping: Optional[Dict[str, str]] = None,
        evaluator_name: Optional[str] = None,
        **kwargs: Any,
    ) -> BatchClientRun:
        """Submit the flow for execution over the rows of the data frame.

        :param flow: The callable to run; objects providing ``_to_async`` are converted first.
        :type flow: Callable
        :param data: The input data; only a pandas DataFrame is accepted here.
        :type data: Union[str, PathLike, pd.DataFrame]
        :param column_mapping: The column mapping to use; must be non-empty.
        :type column_mapping: Optional[Dict[str, str]]
        :param evaluator_name: Passed to the submitter as the run name prefix.
        :type evaluator_name: Optional[str]
        :return: The future returned by the thread pool for the submitted run.
        :rtype: BatchClientRun
        :raises ValueError: If data is not a DataFrame or no column mapping is provided.
        """
        if not isinstance(data, pd.DataFrame):
            # Should never get here
            raise ValueError("Data must be a pandas DataFrame")
        if not column_mapping:
            raise ValueError("Column mapping must be provided")

        # The column mappings are indexed by data to indicate they come from the data
        # input. Wrap every record in a dictionary whose "data" key holds the original
        # input row.
        records = data.to_dict(orient="records")
        inputs = [{"data": record} for record in records]

        # always uses async behind the scenes
        if isinstance(flow, HasAsyncCallable):
            flow = flow._to_async()  # pylint: disable=protected-access

        submitter = RunSubmitter(self._config)

        # These two are passed explicitly, so take them out before forwarding the rest.
        created_on = kwargs.pop("created_on", None)
        storage_creator = kwargs.pop("storage_creator", None)

        return self._thread_pool.submit(
            submitter.submit,
            dynamic_callable=flow,
            inputs=inputs,
            column_mapping=column_mapping,
            name_prefix=evaluator_name,
            created_on=created_on,
            storage_creator=storage_creator,
            **kwargs,
        )

    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
        """Collect the inputs and outputs of a run into a data frame.

        :param client_run: The run to get the details of.
        :type client_run: BatchClientRun
        :param all_results: Whether to include every line instead of the configured default number.
        :type all_results: bool
        :return: A frame with one ``inputs.<key>`` / ``outputs.<key>`` column per key encountered.
        :rtype: pd.DataFrame
        """
        run = self._get_run(client_run)

        limit: Final[int] = sys.maxsize if all_results else self._config.default_num_results
        table: Dict[str, List[Any]] = defaultdict(list)

        def _collect(prefix: str, rows: Sequence[Mapping[str, Any]]) -> None:
            # Append each value of the first `limit` rows to its prefixed column.
            for position, row in enumerate(rows):
                if position >= limit:
                    return
                for name, value in row.items():
                    table[f"{prefix}.{name}"].append(value)

        _collect("inputs", run.inputs)
        _collect("outputs", run.outputs)

        return pd.DataFrame(table).reindex(columns=list(table))

    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Return a dictionary copy of the run's metrics.

        :param client_run: The run to get the metrics of.
        :type client_run: BatchClientRun
        :return: The metrics of the run.
        :rtype: Dict[str, Any]
        """
        return dict(self._get_run(client_run).metrics)

    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Summarize the status, duration, and line counts of a run.

        :param client_run: The run to get the summary of.
        :type client_run: BatchClientRun
        :return: The summary of the run.
        :rtype: Dict[str, Any]
        """
        run = self._get_run(client_run)

        # Without a result, both counts are reported as zero.
        result = run.result
        if result:
            total_lines = result.total_lines
            failed_lines = result.failed_lines
        else:
            total_lines = 0
            failed_lines = 0

        return {
            "status": run.status.value,
            "duration": str(run.duration),
            "completed_lines": total_lines - failed_lines,
            "failed_lines": failed_lines,
            # "log_path": "",
        }

    @staticmethod
    def _get_run(run: BatchClientRun) -> Run:
        """Wait for the submitted future and return the finished run."""
        future = cast(Future[Run], run)
        return future.result()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import pandas
|
|
6
|
+
from os import PathLike
|
|
7
|
+
from typing import Any, Awaitable, Callable, Dict, Optional, Protocol, Union, runtime_checkable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BatchClientRun(Protocol):
    """Marker protocol for a batch client run; it declares no members."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@runtime_checkable
class HasAsyncCallable(Protocol):
    """Structural type for objects exposing ``_to_async``, which returns a callable producing an awaitable."""

    def _to_async(self) -> Callable[[Any, Any], Awaitable[Any]]:
        ...
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BatchClient(Protocol):
    """The protocol for the batch client. This allows for running a flow on a data source
    and getting the details of the run."""

    def run(
        self,
        flow: Callable,
        data: Union[str, PathLike, pandas.DataFrame],
        column_mapping: Optional[Dict[str, str]] = None,
        evaluator_name: Optional[str] = None,
        **kwargs: Any,
    ) -> BatchClientRun:
        """Run the given flow on the data with the given column mapping.

        :param flow: The flow to run.
        :type flow: Union[Callable, HasAsyncCallable]
        :param data: The JSONL file containing the data to run the flow on,
            or the loaded data
        :type data: Union[str, PathLike, pandas.DataFrame]
        :param column_mapping: The column mapping to use.
        :type column_mapping: Optional[Dict[str, str]]
        :param evaluator_name: The name of the evaluator being run.
        :type evaluator_name: Optional[str]
        :param kwargs: Additional keyword arguments to pass to the flow.
        :return: The result of the batch client run.
        :rtype: BatchClientRun
        """
        ...

    def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pandas.DataFrame:
        """Get the details of the run.

        :param client_run: The run to get the details of.
        :type client_run: BatchClientRun
        :param all_results: Whether to get all results.
        :type all_results: bool
        :return: The details of the run.
        :rtype: pandas.DataFrame
        """
        ...

    def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Get the metrics of the run.

        :param client_run: The run to get the metrics of.
        :type client_run: BatchClientRun
        :return: The metrics of the run.
        :rtype: Dict[str, Any]
        """
        ...

    def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
        """Get the summary of the run.

        :param client_run: The run to get the summary of.
        :type client_run: BatchClientRun
        :return: The summary of the run.
        :rtype: Dict[str, Any]
        """
        ...
|
|
@@ -6,17 +6,17 @@ import json
|
|
|
6
6
|
import logging
|
|
7
7
|
import os
|
|
8
8
|
from concurrent.futures import Future
|
|
9
|
-
from
|
|
10
|
-
from typing import Any, Callable, Dict, Optional, Union, cast
|
|
9
|
+
from typing import Any, Callable, Dict, Optional, Sequence, Union, cast
|
|
11
10
|
|
|
12
11
|
import pandas as pd
|
|
13
|
-
from
|
|
14
|
-
from
|
|
12
|
+
from azure.ai.evaluation._legacy._adapters.types import AttrDict
|
|
13
|
+
from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
|
|
15
14
|
|
|
16
15
|
from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
|
|
17
16
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
18
17
|
|
|
19
18
|
from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
|
|
19
|
+
from .batch_clients import BatchClientRun
|
|
20
20
|
|
|
21
21
|
LOGGER = logging.getLogger(__name__)
|
|
22
22
|
|
|
@@ -84,7 +84,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
84
84
|
for param in inspect.signature(evaluator).parameters.values()
|
|
85
85
|
if param.name not in ["args", "kwargs"]
|
|
86
86
|
}
|
|
87
|
-
for value in input_df.to_dict("records"):
|
|
87
|
+
for value in cast(Sequence[Dict[str, Any]], input_df.to_dict("records")):
|
|
88
88
|
# Filter out only the parameters that are present in the input data
|
|
89
89
|
# if no parameters then pass data as is
|
|
90
90
|
filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
|
|
@@ -133,10 +133,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
133
133
|
def run(
|
|
134
134
|
self, # pylint: disable=unused-argument
|
|
135
135
|
flow: Callable,
|
|
136
|
-
data: Union[os.PathLike,
|
|
137
|
-
evaluator_name: Optional[str] = None,
|
|
136
|
+
data: Union[str, os.PathLike, pd.DataFrame],
|
|
138
137
|
column_mapping: Optional[Dict[str, str]] = None,
|
|
139
|
-
|
|
138
|
+
evaluator_name: Optional[str] = None,
|
|
139
|
+
**kwargs: Any,
|
|
140
140
|
) -> CodeRun:
|
|
141
141
|
input_df = data
|
|
142
142
|
if not isinstance(input_df, pd.DataFrame):
|
|
@@ -157,7 +157,7 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
157
157
|
evaluator=flow,
|
|
158
158
|
input_df=input_df,
|
|
159
159
|
column_mapping=column_mapping,
|
|
160
|
-
evaluator_name=evaluator_name,
|
|
160
|
+
evaluator_name=evaluator_name or "",
|
|
161
161
|
)
|
|
162
162
|
|
|
163
163
|
return CodeRun(
|
|
@@ -169,11 +169,13 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
169
169
|
),
|
|
170
170
|
)
|
|
171
171
|
|
|
172
|
-
def get_details(self,
|
|
172
|
+
def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
|
|
173
|
+
run = self._get_result(client_run)
|
|
173
174
|
result_df = run.get_result_df(exclude_inputs=not all_results)
|
|
174
175
|
return result_df
|
|
175
176
|
|
|
176
|
-
def get_metrics(self,
|
|
177
|
+
def get_metrics(self, client_run: BatchClientRun) -> Dict[str, Any]:
|
|
178
|
+
run = self._get_result(client_run)
|
|
177
179
|
try:
|
|
178
180
|
aggregated_metrics = run.get_aggregated_metrics()
|
|
179
181
|
print("Aggregated metrics")
|
|
@@ -183,6 +185,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
183
185
|
return {}
|
|
184
186
|
return aggregated_metrics
|
|
185
187
|
|
|
186
|
-
def get_run_summary(self,
|
|
188
|
+
def get_run_summary(self, client_run: BatchClientRun) -> Any: # pylint: disable=unused-argument
|
|
187
189
|
# Not implemented
|
|
188
190
|
return None
|
|
191
|
+
|
|
192
|
+
@staticmethod
|
|
193
|
+
def _get_result(run: BatchClientRun) -> CodeRun:
|
|
194
|
+
return cast(CodeRun, run)
|
|
@@ -5,9 +5,9 @@ import os
|
|
|
5
5
|
import types
|
|
6
6
|
from typing import Optional, Type, Union
|
|
7
7
|
|
|
8
|
-
from
|
|
9
|
-
from
|
|
10
|
-
from
|
|
8
|
+
from azure.ai.evaluation._legacy._adapters._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
|
|
9
|
+
from azure.ai.evaluation._legacy._adapters.utils import ClientUserAgentUtil
|
|
10
|
+
from azure.ai.evaluation._legacy._adapters.tracing import inject_openai_api, recover_openai_api
|
|
11
11
|
|
|
12
12
|
from azure.ai.evaluation._constants import (
|
|
13
13
|
OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
|
|
@@ -19,6 +19,8 @@ from azure.ai.evaluation._constants import (
|
|
|
19
19
|
|
|
20
20
|
from ..._user_agent import USER_AGENT
|
|
21
21
|
from .._utils import set_event_loop_policy
|
|
22
|
+
from .batch_clients import BatchClient
|
|
23
|
+
from ._run_submitter_client import RunSubmitterClient
|
|
22
24
|
from .code_client import CodeClient
|
|
23
25
|
from .proxy_client import ProxyClient
|
|
24
26
|
|
|
@@ -33,7 +35,7 @@ class EvalRunContext:
|
|
|
33
35
|
]
|
|
34
36
|
"""
|
|
35
37
|
|
|
36
|
-
def __init__(self, client:
|
|
38
|
+
def __init__(self, client: BatchClient) -> None:
|
|
37
39
|
self.client = client
|
|
38
40
|
self._is_batch_timeout_set_by_system = False
|
|
39
41
|
self._is_otel_timeout_set_by_system = False
|
|
@@ -64,6 +66,9 @@ class EvalRunContext:
|
|
|
64
66
|
# For addressing the issue of asyncio event loop closed on Windows
|
|
65
67
|
set_event_loop_policy()
|
|
66
68
|
|
|
69
|
+
if isinstance(self.client, RunSubmitterClient):
|
|
70
|
+
set_event_loop_policy()
|
|
71
|
+
|
|
67
72
|
def __exit__(
|
|
68
73
|
self,
|
|
69
74
|
exc_type: Optional[Type[BaseException]],
|