agentscope-runtime 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentscope_runtime/engine/services/context_manager.py +28 -1
- agentscope_runtime/engine/services/rag_service.py +101 -0
- agentscope_runtime/sandbox/box/training_box/env_service.py +1 -1
- agentscope_runtime/sandbox/box/training_box/environments/bfcl/bfcl_dataprocess.py +216 -0
- agentscope_runtime/sandbox/box/training_box/environments/bfcl/bfcl_env.py +380 -0
- agentscope_runtime/sandbox/box/training_box/environments/bfcl/env_handler.py +934 -0
- agentscope_runtime/sandbox/box/training_box/training_box.py +139 -9
- agentscope_runtime/sandbox/enums.py +2 -0
- agentscope_runtime/sandbox/manager/container_clients/docker_client.py +19 -9
- agentscope_runtime/sandbox/manager/container_clients/kubernetes_client.py +61 -6
- agentscope_runtime/sandbox/manager/sandbox_manager.py +95 -35
- agentscope_runtime/sandbox/manager/server/app.py +41 -4
- agentscope_runtime/sandbox/model/__init__.py +1 -5
- agentscope_runtime/sandbox/model/manager_config.py +2 -13
- agentscope_runtime/version.py +1 -1
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/METADATA +6 -1
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/RECORD +21 -17
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/WHEEL +0 -0
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/entry_points.txt +0 -0
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,934 @@
+# -*- coding: utf-8 -*-
+import json
+import os
+from typing import Dict, List, Any
+import warnings
+import tempfile
+from pathlib import Path
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+    execute_multi_turn_func_call,
+    is_empty_execute_response,
+)
+from bfcl_eval.model_handler.utils import (
+    convert_to_tool,
+    default_decode_execute_prompting,
+)
+from bfcl_eval.utils import _func_doc_language_specific_pre_processing
+
+from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI
+from bfcl_eval.constants.default_prompts import (
+    DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+)
+from bfcl_eval.constants.enums import ModelStyle
+from bfcl_eval.eval_checker.eval_runner import (
+    relevance_file_runner,
+    multi_turn_runner,
+    ast_file_runner,
+)
+from bfcl_eval.eval_checker.eval_runner_helper import (
+    record_cost_latency,
+)
+from bfcl_eval.utils import (
+    is_multi_turn,
+    is_relevance_or_irrelevance,
+    find_file_by_category,
+    load_file,
+)
+
+
+# monkey patch to locate a possible answer path.
+# users are expected to set this path manually in EnvHandler.
+POSSIBLE_ANSWER_PATH = Path(
+    os.path.join(__file__, "..", "..", "..", "..", "data", "possible_answer"),
+).resolve()
+
+
+class EnvHandler:
+    """
+    A stateless standardized interface for bfcl v3 environment.
+    Interacts with environment using chat messages format.
+    This interface provides responses to assistant messages.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "env_handler",
+        answer_path: Path = POSSIBLE_ANSWER_PATH,
+    ):
+        """
+        Initialize the environment handler.
+
+        Args:
+            model_name: Name of the model to use. Defaults to "env_handler".
+        """
+        self.original_model_name = model_name
+        self.model_name = (
+            model_name.replace("/", "_").replace("-", "_").replace(".", "_")
+        )
+        self.model_style = ModelStyle.OPENAI_COMPLETIONS
+        self._answer_path = answer_path
+        if not self._answer_path.exists():
+            raise ValueError(
+                f"Answer path {self._answer_path} does not exist. Please refer\
+to README.md for more information.",
+            )
+
+    # pylint: disable=too-many-return-statements
+    def interact(
+        self,
+        messages: List[Dict[str, Any]],
+        test_entry: Dict[str, Any],
+        **_kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Process one step in the conversation.
+        Both single turn and multi turn are supported.
+
+        Args:
+            messages: List of conversation messages, with the last one being
+                assistant response
+            test_entry: Test entry containing initial_config, involved_classes,
+                question etc.
+            **kwargs: Additional arguments for compatibility
+
+        Returns:
+            Dict containing next message and tools if applicable
+        """
+        try:
+            current_turn = self._get_current_turn(messages, test_entry)
+
+            if not messages:
+                return self._handle_user_turn(test_entry, current_turn)
+
+            if messages[-1]["role"] != "assistant":
+                return self._create_error_response(
+                    "Last message must be from assistant",
+                )
+
+            if (
+                "tool_calls" in messages[-1]
+                and len(messages[-1]["tool_calls"]) > 0
+            ):
+                try:
+                    tool_calls = messages[-1]["tool_calls"]
+                    decoded_calls = (
+                        self._convert_tool_calls_to_execution_format(
+                            tool_calls,
+                        )
+                    )
+                    print(f"decoded_calls: {decoded_calls}")
+                    if is_empty_execute_response(decoded_calls):
+                        warnings.warn(
+                            f"is_empty_execute_response: \
+{is_empty_execute_response(decoded_calls)}",
+                        )
+                        return self._handle_user_turn(test_entry, current_turn)
+
+                    return self._handle_tool_calls(
+                        tool_calls,
+                        decoded_calls,
+                        test_entry,
+                        current_turn,
+                    )
+                except Exception as e:
+                    warnings.warn(f"Tool use error: {str(e)}")
+                    return self._handle_user_turn(test_entry, current_turn)
+            else:
+                return self._handle_user_turn(test_entry, current_turn)
+
+        except Exception as e:
+            return self._create_error_response(f"Request error: {str(e)}")
+
+    def _get_current_turn(
+        self,
+        messages: List[Dict[str, Any]],
+        _test_entry: Dict[str, Any],
+    ) -> int:
+        """
+        Get the current turn number in the conversation.
+
+        Args:
+            messages: List of conversation messages
+            test_entry: Test entry containing conversation data
+
+        Returns:
+            Current turn number based on user messages count
+        """
+        user_messages = [msg for msg in messages if msg["role"] == "user"]
+        return len(user_messages)
+
+    def _handle_tool_calls(
+        self,
+        tool_calls: List[Dict[str, Any]],
+        decoded_calls: list[str],
+        test_entry: Dict[str, Any],
+        _current_turn: int,
+    ) -> Dict[str, Any]:
+        """
+        Handle tool calls from assistant.
+
+        Args:
+            tool_calls: List of tool calls in OpenAI format
+            decoded_calls: List of decoded function calls
+            test_entry: Test entry containing environment data
+            current_turn: Current turn number
+
+        Returns:
+            Response containing tool execution results
+        """
+        execution_results, _ = execute_multi_turn_func_call(
+            func_call_list=decoded_calls,
+            initial_config=test_entry["initial_config"],
+            involved_classes=test_entry["involved_classes"],
+            model_name=self.model_name,
+            test_entry_id=test_entry["id"],
+            long_context=(
+                "long_context" in test_entry["id"]
+                or "composite" in test_entry["id"]
+            ),
+            is_evaL_run=False,
+        )
+
+        return self._create_tool_response(tool_calls, execution_results)
+
+    def _handle_user_turn(
+        self,
+        test_entry: Dict[str, Any],
+        current_turn: int,
+    ) -> Dict[str, Any]:
+        """
+        Handle user turn by returning appropriate content from
+        test_entry["question"].
+        For non-first turns, processes user query and tools.
+
+        Args:
+            test_entry: Test entry containing conversation data
+            current_turn: Current turn number
+
+        Returns:
+            Response containing next user message and tools
+        """
+        try:
+            current_turn_message = []
+            tools = self._compile_tools(test_entry)
+            questions = test_entry.get("question", [])
+            holdout_function = test_entry.get("holdout_function", {})
+
+            if str(current_turn) in holdout_function:
+                test_entry["function"].extend(
+                    holdout_function[str(current_turn)],
+                )
+                tools = self._compile_tools(test_entry)
+                assert (
+                    len(questions[current_turn]) == 0
+                ), "Holdout turn should not have user message."
+                default_prompt = DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
+                current_turn_message = [
+                    {
+                        "role": "user",
+                        "content": default_prompt,
+                    },
+                ]
+                return self._create_user_response(current_turn_message, tools)
+            if current_turn >= len(questions):
+                return self._create_completion_response()
+
+            current_turn_message = questions[current_turn]
+
+            return self._create_user_response(current_turn_message, tools)
+
+        except Exception as e:
+            return self._create_error_response(f"Error handling user turn: {str(e)}")
+
+    def _compile_tools(self, test_entry: dict) -> list:
+        """
+        Compile functions into tools format.
+
+        Args:
+            test_entry: Test entry containing functions
+
+        Returns:
+            List of tools in OpenAI format
+        """
+        functions: list = test_entry["function"]
+        test_category: str = test_entry["id"].rsplit("_", 1)[0]
+
+        functions = _func_doc_language_specific_pre_processing(
+            functions,
+            test_category,
+        )
+        tools = convert_to_tool(
+            functions,
+            GORILLA_TO_OPENAPI,
+            self.model_style,
+        )
+
+        return tools
+
+    def _convert_tool_calls_to_execution_format(
+        self,
+        tool_calls: List[Dict[str, Any]],
+    ) -> List[str]:
+        """
+        Convert OpenAI format tool calls to execution format.
+
+        Args:
+            tool_calls: List of tool calls in OpenAI format
+
+        Returns:
+            List of function calls in string format
+        """
+        execution_list = []
+
+        for tool_call in tool_calls:
+            function = tool_call.get("function", {})
+            function_name = function.get("name", "")
+
+            try:
+                arguments = function.get("arguments", "{}")
+                if isinstance(arguments, str):
+                    args_dict = json.loads(arguments)
+                else:
+                    args_dict = arguments
+
+                args_str = ", ".join(
+                    [f"{k}={repr(v)}" for k, v in args_dict.items()],
+                )
+                execution_list.append(f"{function_name}({args_str})")
+
+            except Exception as e:
+                execution_list.append(f"{function_name}(), {str(e)}")
+
+        return execution_list
+
+    def _create_tool_response(
+        self,
+        tool_calls: List[Dict[str, Any]],
+        execution_results: List[str],
+    ) -> Dict[str, Any]:
+        """
+        Create response for tool calls.
+
+        Args:
+            tool_calls: List of tool calls
+            execution_results: List of execution results
+
+        Returns:
+            Response containing tool execution results
+        """
+        tool_messages = []
+        for i, (tool_call, result) in enumerate(
+            zip(tool_calls, execution_results),
+        ):
+            tool_messages.append(
+                {
+                    "role": "tool",
+                    "content": result,
+                    "tool_call_id": tool_call.get("id", f"call_{i}"),
+                },
+            )
+
+        return {"messages": tool_messages}
+
+    def _create_user_response(
+        self,
+        question_turn: List[Dict[str, Any]],
+        tools: List[Dict[str, Any]],
+    ) -> Dict[str, Any]:
+        """
+        Create response containing user message.
+
+        Args:
+            question_turn: List of messages for current turn
+            tools: List of available tools
+
+        Returns:
+            Response containing user message and tools
+        """
+        user_content = ""
+        for msg in question_turn:
+            if msg["role"] == "user":
+                user_content = msg["content"]
+                break
+
+        return {
+            "messages": [{"role": "user", "content": user_content}],
+            "tools": tools,
+        }
+
+    def _create_completion_response(self) -> Dict[str, Any]:
+        """
+        Create response indicating conversation completion.
+
+        Returns:
+            Response with completion message
+        """
+        return {
+            "messages": [
+                {"role": "env", "content": "[CONVERSATION_COMPLETED]"},
+            ],
+        }
+
+    def _create_error_response(self, error_message: str) -> Dict[str, Any]:
+        """
+        Create response for error conditions.
+
+        Args:
+            error_message: Error message to include
+
+        Returns:
+            Response containing error message
+        """
+        return {
+            "messages": [
+                {"role": "env", "content": f"[ERROR] {error_message}"},
+            ],
+        }
+
+    def decode_execute(self, result):
+        """
+        Decode execute results for compatibility with evaluation framework.
+
+        Args:
+            result: Result to decode
+
+        Returns:
+            List of decoded function calls
+        """
+        return default_decode_execute_prompting(result)
+
+    def evaluate(self, test_entry: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Evaluate function for single test case.
+
+        Args:
+            test_entry: Test entry containing conversation_result and
+                original_test_entry
+            Expected format:
+            {
+                "test_id": str,
+                "messages": List[Dict],
+                "turn_count": int,
+                "total_input_tokens": int,
+                "total_output_tokens": int,
+                "completed": bool,
+                "original_test_entry": Dict
+            }
+            or directly the conversation_result dict
+
+        Returns:
+            Evaluation results in format compatible with evaluate_task
+        """
+        try:
+            conversation_result = test_entry
+            original_test_entry = conversation_result.get(
+                "original_test_entry",
+                {},
+            )
+
+            if not conversation_result or not original_test_entry:
+                return self._create_eval_error_result(
+                    "Missing conversation_result or original_test_entry",
+                    test_entry.get("test_id", "unknown"),
+                )
+
+            test_id = conversation_result.get("test_id", "unknown")
+            category = test_id.rsplit("_", 1)[0] if "_" in test_id else test_id
+
+            model_name = self.model_name
+            from bfcl_eval.model_handler.api_inference.qwen import (
+                QwenAPIHandler,
+            )
+
+            handler = QwenAPIHandler(
+                self.model_name,
+                temperature=1.0,
+            )
+
+            model_result_data = self._convert_conversation_to_eval_format(
+                conversation_result,
+                original_test_entry,
+            )
+
+            prompt_data = [original_test_entry]
+
+            state = {"leaderboard_table": {}}
+            record_cost_latency(
+                state["leaderboard_table"],
+                model_name,
+                [model_result_data],
+            )
+
+            if is_relevance_or_irrelevance(category):
+                accuracy, total_count = self._eval_relevance_test(
+                    handler,
+                    model_result_data,
+                    prompt_data,
+                    model_name,
+                    category,
+                )
+            else:
+                possible_answer_file = find_file_by_category(
+                    category,
+                    self._answer_path,
+                )
+                possible_answer = load_file(
+                    possible_answer_file,
+                    sort_by_id=True,
+                )
+                possible_answer = [
+                    item for item in possible_answer if item["id"] == test_id
+                ]
+                if is_multi_turn(category):
+                    accuracy, total_count = self._eval_multi_turn_test(
+                        handler,
+                        model_result_data,
+                        prompt_data,
+                        possible_answer,
+                        model_name,
+                        category,
+                    )
+                else:
+                    accuracy, total_count = self._eval_single_turn_test(
+                        handler,
+                        model_result_data,
+                        prompt_data,
+                        possible_answer,
+                        model_name,
+                        category,
+                    )
+            result = {
+                "valid": True,
+                "accuracy": accuracy,
+                "total_count": total_count,
+                "correct_count": int(accuracy * total_count),
+                "test_category": category,
+                "test_id": test_id,
+                "model_name": model_name,
+                "input_tokens": conversation_result.get(
+                    "total_input_tokens",
+                    0,
+                ),
+                "output_tokens": conversation_result.get(
+                    "total_output_tokens",
+                    0,
+                ),
+                "turn_count": conversation_result.get("turn_count", 0),
+                "completed": conversation_result.get("completed", False),
+            }
+
+            return result
+
+        except Exception as e:
+            import traceback
+
+            traceback.print_exc()
+            return self._create_eval_error_result(
+                f"Evaluation failed: {str(e)}",
+                test_entry.get(
+                    "test_id",
+                    test_entry.get("conversation_result", {}).get(
+                        "test_id",
+                        "unknown",
+                    ),
+                ),
+            )
+
+    def _create_eval_error_result(
+        self,
+        error_message: str,
+        test_id: str,
+    ) -> Dict[str, Any]:
+        """
+        Create standardized error result for evaluation.
+
+        Args:
+            error_message: Error message to include
+            test_id: ID of the test case
+
+        Returns:
+            Dictionary containing error result information
+        """
+        return {
+            "valid": False,
+            "error": error_message,
+            "accuracy": 0.0,
+            "total_count": 1,
+            "correct_count": 0,
+            "test_id": test_id,
+            "model_name": self.model_name,
+        }
+
+    def _eval_relevance_test(
+        self,
+        handler,
+        model_result_data,
+        prompt_data,
+        model_name,
+        test_category,
+    ):
+        """
+        Evaluate relevance/irrelevance test.
+
+        Args:
+            handler: Model handler instance
+            model_result_data: Model result data
+            prompt_data: Prompt data
+            model_name: Name of the model
+            test_category: Category of the test
+
+        Returns:
+            Tuple of (accuracy, total_count)
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            score_dir = Path(temp_dir)
+            accuracy, total_count = relevance_file_runner(
+                handler=handler,
+                model_result=[model_result_data],
+                prompt=prompt_data,
+                model_name=model_name,
+                test_category=test_category,
+                score_dir=score_dir,
+            )
+            self._capture_and_print_score_files(
+                score_dir,
+                model_name,
+                test_category,
+                "relevance",
+            )
+            return accuracy, total_count
+
+    def _eval_multi_turn_test(
+        self,
+        handler,
+        model_result_data,
+        prompt_data,
+        possible_answer,
+        model_name,
+        test_category,
+    ):
+        """
+        Evaluate multi-turn test.
+
+        Args:
+            handler: Model handler instance
+            model_result_data: Model result data
+            prompt_data: Prompt data
+            possible_answer: Possible answer data
+            model_name: Name of the model
+            test_category: Category of the test
+
+        Returns:
+            Tuple of (accuracy, total_count)
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            score_dir = Path(temp_dir)
+            accuracy, total_count = multi_turn_runner(
+                handler=handler,
+                model_result=[model_result_data],
+                prompt=prompt_data,
+                possible_answer=possible_answer,
+                model_name=model_name,
+                test_category=test_category,
+                score_dir=score_dir,
+            )
+            self._capture_and_print_score_files(
+                score_dir,
+                model_name,
+                test_category,
+                "multi_turn",
+            )
+            return accuracy, total_count
+
+    def _eval_single_turn_test(
+        self,
+        handler,
+        model_result_data,
+        prompt_data,
+        possible_answer,
+        model_name,
+        test_category,
+    ):
+        """
+        Evaluate single-turn AST test.
+
+        Args:
+            handler: Model handler instance
+            model_result_data: Model result data
+            prompt_data: Prompt data
+            possible_answer: Possible answer data
+            model_name: Name of the model
+            test_category: Category of the test
+
+        Returns:
+            Tuple of (accuracy, total_count)
+        """
+        language = "Python"
+        if "java" in test_category.lower():
+            language = "Java"
+        elif (
+            "js" in test_category.lower()
+            or "javascript" in test_category.lower()
+        ):
+            language = "JavaScript"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            score_dir = Path(temp_dir)
+            accuracy, total_count = ast_file_runner(
+                handler=handler,
+                model_result=[model_result_data],
+                prompt=prompt_data,
+                possible_answer=possible_answer,
+                language=language,
+                test_category=test_category,
+                model_name=model_name,
+                score_dir=score_dir,
+            )
+            self._capture_and_print_score_files(
+                score_dir,
+                model_name,
+                test_category,
+                "single_turn",
+            )
+            return accuracy, total_count
+
+    # pylint: disable=too-many-nested-blocks
+    def _capture_and_print_score_files(
+        self,
+        score_dir: Path,
+        _model_name: str,
+        _test_category: str,
+        _eval_type: str,
+    ):
+        """
+        Capture and print contents of score files written to score_dir.
+
+        Args:
+            score_dir: Directory containing score files
+            model_name: Name of the model
+            test_category: Category of the test
+            eval_type: Type of evaluation (relevance/multi_turn/single_turn)
+        """
+        try:
+            for file_path in score_dir.rglob("*"):
+                if file_path.is_file():
+                    try:
+                        with open(file_path, "r", encoding="utf-8") as f:
+                            content = f.read()
+
+                        if (
+                            file_path.suffix == ".json"
+                            or content.strip().startswith("{")
+                            or content.strip().startswith("[")
+                        ):
+                            try:
+                                lines = content.strip().split("\n")
+                                formatted_lines = []
+                                for line in lines:
+                                    if line.strip():
+                                        parsed = json.loads(line)
+                                        formatted_lines.append(
+                                            json.dumps(
+                                                parsed,
+                                                ensure_ascii=False,
+                                                indent=2,
+                                            ),
+                                        )
+                                content = "\n".join(formatted_lines)
+                            except json.JSONDecodeError:
+                                pass
+
+                    except UnicodeDecodeError:
+                        print(
+                            f"[Binary file, size: {file_path.stat().st_size}\
+bytes]",
+                        )
+                    except Exception as e:
+                        print(f"[Error reading file: {str(e)}]")
+
+        except Exception as e:
+            print(f"Error capturing evaluation result files: {str(e)}")
+
+    def _convert_conversation_to_eval_format(
+        self,
+        conversation_result: Dict[str, Any],
+        _original_test_entry: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """
+        Convert conversation history to evaluation format.
+
+        Args:
+            conversation_result: Result from run_conversation
+            original_test_entry: Original test entry data
+
+        Returns:
+            Data in format expected by multi_turn_runner or other runners
+        """
+        test_id = conversation_result.get("test_id", "unknown")
+        messages = conversation_result.get("messages", [])
+
+        test_category = (
+            test_id.rsplit("_", 1)[0] if "_" in test_id else test_id
+        )
+
+        if is_multi_turn(test_category):
+            turns_data = self._extract_multi_turn_responses(messages)
+        else:
+            turns_data = self._extract_single_turn_response(messages)
+
+        model_result_data = {
+            "id": test_id,
+            "result": turns_data,
+            "latency": conversation_result.get("total_latency", 0),
+            "input_token_count": conversation_result.get(
+                "total_input_tokens",
+                0,
+            ),
+            "output_token_count": conversation_result.get(
+                "total_output_tokens",
+                0,
+            ),
+        }
+
+        return model_result_data
+
+    # pylint: disable=too-many-nested-blocks
+    def _extract_multi_turn_responses(
+        self,
+        messages: List[Dict[str, Any]],
+    ) -> List[List[str]]:
+        """
+        Extract multi-turn responses from conversation messages.
+
+        Args:
+            messages: List of conversation messages
+
+        Returns:
+            List of turns, each turn is a list of function call strings
+        """
+        turns_data = []
+        current_turn_responses = []
+
+        i = 0
+        while i < len(messages):
+            message = messages[i]
+
+            if message["role"] == "user":
+                if current_turn_responses:
+                    turns_data.append(current_turn_responses)
+                    current_turn_responses = []
+
+                i += 1
+                while i < len(messages) and messages[i]["role"] == "assistant":
+                    assistant_msg = messages[i]
+
+                    if (
+                        "tool_calls" in assistant_msg
+                        and assistant_msg["tool_calls"]
+                    ):
+                        for tool_call in assistant_msg["tool_calls"]:
+                            formatted_call = (
+                                self._format_single_tool_call_for_eval(
+                                    tool_call,
+                                )
+                            )
+                            if formatted_call:
+                                current_turn_responses.append(formatted_call)
+
+                    i += 1
+
+                while i < len(messages) and messages[i]["role"] == "tool":
+                    i += 1
+            else:
+                i += 1
+
+        if current_turn_responses:
+            turns_data.append(current_turn_responses)
+
+        return turns_data
+
+    def _extract_single_turn_response(
+        self,
+        messages: List[Dict[str, Any]],
+    ) -> str:
+        """
+        Extract single-turn response from conversation messages.
+
+        Args:
+            messages: List of conversation messages
+
+        Returns:
+            String representation of the response
+        """
+        for message in reversed(messages):
+            if message["role"] == "assistant":
+                if "tool_calls" in message and message["tool_calls"]:
+                    formatted_calls = []
+                    for tool_call in message["tool_calls"]:
+                        formatted_call = (
+                            self._format_single_tool_call_for_eval(
+                                tool_call,
+                            )
+                        )
+                        if formatted_call:
+                            formatted_calls.append(formatted_call)
+                    return (
+                        "\n".join(formatted_calls) if formatted_calls else ""
+                    )
+                elif message.get("content"):
+                    return message["content"]
+
+        return ""
+
+    def _format_single_tool_call_for_eval(
+        self,
+        tool_call: Dict[str, Any],
+    ) -> str:
+        """
+        Format a single tool call into string representation for evaluation.
+
+        Args:
+            tool_call: Single tool call in OpenAI format
+
+        Returns:
+            Formatted string representation
+        """
+        function = tool_call.get("function", {})
+        function_name = function.get("name", "")
+
+        try:
+            arguments = function.get("arguments", "{}")
+            if isinstance(arguments, str):
+                args_dict = json.loads(arguments)
+            else:
+                args_dict = arguments
+
+            args_str = ", ".join(
+                [f"{k}={repr(v)}" for k, v in args_dict.items()],
+            )
+            return f"{function_name}({args_str})"
+
+        except Exception as e:
+            return f"{function_name}, {str(e)}"
+
+
+def env_step(
+    messages: List[Dict[str, Any]],
+    test_entry: Dict[str, Any],
+    model: str = "env-handler",
+    **kwargs,
+) -> Dict[str, Any]:
+    """
+    Simplified interface for environment chat completion.
+
+    Args:
+        messages: List of conversation messages
+        test_entry: Test entry containing conversation data
+        model: Model name
+        **kwargs: Additional arguments
+
+    Returns:
+        Response from environment handler
+    """
+    handler = EnvHandler(model)
+    return handler.interact(messages, test_entry, **kwargs)
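
For orientation, here is a minimal driver sketch of how the new handler is meant to be exercised. It is not part of the package: call_model is a hypothetical stand-in for whatever OpenAI-compatible chat client produces the assistant messages, and it assumes a BFCL v3 test_entry (with the id, question, function, initial_config, and involved_classes fields read by interact above) plus a local copy of the possible_answer data at the location expected by POSSIBLE_ANSWER_PATH.

# Hypothetical usage sketch; call_model is a stand-in, not a package API.
from agentscope_runtime.sandbox.box.training_box.environments.bfcl.env_handler import (
    EnvHandler,
)

def run_episode(test_entry, call_model, max_steps=20):
    handler = EnvHandler(model_name="my-policy-model")
    messages, tools = [], []
    for _ in range(max_steps):
        # The environment returns the next user turn, tool execution results,
        # or an "env" message such as [CONVERSATION_COMPLETED] / [ERROR].
        reply = handler.interact(messages, test_entry)
        messages.extend(reply["messages"])
        tools = reply.get("tools", tools)
        if any(msg["role"] == "env" for msg in reply["messages"]):
            break
        # The policy model answers, possibly with OpenAI-style tool_calls.
        messages.append(call_model(messages, tools))
    return messages

The resulting transcript can then be passed to EnvHandler.evaluate, together with the original test entry under "original_test_entry", to obtain an accuracy result in the format shown in the evaluate docstring.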