agentscope-runtime 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. agentscope_runtime/engine/agents/agentscope_agent/agent.py +105 -50
  2. agentscope_runtime/engine/agents/agentscope_agent/hooks.py +16 -3
  3. agentscope_runtime/engine/helpers/helper.py +33 -0
  4. agentscope_runtime/engine/runner.py +33 -1
  5. agentscope_runtime/engine/schemas/agent_schemas.py +208 -13
  6. agentscope_runtime/engine/services/context_manager.py +34 -1
  7. agentscope_runtime/engine/services/rag_service.py +195 -0
  8. agentscope_runtime/engine/services/reme_personal_memory_service.py +106 -0
  9. agentscope_runtime/engine/services/reme_task_memory_service.py +11 -0
  10. agentscope_runtime/sandbox/box/browser/browser_sandbox.py +25 -0
  11. agentscope_runtime/sandbox/box/sandbox.py +60 -7
  12. agentscope_runtime/sandbox/box/shared/routers/mcp_utils.py +20 -2
  13. agentscope_runtime/sandbox/box/training_box/env_service.py +1 -1
  14. agentscope_runtime/sandbox/box/training_box/environments/bfcl/bfcl_dataprocess.py +216 -0
  15. agentscope_runtime/sandbox/box/training_box/environments/bfcl/bfcl_env.py +380 -0
  16. agentscope_runtime/sandbox/box/training_box/environments/bfcl/env_handler.py +934 -0
  17. agentscope_runtime/sandbox/box/training_box/training_box.py +139 -9
  18. agentscope_runtime/sandbox/client/http_client.py +1 -1
  19. agentscope_runtime/sandbox/enums.py +2 -0
  20. agentscope_runtime/sandbox/manager/container_clients/docker_client.py +19 -9
  21. agentscope_runtime/sandbox/manager/container_clients/kubernetes_client.py +61 -6
  22. agentscope_runtime/sandbox/manager/sandbox_manager.py +95 -35
  23. agentscope_runtime/sandbox/manager/server/app.py +128 -17
  24. agentscope_runtime/sandbox/model/__init__.py +1 -5
  25. agentscope_runtime/sandbox/model/manager_config.py +2 -13
  26. agentscope_runtime/sandbox/tools/mcp_tool.py +1 -1
  27. agentscope_runtime/version.py +1 -1
  28. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/METADATA +59 -3
  29. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/RECORD +33 -27
  30. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/WHEEL +0 -0
  31. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/entry_points.txt +0 -0
  32. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/licenses/LICENSE +0 -0
  33. {agentscope_runtime-0.1.1.dist-info → agentscope_runtime-0.1.3.dist-info}/top_level.txt +0 -0
agentscope_runtime/sandbox/box/training_box/environments/bfcl/env_handler.py (new file)
@@ -0,0 +1,934 @@
+ # -*- coding: utf-8 -*-
+ import json
+ import os
+ from typing import Dict, List, Any
+ import warnings
+ import tempfile
+ from pathlib import Path
+ from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+     execute_multi_turn_func_call,
+     is_empty_execute_response,
+ )
+ from bfcl_eval.model_handler.utils import (
+     convert_to_tool,
+     default_decode_execute_prompting,
+ )
+ from bfcl_eval.utils import _func_doc_language_specific_pre_processing
+
+ from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI
+ from bfcl_eval.constants.default_prompts import (
+     DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+ )
+ from bfcl_eval.constants.enums import ModelStyle
+ from bfcl_eval.eval_checker.eval_runner import (
+     relevance_file_runner,
+     multi_turn_runner,
+     ast_file_runner,
+ )
+ from bfcl_eval.eval_checker.eval_runner_helper import (
+     record_cost_latency,
+ )
+ from bfcl_eval.utils import (
+     is_multi_turn,
+     is_relevance_or_irrelevance,
+     find_file_by_category,
+     load_file,
+ )
+
+
+ # monkey patch to locate a possible answer path.
+ # users are expected to set this path manually in EnvHandler.
+ POSSIBLE_ANSWER_PATH = Path(
+     os.path.join(__file__, "..", "..", "..", "..", "data", "possible_answer"),
+ ).resolve()
+
+
+ class EnvHandler:
+     """
+     A stateless, standardized interface for the BFCL v3 environment.
+     Interacts with the environment using the chat messages format.
+     This interface provides responses to assistant messages.
+     """
+
+     def __init__(
+         self,
+         model_name: str = "env_handler",
+         answer_path: Path = POSSIBLE_ANSWER_PATH,
+     ):
+         """
+         Initialize the environment handler.
+
+         Args:
+             model_name: Name of the model to use. Defaults to "env_handler".
+             answer_path: Directory containing the possible-answer files.
+                 Defaults to POSSIBLE_ANSWER_PATH.
+         """
+         self.original_model_name = model_name
+         self.model_name = (
+             model_name.replace("/", "_").replace("-", "_").replace(".", "_")
+         )
+         self.model_style = ModelStyle.OPENAI_COMPLETIONS
+         self._answer_path = answer_path
+         if not self._answer_path.exists():
+             raise ValueError(
+                 f"Answer path {self._answer_path} does not exist. Please refer"
+                 " to README.md for more information.",
+             )
+
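For orientation, a minimal construction sketch. The model name and answer directory below are made-up examples, not values shipped with the package:

    from pathlib import Path

    handler = EnvHandler(
        model_name="qwen2.5-72b-instruct",               # hypothetical model name
        answer_path=Path("/data/bfcl/possible_answer"),  # hypothetical local path
    )
    # The name is sanitized internally to "qwen2_5_72b_instruct".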
+     # pylint: disable=too-many-return-statements
+     def interact(
+         self,
+         messages: List[Dict[str, Any]],
+         test_entry: Dict[str, Any],
+         **_kwargs,
+     ) -> Dict[str, Any]:
+         """
+         Process one step in the conversation.
+         Both single-turn and multi-turn cases are supported.
+
+         Args:
+             messages: List of conversation messages, with the last one being
+                 the assistant response
+             test_entry: Test entry containing initial_config, involved_classes,
+                 question, etc.
+             **kwargs: Additional arguments for compatibility
+
+         Returns:
+             Dict containing the next message and tools if applicable
+         """
+         try:
+             current_turn = self._get_current_turn(messages, test_entry)
+
+             if not messages:
+                 return self._handle_user_turn(test_entry, current_turn)
+
+             if messages[-1]["role"] != "assistant":
+                 return self._create_error_response(
+                     "Last message must be from assistant",
+                 )
+
+             if (
+                 "tool_calls" in messages[-1]
+                 and len(messages[-1]["tool_calls"]) > 0
+             ):
+                 try:
+                     tool_calls = messages[-1]["tool_calls"]
+                     decoded_calls = (
+                         self._convert_tool_calls_to_execution_format(
+                             tool_calls,
+                         )
+                     )
+                     print(f"decoded_calls: {decoded_calls}")
+                     if is_empty_execute_response(decoded_calls):
+                         warnings.warn(
+                             "is_empty_execute_response: "
+                             f"{is_empty_execute_response(decoded_calls)}",
+                         )
+                         return self._handle_user_turn(test_entry, current_turn)
+
+                     return self._handle_tool_calls(
+                         tool_calls,
+                         decoded_calls,
+                         test_entry,
+                         current_turn,
+                     )
+                 except Exception as e:
+                     warnings.warn(f"Tool use error: {str(e)}")
+                     return self._handle_user_turn(test_entry, current_turn)
+             else:
+                 return self._handle_user_turn(test_entry, current_turn)
+
+         except Exception as e:
+             return self._create_error_response(f"Request error: {str(e)}")
+
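Continuing the sketch above, one environment step after the assistant has issued a tool call. All IDs, function names, and arguments are illustrative, and `test_entry` is assumed to be a loaded BFCL test case:

    response = handler.interact(
        messages=[
            {"role": "user", "content": "Create a folder named docs"},
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {
                        "id": "call_0",
                        "function": {"name": "mkdir", "arguments": '{"dir_name": "docs"}'},
                    },
                ],
            },
        ],
        test_entry=test_entry,
    )
    # Expected shape:
    # {"messages": [{"role": "tool", "content": "...", "tool_call_id": "call_0"}]}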
+     def _get_current_turn(
+         self,
+         messages: List[Dict[str, Any]],
+         _test_entry: Dict[str, Any],
+     ) -> int:
+         """
+         Get the current turn number in the conversation.
+
+         Args:
+             messages: List of conversation messages
+             test_entry: Test entry containing conversation data
+
+         Returns:
+             Current turn number based on user messages count
+         """
+         user_messages = [msg for msg in messages if msg["role"] == "user"]
+         return len(user_messages)
+
+     def _handle_tool_calls(
+         self,
+         tool_calls: List[Dict[str, Any]],
+         decoded_calls: list[str],
+         test_entry: Dict[str, Any],
+         _current_turn: int,
+     ) -> Dict[str, Any]:
+         """
+         Handle tool calls from assistant.
+
+         Args:
+             tool_calls: List of tool calls in OpenAI format
+             decoded_calls: List of decoded function calls
+             test_entry: Test entry containing environment data
+             current_turn: Current turn number
+
+         Returns:
+             Response containing tool execution results
+         """
+         execution_results, _ = execute_multi_turn_func_call(
+             func_call_list=decoded_calls,
+             initial_config=test_entry["initial_config"],
+             involved_classes=test_entry["involved_classes"],
+             model_name=self.model_name,
+             test_entry_id=test_entry["id"],
+             long_context=(
+                 "long_context" in test_entry["id"]
+                 or "composite" in test_entry["id"]
+             ),
+             is_evaL_run=False,
+         )
+
+         return self._create_tool_response(tool_calls, execution_results)
+
+     def _handle_user_turn(
+         self,
+         test_entry: Dict[str, Any],
+         current_turn: int,
+     ) -> Dict[str, Any]:
+         """
+         Handle a user turn by returning the appropriate content from
+         test_entry["question"].
+         For non-first turns, processes the user query and tools.
+
+         Args:
+             test_entry: Test entry containing conversation data
+             current_turn: Current turn number
+
+         Returns:
+             Response containing next user message and tools
+         """
+         try:
+             current_turn_message = []
+             tools = self._compile_tools(test_entry)
+             questions = test_entry.get("question", [])
+             holdout_function = test_entry.get("holdout_function", {})
+
+             if str(current_turn) in holdout_function:
+                 test_entry["function"].extend(
+                     holdout_function[str(current_turn)],
+                 )
+                 tools = self._compile_tools(test_entry)
+                 assert (
+                     len(questions[current_turn]) == 0
+                 ), "Holdout turn should not have user message."
+                 default_prompt = DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
+                 current_turn_message = [
+                     {
+                         "role": "user",
+                         "content": default_prompt,
+                     },
+                 ]
+                 return self._create_user_response(current_turn_message, tools)
+             if current_turn >= len(questions):
+                 return self._create_completion_response()
+
+             current_turn_message = questions[current_turn]
+
+             return self._create_user_response(current_turn_message, tools)
+
+         except Exception as e:
+             return self._create_error_response(
+                 f"Error while handling user turn: {str(e)}",
+             )
+
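For reference, an illustrative test_entry shape implied by the lookups above. The field names come from the code; every value is invented for illustration and does not reflect real benchmark data:

    test_entry = {
        "id": "multi_turn_base_0",
        "involved_classes": ["GorillaFileSystem"],
        "initial_config": {"GorillaFileSystem": {}},
        "function": [{"name": "mkdir", "description": "...", "parameters": {}}],
        "question": [
            [{"role": "user", "content": "Create a folder named docs"}],
            [],  # a holdout turn carries no user message
        ],
        "holdout_function": {
            "1": [{"name": "rmdir", "description": "...", "parameters": {}}],
        },
    }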
+     def _compile_tools(self, test_entry: dict) -> list:
+         """
+         Compile functions into tools format.
+
+         Args:
+             test_entry: Test entry containing functions
+
+         Returns:
+             List of tools in OpenAI format
+         """
+         functions: list = test_entry["function"]
+         test_category: str = test_entry["id"].rsplit("_", 1)[0]
+
+         functions = _func_doc_language_specific_pre_processing(
+             functions,
+             test_category,
+         )
+         tools = convert_to_tool(
+             functions,
+             GORILLA_TO_OPENAPI,
+             self.model_style,
+         )
+
+         return tools
+
+     def _convert_tool_calls_to_execution_format(
+         self,
+         tool_calls: List[Dict[str, Any]],
+     ) -> List[str]:
+         """
+         Convert OpenAI format tool calls to execution format.
+
+         Args:
+             tool_calls: List of tool calls in OpenAI format
+
+         Returns:
+             List of function calls in string format
+         """
+         execution_list = []
+
+         for tool_call in tool_calls:
+             function = tool_call.get("function", {})
+             function_name = function.get("name", "")
+
+             try:
+                 arguments = function.get("arguments", "{}")
+                 if isinstance(arguments, str):
+                     args_dict = json.loads(arguments)
+                 else:
+                     args_dict = arguments
+
+                 args_str = ", ".join(
+                     [f"{k}={repr(v)}" for k, v in args_dict.items()],
+                 )
+                 execution_list.append(f"{function_name}({args_str})")
+
+             except Exception as e:
+                 execution_list.append(f"{function_name}(), {str(e)}")
+
+         return execution_list
+
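A small illustration of the conversion performed above; the tool call is made up:

    call = {
        "id": "call_0",
        "function": {"name": "cd", "arguments": '{"folder": "docs"}'},
    }
    handler._convert_tool_calls_to_execution_format([call])
    # -> ["cd(folder='docs')"]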
+     def _create_tool_response(
+         self,
+         tool_calls: List[Dict[str, Any]],
+         execution_results: List[str],
+     ) -> Dict[str, Any]:
+         """
+         Create response for tool calls.
+
+         Args:
+             tool_calls: List of tool calls
+             execution_results: List of execution results
+
+         Returns:
+             Response containing tool execution results
+         """
+         tool_messages = []
+         for i, (tool_call, result) in enumerate(
+             zip(tool_calls, execution_results),
+         ):
+             tool_messages.append(
+                 {
+                     "role": "tool",
+                     "content": result,
+                     "tool_call_id": tool_call.get("id", f"call_{i}"),
+                 },
+             )
+
+         return {"messages": tool_messages}
+
+     def _create_user_response(
+         self,
+         question_turn: List[Dict[str, Any]],
+         tools: List[Dict[str, Any]],
+     ) -> Dict[str, Any]:
+         """
+         Create response containing user message.
+
+         Args:
+             question_turn: List of messages for current turn
+             tools: List of available tools
+
+         Returns:
+             Response containing user message and tools
+         """
+         user_content = ""
+         for msg in question_turn:
+             if msg["role"] == "user":
+                 user_content = msg["content"]
+                 break
+
+         return {
+             "messages": [{"role": "user", "content": user_content}],
+             "tools": tools,
+         }
+
+     def _create_completion_response(self) -> Dict[str, Any]:
+         """
+         Create response indicating conversation completion.
+
+         Returns:
+             Response with completion message
+         """
+         return {
+             "messages": [
+                 {"role": "env", "content": "[CONVERSATION_COMPLETED]"},
+             ],
+         }
+
+     def _create_error_response(self, error_message: str) -> Dict[str, Any]:
+         """
+         Create response for error conditions.
+
+         Args:
+             error_message: Error message to include
+
+         Returns:
+             Response containing error message
+         """
+         return {
+             "messages": [
+                 {"role": "env", "content": f"[ERROR] {error_message}"},
+             ],
+         }
+
+     def decode_execute(self, result):
+         """
+         Decode execute results for compatibility with evaluation framework.
+
+         Args:
+             result: Result to decode
+
+         Returns:
+             List of decoded function calls
+         """
+         return default_decode_execute_prompting(result)
+
+     def evaluate(self, test_entry: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Evaluate function for single test case.
+
+         Args:
+             test_entry: Test entry containing conversation_result and
+                 original_test_entry
+                 Expected format:
+                 {
+                     "test_id": str,
+                     "messages": List[Dict],
+                     "turn_count": int,
+                     "total_input_tokens": int,
+                     "total_output_tokens": int,
+                     "completed": bool,
+                     "original_test_entry": Dict
+                 }
+                 or directly the conversation_result dict
+
+         Returns:
+             Evaluation results in format compatible with evaluate_task
+         """
+         try:
+             conversation_result = test_entry
+             original_test_entry = conversation_result.get(
+                 "original_test_entry",
+                 {},
+             )
+
+             if not conversation_result or not original_test_entry:
+                 return self._create_eval_error_result(
+                     "Missing conversation_result or original_test_entry",
+                     test_entry.get("test_id", "unknown"),
+                 )
+
+             test_id = conversation_result.get("test_id", "unknown")
+             category = test_id.rsplit("_", 1)[0] if "_" in test_id else test_id
+
+             model_name = self.model_name
+             from bfcl_eval.model_handler.api_inference.qwen import (
+                 QwenAPIHandler,
+             )
+
+             handler = QwenAPIHandler(
+                 self.model_name,
+                 temperature=1.0,
+             )
+
+             model_result_data = self._convert_conversation_to_eval_format(
+                 conversation_result,
+                 original_test_entry,
+             )
+
+             prompt_data = [original_test_entry]
+
+             state = {"leaderboard_table": {}}
+             record_cost_latency(
+                 state["leaderboard_table"],
+                 model_name,
+                 [model_result_data],
+             )
+
+             if is_relevance_or_irrelevance(category):
+                 accuracy, total_count = self._eval_relevance_test(
+                     handler,
+                     model_result_data,
+                     prompt_data,
+                     model_name,
+                     category,
+                 )
+             else:
+                 possible_answer_file = find_file_by_category(
+                     category,
+                     self._answer_path,
+                 )
+                 possible_answer = load_file(
+                     possible_answer_file,
+                     sort_by_id=True,
+                 )
+                 possible_answer = [
+                     item for item in possible_answer if item["id"] == test_id
+                 ]
+                 if is_multi_turn(category):
+                     accuracy, total_count = self._eval_multi_turn_test(
+                         handler,
+                         model_result_data,
+                         prompt_data,
+                         possible_answer,
+                         model_name,
+                         category,
+                     )
+                 else:
+                     accuracy, total_count = self._eval_single_turn_test(
+                         handler,
+                         model_result_data,
+                         prompt_data,
+                         possible_answer,
+                         model_name,
+                         category,
+                     )
+             result = {
+                 "valid": True,
+                 "accuracy": accuracy,
+                 "total_count": total_count,
+                 "correct_count": int(accuracy * total_count),
+                 "test_category": category,
+                 "test_id": test_id,
+                 "model_name": model_name,
+                 "input_tokens": conversation_result.get(
+                     "total_input_tokens",
+                     0,
+                 ),
+                 "output_tokens": conversation_result.get(
+                     "total_output_tokens",
+                     0,
+                 ),
+                 "turn_count": conversation_result.get("turn_count", 0),
+                 "completed": conversation_result.get("completed", False),
+             }
+
+             return result
+
+         except Exception as e:
+             import traceback
+
+             traceback.print_exc()
+             return self._create_eval_error_result(
+                 f"Evaluation failed: {str(e)}",
+                 test_entry.get(
+                     "test_id",
+                     test_entry.get("conversation_result", {}).get(
+                         "test_id",
+                         "unknown",
+                     ),
+                 ),
+             )
+
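A sketch of an evaluate() call for a finished conversation. The field names follow the docstring above; the values, `finished_messages`, and `test_entry` are illustrative placeholders, not real benchmark data:

    report = handler.evaluate(
        {
            "test_id": "multi_turn_base_0",
            "messages": finished_messages,
            "turn_count": 3,
            "total_input_tokens": 1200,
            "total_output_tokens": 340,
            "completed": True,
            "original_test_entry": test_entry,
        },
    )
    # report["valid"], report["accuracy"], report["test_category"], ...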
+     def _create_eval_error_result(
+         self,
+         error_message: str,
+         test_id: str,
+     ) -> Dict[str, Any]:
+         """
+         Create standardized error result for evaluation.
+
+         Args:
+             error_message: Error message to include
+             test_id: ID of the test case
+
+         Returns:
+             Dictionary containing error result information
+         """
+         return {
+             "valid": False,
+             "error": error_message,
+             "accuracy": 0.0,
+             "total_count": 1,
+             "correct_count": 0,
+             "test_id": test_id,
+             "model_name": self.model_name,
+         }
+
+     def _eval_relevance_test(
+         self,
+         handler,
+         model_result_data,
+         prompt_data,
+         model_name,
+         test_category,
+     ):
+         """
+         Evaluate relevance/irrelevance test.
+
+         Args:
+             handler: Model handler instance
+             model_result_data: Model result data
+             prompt_data: Prompt data
+             model_name: Name of the model
+             test_category: Category of the test
+
+         Returns:
+             Tuple of (accuracy, total_count)
+         """
+         with tempfile.TemporaryDirectory() as temp_dir:
+             score_dir = Path(temp_dir)
+             accuracy, total_count = relevance_file_runner(
+                 handler=handler,
+                 model_result=[model_result_data],
+                 prompt=prompt_data,
+                 model_name=model_name,
+                 test_category=test_category,
+                 score_dir=score_dir,
+             )
+             self._capture_and_print_score_files(
+                 score_dir,
+                 model_name,
+                 test_category,
+                 "relevance",
+             )
+             return accuracy, total_count
+
+     def _eval_multi_turn_test(
+         self,
+         handler,
+         model_result_data,
+         prompt_data,
+         possible_answer,
+         model_name,
+         test_category,
+     ):
+         """
+         Evaluate multi-turn test.
+
+         Args:
+             handler: Model handler instance
+             model_result_data: Model result data
+             prompt_data: Prompt data
+             possible_answer: Possible answer data
+             model_name: Name of the model
+             test_category: Category of the test
+
+         Returns:
+             Tuple of (accuracy, total_count)
+         """
+         with tempfile.TemporaryDirectory() as temp_dir:
+             score_dir = Path(temp_dir)
+             accuracy, total_count = multi_turn_runner(
+                 handler=handler,
+                 model_result=[model_result_data],
+                 prompt=prompt_data,
+                 possible_answer=possible_answer,
+                 model_name=model_name,
+                 test_category=test_category,
+                 score_dir=score_dir,
+             )
+             self._capture_and_print_score_files(
+                 score_dir,
+                 model_name,
+                 test_category,
+                 "multi_turn",
+             )
+             return accuracy, total_count
+
+     def _eval_single_turn_test(
+         self,
+         handler,
+         model_result_data,
+         prompt_data,
+         possible_answer,
+         model_name,
+         test_category,
+     ):
+         """
+         Evaluate single-turn AST test.
+
+         Args:
+             handler: Model handler instance
+             model_result_data: Model result data
+             prompt_data: Prompt data
+             possible_answer: Possible answer data
+             model_name: Name of the model
+             test_category: Category of the test
+
+         Returns:
+             Tuple of (accuracy, total_count)
+         """
+         language = "Python"
+         # Check JavaScript first: "java" is a substring of "javascript".
+         if (
+             "js" in test_category.lower()
+             or "javascript" in test_category.lower()
+         ):
+             language = "JavaScript"
+         elif "java" in test_category.lower():
+             language = "Java"
+
+         with tempfile.TemporaryDirectory() as temp_dir:
+             score_dir = Path(temp_dir)
+             accuracy, total_count = ast_file_runner(
+                 handler=handler,
+                 model_result=[model_result_data],
+                 prompt=prompt_data,
+                 possible_answer=possible_answer,
+                 language=language,
+                 test_category=test_category,
+                 model_name=model_name,
+                 score_dir=score_dir,
+             )
+             self._capture_and_print_score_files(
+                 score_dir,
+                 model_name,
+                 test_category,
+                 "single_turn",
+             )
+             return accuracy, total_count
+
+     # pylint: disable=too-many-nested-blocks
+     def _capture_and_print_score_files(
+         self,
+         score_dir: Path,
+         _model_name: str,
+         _test_category: str,
+         _eval_type: str,
+     ):
+         """
+         Capture and print contents of score files written to score_dir.
+
+         Args:
+             score_dir: Directory containing score files
+             model_name: Name of the model
+             test_category: Category of the test
+             eval_type: Type of evaluation (relevance/multi_turn/single_turn)
+         """
+         try:
+             for file_path in score_dir.rglob("*"):
+                 if file_path.is_file():
+                     try:
+                         with open(file_path, "r", encoding="utf-8") as f:
+                             content = f.read()
+
+                         if (
+                             file_path.suffix == ".json"
+                             or content.strip().startswith("{")
+                             or content.strip().startswith("[")
+                         ):
+                             try:
+                                 lines = content.strip().split("\n")
+                                 formatted_lines = []
+                                 for line in lines:
+                                     if line.strip():
+                                         parsed = json.loads(line)
+                                         formatted_lines.append(
+                                             json.dumps(
+                                                 parsed,
+                                                 ensure_ascii=False,
+                                                 indent=2,
+                                             ),
+                                         )
+                                 content = "\n".join(formatted_lines)
+                             except json.JSONDecodeError:
+                                 pass
+
+                         # Emit the (possibly pretty-printed) score file content.
+                         print(content)
+
+                     except UnicodeDecodeError:
+                         print(
+                             f"[Binary file, size: {file_path.stat().st_size} "
+                             "bytes]",
+                         )
+                     except Exception as e:
+                         print(f"[Error reading file: {str(e)}]")
+
+         except Exception as e:
+             print(f"Error capturing evaluation result files: {str(e)}")
+
+     def _convert_conversation_to_eval_format(
+         self,
+         conversation_result: Dict[str, Any],
+         _original_test_entry: Dict[str, Any],
+     ) -> Dict[str, Any]:
+         """
+         Convert conversation history to evaluation format.
+
+         Args:
+             conversation_result: Result from run_conversation
+             original_test_entry: Original test entry data
+
+         Returns:
+             Data in format expected by multi_turn_runner or other runners
+         """
+         test_id = conversation_result.get("test_id", "unknown")
+         messages = conversation_result.get("messages", [])
+
+         test_category = (
+             test_id.rsplit("_", 1)[0] if "_" in test_id else test_id
+         )
+
+         if is_multi_turn(test_category):
+             turns_data = self._extract_multi_turn_responses(messages)
+         else:
+             turns_data = self._extract_single_turn_response(messages)
+
+         model_result_data = {
+             "id": test_id,
+             "result": turns_data,
+             "latency": conversation_result.get("total_latency", 0),
+             "input_token_count": conversation_result.get(
+                 "total_input_tokens",
+                 0,
+             ),
+             "output_token_count": conversation_result.get(
+                 "total_output_tokens",
+                 0,
+             ),
+         }
+
+         return model_result_data
+
+     # pylint: disable=too-many-nested-blocks
+     def _extract_multi_turn_responses(
+         self,
+         messages: List[Dict[str, Any]],
+     ) -> List[List[str]]:
+         """
+         Extract multi-turn responses from conversation messages.
+
+         Args:
+             messages: List of conversation messages
+
+         Returns:
+             List of turns, each turn is a list of function call strings
+         """
+         turns_data = []
+         current_turn_responses = []
+
+         i = 0
+         while i < len(messages):
+             message = messages[i]
+
+             if message["role"] == "user":
+                 if current_turn_responses:
+                     turns_data.append(current_turn_responses)
+                     current_turn_responses = []
+
+                 i += 1
+                 while i < len(messages) and messages[i]["role"] == "assistant":
+                     assistant_msg = messages[i]
+
+                     if (
+                         "tool_calls" in assistant_msg
+                         and assistant_msg["tool_calls"]
+                     ):
+                         for tool_call in assistant_msg["tool_calls"]:
+                             formatted_call = (
+                                 self._format_single_tool_call_for_eval(
+                                     tool_call,
+                                 )
+                             )
+                             if formatted_call:
+                                 current_turn_responses.append(formatted_call)
+
+                     i += 1
+
+                     while i < len(messages) and messages[i]["role"] == "tool":
+                         i += 1
+             else:
+                 i += 1
+
+         if current_turn_responses:
+             turns_data.append(current_turn_responses)
+
+         return turns_data
+
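For illustration, the grouping the loop above produces on a tiny invented conversation (calling the private helper directly just to show the result):

    messages = [
        {"role": "user", "content": "turn 1"},
        {"role": "assistant", "tool_calls": [
            {"function": {"name": "ls", "arguments": "{}"}},
        ]},
        {"role": "tool", "content": "...", "tool_call_id": "call_0"},
        {"role": "assistant", "content": "done"},
        {"role": "user", "content": "turn 2"},
        {"role": "assistant", "tool_calls": [
            {"function": {"name": "pwd", "arguments": "{}"}},
        ]},
    ]
    handler._extract_multi_turn_responses(messages)
    # -> [["ls()"], ["pwd()"]]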
+     def _extract_single_turn_response(
+         self,
+         messages: List[Dict[str, Any]],
+     ) -> str:
+         """
+         Extract single-turn response from conversation messages.
+
+         Args:
+             messages: List of conversation messages
+
+         Returns:
+             String representation of the response
+         """
+         for message in reversed(messages):
+             if message["role"] == "assistant":
+                 if "tool_calls" in message and message["tool_calls"]:
+                     formatted_calls = []
+                     for tool_call in message["tool_calls"]:
+                         formatted_call = (
+                             self._format_single_tool_call_for_eval(
+                                 tool_call,
+                             )
+                         )
+                         if formatted_call:
+                             formatted_calls.append(formatted_call)
+                     return (
+                         "\n".join(formatted_calls) if formatted_calls else ""
+                     )
+                 elif message.get("content"):
+                     return message["content"]
+
+         return ""
+
+     def _format_single_tool_call_for_eval(
+         self,
+         tool_call: Dict[str, Any],
+     ) -> str:
+         """
+         Format a single tool call into string representation for evaluation.
+
+         Args:
+             tool_call: Single tool call in OpenAI format
+
+         Returns:
+             Formatted string representation
+         """
+         function = tool_call.get("function", {})
+         function_name = function.get("name", "")
+
+         try:
+             arguments = function.get("arguments", "{}")
+             if isinstance(arguments, str):
+                 args_dict = json.loads(arguments)
+             else:
+                 args_dict = arguments
+
+             args_str = ", ".join(
+                 [f"{k}={repr(v)}" for k, v in args_dict.items()],
+             )
+             return f"{function_name}({args_str})"
+
+         except Exception as e:
+             return f"{function_name}, {str(e)}"
+
+
+ def env_step(
+     messages: List[Dict[str, Any]],
+     test_entry: Dict[str, Any],
+     model: str = "env-handler",
+     **kwargs,
+ ) -> Dict[str, Any]:
+     """
+     Simplified interface for environment chat completion.
+
+     Args:
+         messages: List of conversation messages
+         test_entry: Test entry containing conversation data
+         model: Model name
+         **kwargs: Additional arguments
+
+     Returns:
+         Response from environment handler
+     """
+     handler = EnvHandler(model)
+     return handler.interact(messages, test_entry, **kwargs)
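To tie the pieces together, a minimal driver loop sketched against the interface above. `call_model` stands in for whatever LLM client produces the assistant messages and is not part of this package; `test_entry` is assumed to be a loaded BFCL test case, and the default possible-answer path must exist for EnvHandler construction to succeed:

    messages: list = []
    while True:
        step = env_step(messages, test_entry)             # environment's move
        env_msgs = step["messages"]
        if env_msgs and env_msgs[0].get("role") == "env":
            break                                         # completed or error
        messages.extend(env_msgs)
        assistant_msg = call_model(messages, step.get("tools", []))  # hypothetical client
        messages.append(assistant_msg)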