evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0

evalscope/evaluator/evaluator.py

@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
-from evalscope.report import Report, gen_report_table
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -108,7 +108,6 @@ class Evaluator(object):
         return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        answers_list = []
         try:
             # get answer from model
             answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
@@ -117,10 +116,11 @@ class Evaluator(object):
             # if ignore_errors is True, continue to next input
             if self.task_cfg.ignore_errors:
                 logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return answers_list
+                return []
             else:
                 raise e
         # process answer
+        answers_list = []
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -399,8 +399,9 @@ class Evaluator(object):
 
         # Make table
         try:
-            report_table = gen_report_table(report_map)
-            logger.info(f'{self.dataset_name_or_path} report table: \n{report_table} \n')
+            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
+            logger.info(f'\n{self.dataset_name_or_path} report table:'
+                        f'\n{report_table} \n')
         except Exception:
             logger.error('Failed to generate report table.')
 
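The evaluator hunk above swaps the removed `gen_report_table(report_map)` helper for `gen_table`, which takes a list of reports plus an `add_overall_metric` flag. A minimal migration sketch, assuming `gen_table` still returns a printable table string the way the old helper did:

```python
# Hypothetical caller-side sketch; assumes gen_table returns a printable
# table string like the removed gen_report_table did.
from evalscope.report import gen_table


def log_report(report_map, dataset_name, logger):
    # gen_table expects a list of reports, so a single report is wrapped.
    table = gen_table(report_list=[report_map], add_overall_metric=True)
    logger.info(f'\n{dataset_name} report table:\n{table}\n')
```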

evalscope/metrics/t2v_metrics/__init__.py

@@ -1,66 +1,52 @@
-from __future__ import absolute_import, division, print_function
-
-from .clipscore import CLIPScore, list_all_clipscore_models
-from .constants import CACHE_DIR
-from .itmscore import ITMScore, list_all_itmscore_models
-from .vqascore import VQAScore, list_all_vqascore_models
-
-
-def list_all_models():
-    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
-
-
-def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
-    if model in list_all_vqascore_models():
-        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_clipscore_models():
-        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_itmscore_models():
-        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    else:
-        raise NotImplementedError()
-
-
 def clip_flant5_score():
+    from .vqascore import VQAScore
     clip_flant5_score = VQAScore(model='clip-flant5-xxl')
     return clip_flant5_score
 
 
 def pick_score():
+    from .clipscore import CLIPScore
     pick_score = CLIPScore(model='pickscore-v1')
     return pick_score
 
 
 def clip_score():
+    from .clipscore import CLIPScore
     clip_score = CLIPScore(model='openai:ViT-L-14-336')
     return clip_score
 
 
 def blip2_score():
+    from .itmscore import ITMScore
     blip_itm_score = ITMScore(model='blip2-itm')
     return blip_itm_score
 
 
 def hpsv2_score():
+    from .clipscore import CLIPScore
     hpsv2_score = CLIPScore(model='hpsv2')
     return hpsv2_score
 
 
 def hpsv2_1_score():
+    from .clipscore import CLIPScore
     hpsv2_1_score = CLIPScore(model='hpsv2.1')
     return hpsv2_1_score
 
 
 def image_reward_score():
+    from .itmscore import ITMScore
     image_reward_score = ITMScore(model='image-reward-v1')
     return image_reward_score
 
 
 def fga_blip2_score():
+    from .itmscore import ITMScore
     fga_blip2_score = ITMScore(model='fga_blip2')
     return fga_blip2_score
 
 
 def mps_score():
+    from .clipscore import CLIPScore
     mps_score = CLIPScore(model='mps')
     return mps_score
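The `t2v_metrics` rewrite above drops the module-level `CLIPScore`/`ITMScore`/`VQAScore` imports (along with the `list_all_models` and `get_score_model` helpers) and moves each backend import inside its factory function, so importing the package no longer loads every scorer dependency up front. A usage sketch under that reading (it assumes the optional dependencies for the requested scorer are installed):

```python
# Only the backend needed for the requested metric is imported, and only
# when its factory function is actually called.
from evalscope.metrics.t2v_metrics import clip_score

scorer = clip_score()  # triggers `from .clipscore import CLIPScore` at call time
# Factories that are never called (e.g. image_reward_score) never import
# their ITMScore backend.
```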

evalscope/models/adapters/__init__.py

@@ -1,4 +1,5 @@
 from .base_adapter import BaseModelAdapter, initialize_model_adapter
+from .bfcl_adapter import BFCLAdapter
 from .chat_adapter import ChatGenerationModelAdapter
 from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
 from .custom_adapter import CustomModelAdapter
@@ -13,5 +14,6 @@ __all__ = [
     'MultiChoiceModelAdapter',
     'CustomModelAdapter',
     'ServerModelAdapter',
+    'BFCLAdapter',
     'T2IModelAdapter',
 ]

evalscope/models/adapters/base_adapter.py

@@ -44,35 +44,39 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-        from evalscope.models import ServerModelAdapter
-
-        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
-            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
-                           'Setting output type to generation by default.')
-            benchmark.model_adapter = OutputType.GENERATION
-
-        return ServerModelAdapter(
-            api_url=task_cfg.api_url,
-            model_id=task_cfg.model,
-            api_key=task_cfg.api_key,
-            seed=task_cfg.seed,
-            timeout=task_cfg.timeout,
-            stream=task_cfg.stream,
-        )
     else:
         from ..register import get_model_adapter
 
-        # for local model, we need to determine the model adapter class based on the output type
+        # we need to determine the model adapter class based on the output type
         model_adapter_cls_str = benchmark.model_adapter
-        if model_adapter_cls_str not in benchmark.output_types:
-            logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
-                           f'Using {benchmark.output_types[0]} instead.')
-            model_adapter_cls_str = benchmark.output_types[0]
 
-        model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-        return model_adapter_cls(
-            model=base_model,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template,
-            task_cfg=task_cfg)
+        if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
+
+            if 'server' not in model_adapter_cls_str:
+                model_adapter_cls_str = 'server'
+                logger.info(
+                    f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
+
+            # init server model adapter
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+
+            return model_adapter_cls(
+                api_url=task_cfg.api_url,
+                model_id=task_cfg.model,
+                api_key=task_cfg.api_key,
+                seed=task_cfg.seed,
+                timeout=task_cfg.timeout,
+                stream=task_cfg.stream,
+            )
+        else:
+            if model_adapter_cls_str not in benchmark.output_types:
+                logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
+                               f'Using {benchmark.output_types[0]} instead.')
+                model_adapter_cls_str = benchmark.output_types[0]
+
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+            return model_adapter_cls(
+                model=base_model,
+                generation_config=task_cfg.generation_config,
+                chat_template=task_cfg.chat_template,
+                task_cfg=task_cfg)
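With the rewrite above, service evaluation no longer instantiates `ServerModelAdapter` directly: the adapter class is resolved through the same `get_model_adapter` registry used for local models, and the lookup key is forced to `'server'` unless the benchmark's `model_adapter` string already names a server-style adapter, presumably so benchmarks such as the new BFCL one can request the `BFCLAdapter` registered in this release. A sketch of that dispatch shape with hypothetical names (`ADAPTER_REGISTRY`, `register_adapter`, and `get_adapter` are illustrative, not evalscope's real register module):

```python
# Hypothetical registry-based adapter dispatch; names are illustrative only.
ADAPTER_REGISTRY: dict = {}


def register_adapter(name: str):
    def decorator(cls):
        ADAPTER_REGISTRY[name] = cls
        return cls

    return decorator


def get_adapter(name: str):
    return ADAPTER_REGISTRY[name]


@register_adapter('server')
class ServerAdapter:
    def __init__(self, api_url: str, model_id: str, **kwargs):
        self.api_url, self.model_id = api_url, model_id


@register_adapter('bfcl_server')
class BFCLServerAdapter(ServerAdapter):
    pass  # benchmark-specific request/response handling would live here


# Service evaluation: keep the benchmark's adapter key if it already names a
# server-style adapter, otherwise fall back to the plain 'server' adapter.
requested = 'bfcl_server'
key = requested if 'server' in requested else 'server'
adapter = get_adapter(key)(api_url='http://localhost:8000/v1', model_id='my-model')
```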

evalscope/models/adapters/bfcl_adapter.py

@@ -0,0 +1,244 @@
+import json
+import time
+import uuid
+from typing import Any, List, Optional, Union
+
+from evalscope.utils.logger import get_logger
+from .server_adapter import ServerModelAdapter
+
+logger = get_logger()
+
+
+class BFCLAdapter(ServerModelAdapter):
+    """
+    BFCL model adapter to request remote API model and generate results for BFCL evaluation.
+    Support multi-turn and single-turn function calling tasks.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+        """
+        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+        where each list is a follow up turn in the conversation
+        each turn is a List[List[Message]]
+
+        Args:
+            inputs (List[dict]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (List[dict]): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+        results = []
+
+        for input_item in inputs:
+            # This flag decides if we pass tools to the API or try tool calling via prompting
+            # Passing tools to the API means that we rely on the API to manage system prompt specifics
+            # and also expect parsed tool calls in the ChatCompletionMessage object
+            # This is how the is_fc_model=True benchmark is designed to work
+            # On the other hand, we try to manage
+            # tool calling via prompting and parse tool calls in the standard text response
+            # This is how the is_fc_model=False benchmark is designed to work
+            row = input_item.get('messages')
+            is_fc_model = row.get('is_fc_model', False)
+
+            if is_fc_model:
+                response = self.generate_turn_with_tools(row, infer_cfg)
+            else:
+                response = self.generate_turn(row, infer_cfg)
+
+            # wrap response with openai types
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.model_id,
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }
+            results.append(res_d)
+
+        return results
+
+    def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+                                                         MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                new_turn = [{
+                    'role':
+                    'user',
+                    'content':
+                    DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                        functions=row['missing_functions'][str(turn_idx)]),
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                result = responses['choices'][0]['message']['content']
+
+                logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+                current_messages.append({
+                    'role': 'assistant',
+                    'content': result,
+                })
+                current_responses.append(result)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools:
+                    try:
+                        tool_calls = default_decode_execute_prompting(result)
+                    except Exception:
+                        tool_calls = None
+
+                    if tool_calls is None:
+                        break
+
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_calls,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+                    # Append tool outputs to the current messages
+                    tool_results = []
+                    for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                        tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                    current_messages.append({
+                        'role': 'user',
+                        'content': repr(tool_results),
+                    })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
+
+    def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                                                         MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import convert_to_function_call
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+            tools = row['tools']
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                # inject new functions on the fly
+                new_tools = row['missing_functions'][str(turn_idx)]
+                for new_tool in new_tools:
+                    tools.append({
+                        'type': 'function',
+                        'function': new_tool[0],
+                    })
+                new_turn = [{
+                    'role': 'user',
+                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                    'tools': tools,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                message = responses['choices'][0]['message']
+
+                current_messages.append(message)
+                if isinstance(message, str):
+                    model_responses = [message]
+                    tool_call_strs = None
+                elif message.get('tool_calls'):
+                    model_responses = [{
+                        tc['function']['name']: tc['function']['arguments']
+                    } for tc in message['tool_calls']]
+                    try:
+                        tool_call_strs = convert_to_function_call(model_responses)
+                    except Exception as e:
+                        logger.error(f'Error converting tool calls to function call strings: {e}')
+                        tool_call_strs = None
+                else:
+                    model_responses = [message['content']]
+                    tool_call_strs = None
+
+                current_responses.extend(model_responses)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools and tool_call_strs is not None:
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_call_strs,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+
+                    for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
+                        current_messages.append({
+                            'role': 'tool',
+                            'tool_call_id': tc['id'],
+                            'content': json.dumps({'response': tool_output}),
+                        })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
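The new `BFCLAdapter` above drives the whole conversation itself: for each turn it keeps calling the model (passing `tools` in the request for `is_fc_model` entries, or parsing tool calls out of plain text otherwise), executes decoded calls through `bfcl_eval`'s `execute_multi_turn_func_call`, appends the tool results as new messages, and stops once no tool call is produced or `MAXIMUM_STEP_LIMIT` is exceeded. Each dataset item then comes back wrapped in an OpenAI-style `chat.completion` dict whose `content` holds the per-turn response lists. A hedged sketch of unwrapping that envelope (the field layout mirrors `res_d` above; the sample payload is made up):

```python
# Illustrative consumer of the BFCLAdapter response envelope; the payload
# below is fabricated for the example.
completion = {
    'choices': [{
        'index': 0,
        'message': {
            # For BFCL the content is a list of per-turn response lists,
            # not a plain string.
            'content': [["I'll check the weather.", "[get_weather(city='Paris')]"]],
            'role': 'assistant',
        },
    }],
    'model': 'my-model',
    'object': 'chat.completion',
}

per_turn_responses = completion['choices'][0]['message']['content']
for turn_idx, steps in enumerate(per_turn_responses):
    print(f'turn {turn_idx}: {len(steps)} model step(s)')
```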

evalscope/models/adapters/server_adapter.py

@@ -1,3 +1,4 @@
+import copy
 import openai
 from collections import defaultdict
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
@@ -61,18 +62,17 @@ class ServerModelAdapter(BaseModelAdapter):
 
     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-        if input_item.get('messages', None):
-            content = input_item['messages']
-        else:
-            content = self.make_request_content(input_item)
-        request_json = self.make_request(content, infer_cfg)
+        request_json = self.make_request(input_item, infer_cfg)
         response = self.send_request(request_json)
         return response
 
-    def make_request_content(self, input_item: dict) -> list:
+    def make_request_messages(self, input_item: dict) -> list:
         """
-        Make request content for OpenAI API.
+        Make request messages for OpenAI API.
         """
+        if input_item.get('messages', None):
+            return input_item['messages']
+
         data: list = input_item['data']
         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
             query = '\n'.join(''.join(item) for item in data)
@@ -89,18 +89,28 @@ class ServerModelAdapter(BaseModelAdapter):
 
         return messages
 
-    def make_request(self, content: list, infer_cfg: dict) -> dict:
+    def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
         """Make request to remote API."""
+        messages = self.make_request_messages(input_item)
         # Format request JSON according to OpenAI API format
-        request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
+        request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}
 
         if self.timeout:
            request_json['timeout'] = self.timeout
 
+        request_json['stream'] = self.stream
         if self.stream:
-            request_json['stream'] = self.stream
             request_json['stream_options'] = {'include_usage': True}
 
+        if input_item.get('tools', None):
+            tools_copy = copy.deepcopy(input_item.get('tools'))
+            # Remove the "responses" from the functions, as that doesn't
+            # need to go to the model
+            for tool in tools_copy:
+                if 'function' in tool and 'response' in tool['function']:
+                    del tool['function']['response']
+            request_json['tools'] = tools_copy
+
         logger.debug(f'Request to remote API: {request_json}')
 
         return request_json
@@ -135,19 +145,65 @@ class ServerModelAdapter(BaseModelAdapter):
         collected_chunks = []
         collected_messages = defaultdict(list)
         collected_reasoning = defaultdict(list)
+        collected_tool_calls = defaultdict(dict)
 
         for chunk in response_stream:
             collected_chunks.append(chunk)
             for choice in chunk.choices:
+                # Handle reasoning content
                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+
+                # Handle regular content
                 if choice.delta.content is not None:
                     collected_messages[choice.index].append(choice.delta.content)
 
+                # Handle tool calls
+                if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
+                    for tool_call in choice.delta.tool_calls:
+                        tool_id = tool_call.index
+
+                        # Initialize tool call if not present
+                        if tool_id not in collected_tool_calls[choice.index]:
+                            collected_tool_calls[choice.index][tool_id] = {
+                                'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
+                                'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
+                                'function': {
+                                    'name': '',
+                                    'arguments': ''
+                                }
+                            }
+
+                        # Update tool call with new chunks
+                        if hasattr(tool_call, 'function'):
+                            if hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['name'] = tool_call.function.name
+
+                            if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
+
+                        # Update ID if it was received later
+                        if hasattr(tool_call, 'id') and tool_call.id:
+                            collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
+
+        # Get all unique choice indices from all collections
+        all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
+            collected_tool_calls.keys())
+
         choices = []
-        for index, messages in collected_messages.items():
-            full_reply_content = ''.join(messages)
-            reasoning = ''.join(collected_reasoning[index])
+        for index in all_indices:
+            full_reply_content = ''.join(collected_messages.get(index, []))
+            reasoning = ''.join(collected_reasoning.get(index, []))
+
+            # Process tool_calls for this choice if any exists
+            tool_calls_list = None
+            if index in collected_tool_calls and collected_tool_calls[index]:
+                tool_calls_list = list(collected_tool_calls[index].values())
+                # Filter out any tool calls with None id (incomplete tool calls)
+                tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
+
             # use the finish_reason from the last chunk that generated this choice
             finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -155,11 +211,16 @@ class ServerModelAdapter(BaseModelAdapter):
                     finish_reason = chunk.choices[0].finish_reason
                     break
 
+            message_kwargs = {'role': 'assistant', 'content': full_reply_content}
+
+            if reasoning:
+                message_kwargs['reasoning_content'] = reasoning
+
+            if tool_calls_list:
+                message_kwargs['tool_calls'] = tool_calls_list
+
             choice = Choice(
-                finish_reason=finish_reason or 'stop',
-                index=index,
-                message=ChatCompletionMessage(
-                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
+                finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
             choices.append(choice)
 
         # build the final completion object
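The streaming change above accumulates tool-call deltas per choice: the first chunk seen for a given `tool_call.index` creates a skeleton entry, later chunks append `function.arguments` fragments and fill in the `name` and `id` as they arrive, and entries that never receive an `id` are filtered out before the `ChatCompletionMessage` is built. A condensed sketch of the same accumulation over made-up delta dicts rather than real `openai` chunk objects:

```python
from collections import defaultdict

# Fabricated tool-call deltas, standing in for choice.delta.tool_calls
# entries spread across the chunks of one streamed choice.
deltas = [
    {'index': 0, 'id': 'call_1', 'function': {'name': 'get_weather', 'arguments': ''}},
    {'index': 0, 'id': None, 'function': {'name': None, 'arguments': '{"city": '}},
    {'index': 0, 'id': None, 'function': {'name': None, 'arguments': '"Paris"}'}},
]

collected = defaultdict(lambda: {'id': None, 'function': {'name': '', 'arguments': ''}})
for delta in deltas:
    slot = collected[delta['index']]
    if delta.get('id'):
        slot['id'] = delta['id']
    fn = delta.get('function') or {}
    if fn.get('name'):
        slot['function']['name'] = fn['name']
    if fn.get('arguments'):
        slot['function']['arguments'] += fn['arguments']

# Keep only tool calls that received an id, mirroring the filter in the diff.
tool_calls = [tc for tc in collected.values() if tc['id'] is not None]
print(tool_calls)
# [{'id': 'call_1', 'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}}]
```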

evalscope/models/custom/custom_model.py

@@ -10,9 +10,6 @@ class CustomModel(ABC):
         self.config = config
         self.kwargs = kwargs
 
-        if config.get('model_id', None) is None:
-            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')
-
     @abstractmethod
     @torch.no_grad()
     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]: