evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/bfcl_adapter.py
@@ -0,0 +1,244 @@
+ import json
+ import time
+ import uuid
+ from typing import Any, List, Optional, Union
+
+ from evalscope.utils.logger import get_logger
+ from .server_adapter import ServerModelAdapter
+
+ logger = get_logger()
+
+
+ class BFCLAdapter(ServerModelAdapter):
+     """
+     BFCL model adapter to request remote API model and generate results for BFCL evaluation.
+     Support multi-turn and single-turn function calling tasks.
+     """
+
+     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+         """
+         Args:
+             api_url: The URL of the remote API model.
+             model_id: The ID of the remote API model.
+             api_key: The API key of the remote API model.
+         """
+         super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+         """
+         Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+         where each list is a follow up turn in the conversation
+         each turn is a List[List[Message]]
+
+         Args:
+             inputs (List[dict]): The input data.
+             infer_cfg (dict): Inference configuration.
+
+         Returns:
+             res (List[dict]): The model prediction results.
+         """
+         infer_cfg = infer_cfg or {}
+         results = []
+
+         for input_item in inputs:
+             # This flag decides if we pass tools to the API or try tool calling via prompting
+             # Passing tools to the API means that we rely on the API to manage system prompt specifics
+             # and also expect parsed tool calls in the ChatCompletionMessage object
+             # This is how the is_fc_model=True benchmark is designed to work
+             # On the other hand, we try to manage
+             # tool calling via prompting and parse tool calls in the standard text response
+             # This is how the is_fc_model=False benchmark is designed to work
+             row = input_item.get('messages')
+             is_fc_model = row.get('is_fc_model', False)
+
+             if is_fc_model:
+                 response = self.generate_turn_with_tools(row, infer_cfg)
+             else:
+                 response = self.generate_turn(row, infer_cfg)
+
+             # wrap response with openai types
+             res_d = {
+                 'choices': [{
+                     'index': 0,
+                     'message': {
+                         'content': response,
+                         'role': 'assistant'
+                     }
+                 }],
+                 'created': time.time(),
+                 'model': self.model_id,
+                 'object': 'chat.completion',
+                 'usage': {
+                     'completion_tokens': 0,
+                     'prompt_tokens': 0,
+                     'total_tokens': 0
+                 }
+             }
+             results.append(res_d)
+
+         return results
+
+     def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+         from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+                                                           MAXIMUM_STEP_LIMIT)
+         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+         from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+         all_model_responses = []
+         current_messages = []
+         turns = row['turns']
+         for turn_idx, messages in enumerate(turns):
+             n_steps = 0
+             current_responses = []
+             current_messages += messages.copy()
+
+             if str(turn_idx) in row['missing_functions']:
+                 assert len(messages) == 0, 'Holdout turn should not have user message.'
+                 new_turn = [{
+                     'role':
+                     'user',
+                     'content':
+                     DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                         functions=row['missing_functions'][str(turn_idx)]),
+                 }]
+                 current_messages += new_turn
+
+             while True:
+                 input_item = {
+                     'messages': current_messages,
+                 }
+                 responses = self.process_single_input(input_item, infer_cfg)
+                 result = responses['choices'][0]['message']['content']
+
+                 logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+                 current_messages.append({
+                     'role': 'assistant',
+                     'content': result,
+                 })
+                 current_responses.append(result)
+
+                 execute_tools = row.get('should_execute_tool_calls', False)
+                 if execute_tools:
+                     try:
+                         tool_calls = default_decode_execute_prompting(result)
+                     except Exception:
+                         tool_calls = None
+
+                     if tool_calls is None:
+                         break
+
+                     tool_outputs, _ = execute_multi_turn_func_call(
+                         tool_calls,
+                         initial_config=row['initial_config'],
+                         involved_classes=row['involved_classes'],
+                         model_name='evaluator_loop',
+                         test_entry_id=row['id'],
+                         long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                         is_evaL_run=False,
+                     )
+                     # Append tool outputs to the current messages
+                     tool_results = []
+                     for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                         tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                     current_messages.append({
+                         'role': 'user',
+                         'content': repr(tool_results),
+                     })
+                 else:
+                     break
+
+                 n_steps += 1
+                 if n_steps > MAXIMUM_STEP_LIMIT:
+                     logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                     break
+
+             all_model_responses.append(current_responses)
+
+         return all_model_responses
+
+     def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+         from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                                                           MAXIMUM_STEP_LIMIT)
+         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+         from bfcl_eval.model_handler.utils import convert_to_function_call
+
+         all_model_responses = []
+         current_messages = []
+         turns = row['turns']
+         for turn_idx, messages in enumerate(turns):
+             n_steps = 0
+             current_responses = []
+             current_messages += messages.copy()
+             tools = row['tools']
+
+             if str(turn_idx) in row['missing_functions']:
+                 assert len(messages) == 0, 'Holdout turn should not have user message.'
+                 # inject new functions on the fly
+                 new_tools = row['missing_functions'][str(turn_idx)]
+                 for new_tool in new_tools:
+                     tools.append({
+                         'type': 'function',
+                         'function': new_tool[0],
+                     })
+                 new_turn = [{
+                     'role': 'user',
+                     'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                 }]
+                 current_messages += new_turn
+
+             while True:
+                 input_item = {
+                     'messages': current_messages,
+                     'tools': tools,
+                 }
+                 responses = self.process_single_input(input_item, infer_cfg)
+                 message = responses['choices'][0]['message']
+
+                 current_messages.append(message)
+                 if isinstance(message, str):
+                     model_responses = [message]
+                     tool_call_strs = None
+                 elif message.get('tool_calls'):
+                     model_responses = [{
+                         tc['function']['name']: tc['function']['arguments']
+                     } for tc in message['tool_calls']]
+                     try:
+                         tool_call_strs = convert_to_function_call(model_responses)
+                     except Exception as e:
+                         logger.error(f'Error converting tool calls to function call strings: {e}')
+                         tool_call_strs = None
+                 else:
+                     model_responses = [message['content']]
+                     tool_call_strs = None
+
+                 current_responses.extend(model_responses)
+
+                 execute_tools = row.get('should_execute_tool_calls', False)
+                 if execute_tools and tool_call_strs is not None:
+                     tool_outputs, _ = execute_multi_turn_func_call(
+                         tool_call_strs,
+                         initial_config=row['initial_config'],
+                         involved_classes=row['involved_classes'],
+                         model_name='evaluator_loop',
+                         test_entry_id=row['id'],
+                         long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                         is_evaL_run=False,
+                     )
+
+                     for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
+                         current_messages.append({
+                             'role': 'tool',
+                             'tool_call_id': tc['id'],
+                             'content': json.dumps({'response': tool_output}),
+                         })
+                 else:
+                     break
+
+                 n_steps += 1
+                 if n_steps > MAXIMUM_STEP_LIMIT:
+                     logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                     break
+
+             all_model_responses.append(current_responses)
+
+         return all_model_responses
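
The adapter above is registered under the name 'bfcl_server' (see the evalscope/models/register.py hunk further down). A minimal sketch of driving it directly, assuming an OpenAI-compatible endpoint; the URL, model name, and row contents are illustrative placeholders, and in practice the bfcl benchmark adapter builds these rows:

    # Hypothetical usage sketch; endpoint, model name and row fields are placeholders.
    from evalscope.models.adapters.bfcl_adapter import BFCLAdapter

    adapter = BFCLAdapter(
        api_url='http://127.0.0.1:8801/v1',   # assumed OpenAI-compatible endpoint
        model_id='qwen2.5-7b-instruct',       # assumed model name
        api_key='EMPTY',
    )

    # Each input item carries the BFCL row under the 'messages' key.
    row = {
        'is_fc_model': True,                 # use native tool calling rather than prompting
        'turns': [[{'role': 'user', 'content': 'What is 3 + 5?'}]],
        'missing_functions': {},
        'tools': [],                         # BFCL tool schemas would go here
        'should_execute_tool_calls': False,  # skip local tool execution in this sketch
    }

    results = adapter.predict([{'messages': row}], infer_cfg={'temperature': 0.0})
    print(results[0]['choices'][0]['message']['content'])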
evalscope/models/adapters/server_adapter.py
@@ -1,11 +1,12 @@
+ import copy
  import openai
  from collections import defaultdict
- from inspect import signature
  from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

  from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import get_supported_params
  from .base_adapter import BaseModelAdapter

  logger = get_logger()
@@ -31,7 +32,7 @@ class ServerModelAdapter(BaseModelAdapter):
              api_key=api_key,
              base_url=self.api_url,
          )
-         self.supported_params = self._get_supported_params()
+         self.supported_params = get_supported_params(self.client.chat.completions.create)

          self.seed = kwargs.get('seed', None)
          self.timeout = kwargs.get('timeout', 60)
@@ -39,10 +40,6 @@ class ServerModelAdapter(BaseModelAdapter):
          self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
          super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-     def _get_supported_params(self):
-         sig = signature(self.client.chat.completions.create)
-         return list(sig.parameters.keys())
-
      def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
          """
          Model prediction func.
@@ -65,18 +62,17 @@ class ServerModelAdapter(BaseModelAdapter):

      def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
          """Process a single input item."""
-         if input_item.get('messages', None):
-             content = input_item['messages']
-         else:
-             content = self.make_request_content(input_item)
-         request_json = self.make_request(content, infer_cfg)
+         request_json = self.make_request(input_item, infer_cfg)
          response = self.send_request(request_json)
          return response

-     def make_request_content(self, input_item: dict) -> list:
+     def make_request_messages(self, input_item: dict) -> list:
          """
-         Make request content for OpenAI API.
+         Make request messages for OpenAI API.
          """
+         if input_item.get('messages', None):
+             return input_item['messages']
+
          data: list = input_item['data']
          if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
              query = '\n'.join(''.join(item) for item in data)
@@ -93,18 +89,28 @@ class ServerModelAdapter(BaseModelAdapter):

          return messages

-     def make_request(self, content: list, infer_cfg: dict) -> dict:
+     def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
          """Make request to remote API."""
+         messages = self.make_request_messages(input_item)
          # Format request JSON according to OpenAI API format
-         request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
+         request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}

          if self.timeout:
              request_json['timeout'] = self.timeout

+         request_json['stream'] = self.stream
          if self.stream:
-             request_json['stream'] = self.stream
              request_json['stream_options'] = {'include_usage': True}

+         if input_item.get('tools', None):
+             tools_copy = copy.deepcopy(input_item.get('tools'))
+             # Remove the "responses" from the functions, as that doesn't
+             # need to go to the model
+             for tool in tools_copy:
+                 if 'function' in tool and 'response' in tool['function']:
+                     del tool['function']['response']
+             request_json['tools'] = tools_copy
+
          logger.debug(f'Request to remote API: {request_json}')

          return request_json
@@ -139,19 +145,65 @@ class ServerModelAdapter(BaseModelAdapter):
          collected_chunks = []
          collected_messages = defaultdict(list)
          collected_reasoning = defaultdict(list)
+         collected_tool_calls = defaultdict(dict)

          for chunk in response_stream:
              collected_chunks.append(chunk)
              for choice in chunk.choices:
+                 # Handle reasoning content
                  if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                      collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+
+                 # Handle regular content
                  if choice.delta.content is not None:
                      collected_messages[choice.index].append(choice.delta.content)

+                 # Handle tool calls
+                 if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
+                     for tool_call in choice.delta.tool_calls:
+                         tool_id = tool_call.index
+
+                         # Initialize tool call if not present
+                         if tool_id not in collected_tool_calls[choice.index]:
+                             collected_tool_calls[choice.index][tool_id] = {
+                                 'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
+                                 'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
+                                 'function': {
+                                     'name': '',
+                                     'arguments': ''
+                                 }
+                             }
+
+                         # Update tool call with new chunks
+                         if hasattr(tool_call, 'function'):
+                             if hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                 collected_tool_calls[
+                                     choice.index][tool_id]['function']['name'] = tool_call.function.name
+
+                             if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
+                                 collected_tool_calls[
+                                     choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
+
+                         # Update ID if it was received later
+                         if hasattr(tool_call, 'id') and tool_call.id:
+                             collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
+
+         # Get all unique choice indices from all collections
+         all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
+             collected_tool_calls.keys())
+
          choices = []
-         for index, messages in collected_messages.items():
-             full_reply_content = ''.join(messages)
-             reasoning = ''.join(collected_reasoning[index])
+         for index in all_indices:
+             full_reply_content = ''.join(collected_messages.get(index, []))
+             reasoning = ''.join(collected_reasoning.get(index, []))
+
+             # Process tool_calls for this choice if any exists
+             tool_calls_list = None
+             if index in collected_tool_calls and collected_tool_calls[index]:
+                 tool_calls_list = list(collected_tool_calls[index].values())
+                 # Filter out any tool calls with None id (incomplete tool calls)
+                 tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
+
              # use the finish_reason from the last chunk that generated this choice
              finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -159,11 +211,16 @@ class ServerModelAdapter(BaseModelAdapter):
                      finish_reason = chunk.choices[0].finish_reason
                      break

+             message_kwargs = {'role': 'assistant', 'content': full_reply_content}
+
+             if reasoning:
+                 message_kwargs['reasoning_content'] = reasoning
+
+             if tool_calls_list:
+                 message_kwargs['tool_calls'] = tool_calls_list
+
              choice = Choice(
-                 finish_reason=finish_reason or 'stop',
-                 index=index,
-                 message=ChatCompletionMessage(
-                     role='assistant', content=full_reply_content, reasoning_content=reasoning))
+                 finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
              choices.append(choice)

          # build the final completion object
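
The inline `_get_supported_params` helper is replaced by `get_supported_params` from `evalscope.utils.utils` (that file's +12/-0 change is not shown in this excerpt). Based on the removed code, the shared helper plausibly looks like the sketch below; treat it as an assumption rather than the actual utils.py implementation:

    # Assumed shape of the new shared helper, mirroring the removed _get_supported_params.
    from inspect import signature

    def get_supported_params(func) -> list:
        """Return the parameter names accepted by func, e.g. client.chat.completions.create."""
        return list(signature(func).parameters.keys())

Centralizing this check presumably lets the adapter filter out generation parameters that the client's create() signature does not accept before sending a request.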
evalscope/models/custom/custom_model.py
@@ -10,9 +10,6 @@ class CustomModel(ABC):
          self.config = config
          self.kwargs = kwargs

-         if config.get('model_id', None) is None:
-             raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')
-
      @abstractmethod
      @torch.no_grad()
      def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
evalscope/models/custom/dummy_model.py
@@ -1,61 +1,99 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  import time
  from typing import List

+ from evalscope.models import CustomModel
  from evalscope.utils.logger import get_logger
- from .custom_model import CustomModel

  logger = get_logger()
- """
- This script is used to rewrite the evaluation results without re-running the model predictions.
- """


  class DummyCustomModel(CustomModel):

-     def __init__(self, config: dict = {'model_id': 'dummy-model'}, **kwargs):
+     def __init__(self, config: dict = {}, **kwargs):
          super(DummyCustomModel, self).__init__(config=config, **kwargs)

-     def predict(self, prompts: List[dict], **kwargs):
-         # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.
+     def make_request_messages(self, input_item: dict) -> list:
+         """
+         Make request messages for OpenAI API.
+         """
+         if input_item.get('messages', None):
+             return input_item['messages']

-         response = ''
+         data: list = input_item['data']
+         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+             query = '\n'.join(''.join(item) for item in data)
+             system_prompt = input_item.get('system_prompt', None)
+         else:
+             query = data[0]
+             system_prompt = input_item.get('system_prompt', None)

-         res_d: dict = {
-             'choices': [{
-                 'index': 0,
-                 'message': {
-                     'content': response,
-                     'role': 'assistant'
-                 }
-             }],
-             'created': time.time(),
-             'model': self.config.get('model_id'),  # should be model_id
-             'object': 'chat.completion',
-             'usage': {
-                 'completion_tokens': 0,
-                 'prompt_tokens': 0,
-                 'total_tokens': 0
-             }
-         }
+         messages = []
+         if system_prompt:
+             messages.append({'role': 'system', 'content': system_prompt})

-         return [res_d for _ in prompts]
+         messages.append({'role': 'user', 'content': query})

+         return messages

- if __name__ == '__main__':
-     from evalscope.run import run_task
-     from evalscope.utils.io_utils import yaml_to_dict
-
-     # step1: 如果outputs做了迁移,需要修改outputs/eval_xxx 中的configs/task_output_config.yaml中的路径配置
-     # step2: 执行此脚本,默认使用use_cache=True,实现免推理对eval结果进行刷新
+     def predict(self, prompts: List[dict], **kwargs):
+         original_inputs = kwargs.get('origin_inputs', None)
+         infer_cfg = kwargs.get('infer_cfg', None)
+
+         logger.debug(f'** Prompts: {prompts}')
+         if original_inputs is not None:
+             logger.debug(f'** Original inputs: {original_inputs}')
+         if infer_cfg is not None:
+             logger.debug(f'** Inference config: {infer_cfg}')
+
+         # Simulate a response based on the prompts
+         # Must return a list of dicts with the same format as the OpenAI API.
+         responses = []
+         for input_item in original_inputs:
+             message = self.make_request_messages(input_item)
+             response = f'Dummy response for prompt: {message}'
+
+             res_d = {
+                 'choices': [{
+                     'index': 0,
+                     'message': {
+                         'content': response,
+                         'role': 'assistant'
+                     }
+                 }],
+                 'created': time.time(),
+                 'model': self.config.get('model_id'),
+                 'object': 'chat.completion',
+                 'usage': {
+                     'completion_tokens': 0,
+                     'prompt_tokens': 0,
+                     'total_tokens': 0
+                 }
+             }

-     swift_model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
+             responses.append(res_d)

-     task_cfg_file = '/path/to/eval_your_model_results/configs/task_output_config.yaml'
+         return responses

-     task_cfg_d = yaml_to_dict(task_cfg_file)
-     task_cfg_d.update({'model': swift_model})

-     eval_results: dict = run_task(task_cfg=task_cfg_d)
-     print('** Evaluation results finished !\n')
+ if __name__ == '__main__':
+     from evalscope import TaskConfig, run_task
+
+     dummy_model = DummyCustomModel()
+     task_config = TaskConfig(
+         model=dummy_model,
+         model_id='evalscope-model-dummy',
+         datasets=['gsm8k'],
+         eval_type='custom',  # must be custom for custom model evaluation
+         generation_config={
+             'max_new_tokens': 100,
+             'temperature': 0.0,
+             'top_p': 1.0,
+             'top_k': 50,
+             'repetition_penalty': 1.0
+         },
+         debug=True,
+         limit=5,
+     )
+
+     eval_results = run_task(task_cfg=task_config)
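
The reworked DummyCustomModel also serves as a template for user-defined models: subclass CustomModel, implement predict, and return OpenAI-style chat-completion dicts. A minimal sketch under those assumptions (EchoModel and its echo logic are illustrative, not part of the package):

    import time
    from typing import Any, Dict, List

    from evalscope.models import CustomModel


    class EchoModel(CustomModel):
        """Illustrative custom model that echoes its input back as the completion."""

        def predict(self, prompts: List[dict], **kwargs) -> List[Dict[str, Any]]:
            # The dummy implementation above reads the raw samples from the
            # 'origin_inputs' kwarg; fall back to prompts if it is absent.
            inputs = kwargs.get('origin_inputs') or prompts
            results = []
            for item in inputs:
                results.append({
                    'choices': [{
                        'index': 0,
                        'message': {'role': 'assistant', 'content': f'echo: {item}'}
                    }],
                    'created': time.time(),
                    'model': self.config.get('model_id'),
                    'object': 'chat.completion',
                    'usage': {'completion_tokens': 0, 'prompt_tokens': 0, 'total_tokens': 0},
                })
            return results

Such a class would then be passed to a TaskConfig (model=EchoModel(config={}), eval_type='custom') exactly as in the __main__ block above.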
evalscope/models/local_model.py
@@ -82,7 +82,7 @@ class LocalImageModel(LocalModel):
      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+         self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
          # default to DiffusionPipeline if not specified
          if self.pipeline_cls is None:
              if 'flux' in self.model_id.lower():
evalscope/models/register.py
@@ -47,8 +47,9 @@ def register_model_adapter_class(cls, name=None):
  # register all model adapters
  register_model_adapter_class(BaseModelAdapter, name='base')
  register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
- register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.LOGITS)
+ register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
  register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
  register_model_adapter_class(CustomModelAdapter, name='custom')
  register_model_adapter_class(ServerModelAdapter, name='server')
+ register_model_adapter_class(BFCLAdapter, name='bfcl_server')
  register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
evalscope/perf/arguments.py
@@ -55,13 +55,14 @@ class Arguments:

      # Response settings
      frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+     repetition_penalty: Optional[float] = None  # Repetition penalty for the response
      logprobs: Optional[bool] = None  # Whether to log probabilities
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
      n_choices: Optional[int] = None  # Number of response choices
      seed: Optional[int] = 0  # Random seed for reproducibility
-     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
-     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
+     stop: Optional[List[str]] = None  # Stop sequences for the response
+     stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
      stream: Optional[bool] = True  # Whether to stream the response
      temperature: float = 0.0  # Temperature setting for the response
      top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
@@ -181,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):

      # Response settings
      parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+     parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
      parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
      parser.add_argument(
          '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
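
The new repetition_penalty option can be set through the Arguments dataclass or the matching --repetition-penalty CLI flag. A hedged sketch follows; model and url are assumed to be existing Arguments fields outside this hunk, and all values are placeholders:

    # Illustrative only: fields other than repetition_penalty, stop and max_tokens
    # are assumptions about the rest of the Arguments dataclass.
    from evalscope.perf.arguments import Arguments

    args = Arguments(
        model='qwen2.5-7b-instruct',                        # assumed field, placeholder value
        url='http://127.0.0.1:8801/v1/chat/completions',    # assumed field, placeholder value
        repetition_penalty=1.05,   # new in this release
        stop=['</s>'],             # stop/stop_token_ids now default to None instead of []
        max_tokens=1024,
    )

Switching stop and stop_token_ids from field(default_factory=list) to None presumably lets an unset value be omitted from the request instead of being sent as an empty list.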