evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0

evalscope/models/adapters/bfcl_adapter.py
ADDED
```diff
@@ -0,0 +1,244 @@
+import json
+import time
+import uuid
+from typing import Any, List, Optional, Union
+
+from evalscope.utils.logger import get_logger
+from .server_adapter import ServerModelAdapter
+
+logger = get_logger()
+
+
+class BFCLAdapter(ServerModelAdapter):
+    """
+    BFCL model adapter to request remote API model and generate results for BFCL evaluation.
+    Support multi-turn and single-turn function calling tasks.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+        """
+        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+        where each list is a follow up turn in the conversation
+        each turn is a List[List[Message]]
+
+        Args:
+            inputs (List[dict]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (List[dict]): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+        results = []
+
+        for input_item in inputs:
+            # This flag decides if we pass tools to the API or try tool calling via prompting
+            # Passing tools to the API means that we rely on the API to manage system prompt specifics
+            # and also expect parsed tool calls in the ChatCompletionMessage object
+            # This is how the is_fc_model=True benchmark is designed to work
+            # On the other hand, we try to manage
+            # tool calling via prompting and parse tool calls in the standard text response
+            # This is how the is_fc_model=False benchmark is designed to work
+            row = input_item.get('messages')
+            is_fc_model = row.get('is_fc_model', False)
+
+            if is_fc_model:
+                response = self.generate_turn_with_tools(row, infer_cfg)
+            else:
+                response = self.generate_turn(row, infer_cfg)
+
+            # wrap response with openai types
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.model_id,
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }
+            results.append(res_d)
+
+        return results
+
+    def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+                                                          MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                new_turn = [{
+                    'role':
+                    'user',
+                    'content':
+                    DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                        functions=row['missing_functions'][str(turn_idx)]),
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                result = responses['choices'][0]['message']['content']
+
+                logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+                current_messages.append({
+                    'role': 'assistant',
+                    'content': result,
+                })
+                current_responses.append(result)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools:
+                    try:
+                        tool_calls = default_decode_execute_prompting(result)
+                    except Exception:
+                        tool_calls = None
+
+                    if tool_calls is None:
+                        break
+
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_calls,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+                    # Append tool outputs to the current messages
+                    tool_results = []
+                    for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                        tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                    current_messages.append({
+                        'role': 'user',
+                        'content': repr(tool_results),
+                    })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
+
+    def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                                                          MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import convert_to_function_call
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+            tools = row['tools']
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                # inject new functions on the fly
+                new_tools = row['missing_functions'][str(turn_idx)]
+                for new_tool in new_tools:
+                    tools.append({
+                        'type': 'function',
+                        'function': new_tool[0],
+                    })
+                new_turn = [{
+                    'role': 'user',
+                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                    'tools': tools,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                message = responses['choices'][0]['message']
+
+                current_messages.append(message)
+                if isinstance(message, str):
+                    model_responses = [message]
+                    tool_call_strs = None
+                elif message.get('tool_calls'):
+                    model_responses = [{
+                        tc['function']['name']: tc['function']['arguments']
+                    } for tc in message['tool_calls']]
+                    try:
+                        tool_call_strs = convert_to_function_call(model_responses)
+                    except Exception as e:
+                        logger.error(f'Error converting tool calls to function call strings: {e}')
+                        tool_call_strs = None
+                else:
+                    model_responses = [message['content']]
+                    tool_call_strs = None
+
+                current_responses.extend(model_responses)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools and tool_call_strs is not None:
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_call_strs,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+
+                    for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
+                        current_messages.append({
+                            'role': 'tool',
+                            'tool_call_id': tc['id'],
+                            'content': json.dumps({'response': tool_output}),
+                        })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
```
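
For orientation, the sketch below is not part of the package; it only illustrates the rough shape of the per-sample `row` dict that `generate_turn` and `generate_turn_with_tools` read from. The keys are the ones referenced in the adapter above, while every value is an invented placeholder (the real rows are produced by the BFCL benchmark adapter).

```python
# Illustrative only: keys mirror what BFCLAdapter reads; all values are placeholders.
example_row = {
    'id': 'multi_turn_base_0',            # forwarded as test_entry_id to the BFCL executor
    'test_category': 'multi_turn_base',   # 'long_context'/'composite' categories enable long_context
    'is_fc_model': True,                  # True -> generate_turn_with_tools; False -> prompted tool calls
    'should_execute_tool_calls': True,    # False -> stop after the first assistant reply of each turn
    'turns': [
        [{'role': 'user', 'content': 'Create a file named report.txt'}],
        [],                               # empty turn: new functions are injected instead of a user message
    ],
    'missing_functions': {'1': []},       # functions revealed at turn index 1 (holdout turns)
    'tools': [],                          # OpenAI-style tool definitions, used when is_fc_model=True
    'initial_config': {},                 # initial state for the executable environment
    'involved_classes': [],               # BFCL environment classes to instantiate
}
```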

evalscope/models/adapters/server_adapter.py
CHANGED
```diff
@@ -1,11 +1,12 @@
+import copy
 import openai
 from collections import defaultdict
-from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter

 logger = get_logger()
@@ -31,7 +32,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
-        self.supported_params = self._get_supported_params()
+        self.supported_params = get_supported_params(self.client.chat.completions.create)

         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -39,10 +40,6 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def _get_supported_params(self):
-        sig = signature(self.client.chat.completions.create)
-        return list(sig.parameters.keys())
-
     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.
@@ -65,18 +62,17 @@ class ServerModelAdapter(BaseModelAdapter):

     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-        if input_item.get('messages', None):
-            content = input_item['messages']
-        else:
-            content = self.make_request_content(input_item)
-        request_json = self.make_request(content, infer_cfg)
+        request_json = self.make_request(input_item, infer_cfg)
         response = self.send_request(request_json)
         return response

-    def make_request_content(self, input_item: dict) -> list:
+    def make_request_messages(self, input_item: dict) -> list:
         """
-        Make request content for OpenAI API.
+        Make request messages for OpenAI API.
         """
+        if input_item.get('messages', None):
+            return input_item['messages']
+
         data: list = input_item['data']
         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
             query = '\n'.join(''.join(item) for item in data)
@@ -93,18 +89,28 @@ class ServerModelAdapter(BaseModelAdapter):

         return messages

-    def make_request(self, content, infer_cfg: dict) -> dict:
+    def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
         """Make request to remote API."""
+        messages = self.make_request_messages(input_item)
         # Format request JSON according to OpenAI API format
-        request_json = {'model': self.model_id, 'messages': content, **infer_cfg}
+        request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}

         if self.timeout:
             request_json['timeout'] = self.timeout

+        request_json['stream'] = self.stream
         if self.stream:
-            request_json['stream'] = self.stream
             request_json['stream_options'] = {'include_usage': True}

+        if input_item.get('tools', None):
+            tools_copy = copy.deepcopy(input_item.get('tools'))
+            # Remove the "responses" from the functions, as that doesn't
+            # need to go to the model
+            for tool in tools_copy:
+                if 'function' in tool and 'response' in tool['function']:
+                    del tool['function']['response']
+            request_json['tools'] = tools_copy
+
         logger.debug(f'Request to remote API: {request_json}')

         return request_json
@@ -139,19 +145,65 @@ class ServerModelAdapter(BaseModelAdapter):
         collected_chunks = []
         collected_messages = defaultdict(list)
         collected_reasoning = defaultdict(list)
+        collected_tool_calls = defaultdict(dict)

         for chunk in response_stream:
             collected_chunks.append(chunk)
             for choice in chunk.choices:
+                # Handle reasoning content
                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+
+                # Handle regular content
                 if choice.delta.content is not None:
                     collected_messages[choice.index].append(choice.delta.content)

+                # Handle tool calls
+                if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
+                    for tool_call in choice.delta.tool_calls:
+                        tool_id = tool_call.index
+
+                        # Initialize tool call if not present
+                        if tool_id not in collected_tool_calls[choice.index]:
+                            collected_tool_calls[choice.index][tool_id] = {
+                                'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
+                                'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
+                                'function': {
+                                    'name': '',
+                                    'arguments': ''
+                                }
+                            }
+
+                        # Update tool call with new chunks
+                        if hasattr(tool_call, 'function'):
+                            if hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['name'] = tool_call.function.name
+
+                            if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
+
+                        # Update ID if it was received later
+                        if hasattr(tool_call, 'id') and tool_call.id:
+                            collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
+
+        # Get all unique choice indices from all collections
+        all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
+            collected_tool_calls.keys())
+
         choices = []
-        for index in collected_messages:
-            full_reply_content = ''.join(collected_messages[index])
-            reasoning = ''.join(collected_reasoning[index])
+        for index in all_indices:
+            full_reply_content = ''.join(collected_messages.get(index, []))
+            reasoning = ''.join(collected_reasoning.get(index, []))
+
+            # Process tool_calls for this choice if any exists
+            tool_calls_list = None
+            if index in collected_tool_calls and collected_tool_calls[index]:
+                tool_calls_list = list(collected_tool_calls[index].values())
+                # Filter out any tool calls with None id (incomplete tool calls)
+                tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
+
             # use the finish_reason from the last chunk that generated this choice
             finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -159,11 +211,16 @@ class ServerModelAdapter(BaseModelAdapter):
                 finish_reason = chunk.choices[0].finish_reason
                 break

+            message_kwargs = {'role': 'assistant', 'content': full_reply_content}
+
+            if reasoning:
+                message_kwargs['reasoning_content'] = reasoning
+
+            if tool_calls_list:
+                message_kwargs['tool_calls'] = tool_calls_list
+
             choice = Choice(
-                finish_reason=finish_reason or 'stop',
-                index=index,
-                message=ChatCompletionMessage(
-                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
+                finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
             choices.append(choice)

             # build the final completion object
```
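
The new `get_supported_params` helper is imported from `evalscope.utils.utils`, whose body is not shown in this diff. A minimal sketch consistent with the removed `_get_supported_params` method would be:

```python
# Hypothetical sketch of get_supported_params, assuming it mirrors the removed
# ServerModelAdapter._get_supported_params: list the parameter names a callable accepts.
from inspect import signature
from typing import Callable, List


def get_supported_params(func: Callable) -> List[str]:
    return list(signature(func).parameters.keys())


# e.g. get_supported_params(client.chat.completions.create) would yield names such as
# ['messages', 'model', 'stream', ...], which the adapter uses to filter infer_cfg keys.
```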

evalscope/models/custom/custom_model.py
CHANGED
```diff
@@ -10,9 +10,6 @@ class CustomModel(ABC):
         self.config = config
         self.kwargs = kwargs

-        if config.get('model_id', None) is None:
-            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')
-
     @abstractmethod
     @torch.no_grad()
     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
```

evalscope/models/custom/dummy_model.py
CHANGED
```diff
@@ -1,61 +1,99 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import time
 from typing import List

+from evalscope.models import CustomModel
 from evalscope.utils.logger import get_logger
-from .custom_model import CustomModel

 logger = get_logger()
-"""
-This script is used to rewrite the evaluation results without re-running the model predictions.
-"""


 class DummyCustomModel(CustomModel):

-    def __init__(self, config: dict = {
+    def __init__(self, config: dict = {}, **kwargs):
         super(DummyCustomModel, self).__init__(config=config, **kwargs)

-    def
+    def make_request_messages(self, input_item: dict) -> list:
+        """
+        Make request messages for OpenAI API.
+        """
+        if input_item.get('messages', None):
+            return input_item['messages']

+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)

-                    'message': {
-                        'content': response,
-                        'role': 'assistant'
-                    }
-                }],
-                'created': time.time(),
-                'model': self.config.get('model_id'),  # should be model_id
-                'object': 'chat.completion',
-                'usage': {
-                    'completion_tokens': 0,
-                    'prompt_tokens': 0,
-                    'total_tokens': 0
-                }
-            }
+        messages = []
+        if system_prompt:
+            messages.append({'role': 'system', 'content': system_prompt})

+        messages.append({'role': 'user', 'content': query})

+        return messages

+    def predict(self, prompts: List[dict], **kwargs):
+        original_inputs = kwargs.get('origin_inputs', None)
+        infer_cfg = kwargs.get('infer_cfg', None)
+
+        logger.debug(f'** Prompts: {prompts}')
+        if original_inputs is not None:
+            logger.debug(f'** Original inputs: {original_inputs}')
+        if infer_cfg is not None:
+            logger.debug(f'** Inference config: {infer_cfg}')
+
+        # Simulate a response based on the prompts
+        # Must return a list of dicts with the same format as the OpenAI API.
+        responses = []
+        for input_item in original_inputs:
+            message = self.make_request_messages(input_item)
+            response = f'Dummy response for prompt: {message}'
+
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.config.get('model_id'),
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }

+            responses.append(res_d)

+        return responses

-    task_cfg_d = yaml_to_dict(task_cfg_file)
-    task_cfg_d.update({'model': swift_model})

+if __name__ == '__main__':
+    from evalscope import TaskConfig, run_task
+
+    dummy_model = DummyCustomModel()
+    task_config = TaskConfig(
+        model=dummy_model,
+        model_id='evalscope-model-dummy',
+        datasets=['gsm8k'],
+        eval_type='custom',  # must be custom for custom model evaluation
+        generation_config={
+            'max_new_tokens': 100,
+            'temperature': 0.0,
+            'top_p': 1.0,
+            'top_k': 50,
+            'repetition_penalty': 1.0
+        },
+        debug=True,
+        limit=5,
+    )
+
+    eval_results = run_task(task_cfg=task_config)
```

evalscope/models/local_model.py
CHANGED
```diff
@@ -82,7 +82,7 @@ class LocalImageModel(LocalModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+        self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
         # default to DiffusionPipeline if not specified
         if self.pipeline_cls is None:
             if 'flux' in self.model_id.lower():
```

evalscope/models/register.py
CHANGED
```diff
@@ -47,8 +47,9 @@ def register_model_adapter_class(cls, name=None):
 # register all model adapters
 register_model_adapter_class(BaseModelAdapter, name='base')
 register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
-register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.
+register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
 register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
 register_model_adapter_class(CustomModelAdapter, name='custom')
 register_model_adapter_class(ServerModelAdapter, name='server')
+register_model_adapter_class(BFCLAdapter, name='bfcl_server')
 register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
```

evalscope/perf/arguments.py
CHANGED
```diff
@@ -55,13 +55,14 @@ class Arguments:

     # Response settings
     frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+    repetition_penalty: Optional[float] = None  # Repetition penalty for the response
     logprobs: Optional[bool] = None  # Whether to log probabilities
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
     seed: Optional[int] = 0  # Random seed for reproducibility
-    stop: Optional[List[str]] = None
-    stop_token_ids: Optional[List[str]] = None
+    stop: Optional[List[str]] = None  # Stop sequences for the response
+    stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
     temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
@@ -181,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Response settings
     parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+    parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
     parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
     parser.add_argument(
         '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
```