evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +2 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
- evalscope/benchmarks/benchmark.py +3 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
- evalscope/benchmarks/data_adapter.py +71 -18
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
- evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
- evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +6 -0
- evalscope/config.py +3 -1
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +1 -1
- evalscope/models/local_model.py +3 -2
- evalscope/models/server_adapter.py +79 -28
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +12 -8
- evalscope/run.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
- tests/cli/test_run.py +27 -15
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/third_party/thinkbench/eval.py
@@ -0,0 +1,264 @@
import json
import os
import pandas as pd
import plotly.graph_objects as go
import re
from collections import defaultdict
from functools import lru_cache
from modelscope import AutoTokenizer
from plotly.subplots import make_subplots
from tqdm.contrib.concurrent import thread_map
from typing import List

from evalscope.third_party.thinkbench.tools.llm import request_url
from evalscope.third_party.thinkbench.tools.utils import extract_answer
from evalscope.utils.io_utils import dump_jsonl_data

cur_path = os.path.dirname(os.path.abspath(__file__))

class EvalThink:
    def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
        self.report_path = report_path
        self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
        self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
        self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
        self.subset_dict = defaultdict(lambda: defaultdict(list))
        self.think_end_token = '</think>'
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.subsets = subsets
        self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
        self.split_strategies = split_strategies  # split by llm, keywords, separator
        self.judge_config = judge_config

    @lru_cache(maxsize=None)
    def get_think_part(self, text):
        last_think_end = text.rfind(self.think_end_token)
        return text[:last_think_end].lower()

    @lru_cache(maxsize=None)
    def cal_tokens(self, text: str):
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def process_choice(self, choice, problem):
        think_part = self.get_think_part(choice['message']['content'])
        answer = choice['review']['gold']
        tokens = self.cal_tokens(think_part)
        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
        useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
        score = choice['review']['result']
        return tokens, switch_count, useful_tokens, score

    def process_item(self, item):
        problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
        results = []
        for choice in item['choices']:
            results.append(self.process_choice(choice, problem))
            break  # only process the first choice

        tokens, switch_counts, useful_tokens, scores = zip(*results)

        avg_tokens = sum(tokens) / len(tokens)
        avg_thought_num = sum(switch_counts) / len(switch_counts)
        avg_token_efficiency = sum(useful_tokens) / sum(tokens)
        avg_accuracy = sum(scores) / len(scores)

        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy

    def split_by_llm(self, response, problem) -> List[str]:
        response = response.replace('\n', ' ')  # remove newline characters
        prompt = self.reformat_template.format(problem=problem, response=response)
        llm_response = request_url(self.judge_config, prompt)
        return llm_response.split('\n\n')

    def split_by_keywords(self, text) -> List[str]:
        pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
        segments = re.split(pattern, text)
        # remove empty segments
        segments = [segment.strip() for segment in segments if segment.strip()]

        return segments if segments else [text]

    def split_by_separator(self, text) -> List[str]:
        return text.split('\n\n')

    def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
        tagged_response = ''
        for sdx, step in enumerate(response):
            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
        tagged_response = tagged_response.strip()

        prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
        llm_response = request_url(self.judge_config, prompt)
        answer_index = extract_answer(llm_response)

        dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
                        os.path.join(self.report_path, 'answer_index.jsonl'),
                        dump_mode='append')
        try:
            answer_index = int(answer_index)
        except Exception:
            answer_index = -1
        return answer_index

    def get_first_correct(self, response: str, problem: str, answer: str) -> str:
        if self.split_strategies == 'llm':
            text_list = self.split_by_llm(response, problem)
        elif self.split_strategies == 'keywords':
            text_list = self.split_by_keywords(response)
        else:
            text_list = self.split_by_separator(response)

        answer_index = self.get_answer_index(text_list, problem, answer)

        if answer_index == -1:  # no correct answer found
            first_correct = ''
        else:
            first_correct = '\n\n'.join(text_list[: answer_index])
        return first_correct

    def plot_metrics(self, results, output_dir):
        fig = make_subplots(rows=1, cols=len(self.metrics),
                            subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
                            shared_xaxes=True, x_title='Subsets')

        for i, metric in enumerate(self.metrics, start=1):
            y_values = [results[metric][subset] for subset in self.subsets]
            fig.add_trace(
                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                           mode='lines+markers',
                           name=metric.replace('_', ' ').title()),
                row=1, col=i
            )
            # Add annotations for each data point
            for j, y in enumerate(y_values):
                fig.add_annotation(
                    x=j,
                    y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    row=1,
                    col=i
                )

        fig.update_layout(
            height=500,
            width=1500,
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )

        for i in range(1, len(self.metrics) + 1):
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
                row=1, col=i
            )
            fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
        # Update y-axis ranges
        fig.update_yaxes(range=[0, 1], row=1, col=1)  # Token Efficiency
        fig.update_yaxes(range=[0, 13], row=1, col=3)  # Switch Frequency
        fig.update_yaxes(range=[0, 1], row=1, col=4)  # Accuracy

        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
        fig.write_image(output_path)
        print(f'save figure to: {output_path}')

    def filter_df(self, df, response_len: int = 8000, count: int=10):
        def is_valid_row(row):
            return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])

        bools = df.apply(is_valid_row, axis=1)

        return df[bools].head(count)

    def evaluate(self, output_dir, max_tokens=8000, count=50):
        for subset in self.subsets:
            review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
            review_df = pd.read_json(review_path, lines=True)

            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)

            results = thread_map(
                self.process_item,
                (item for _, item in review_df.iterrows()),
                desc=f'Evaluating {subset}',
                total=len(review_df),
                max_workers=16
            )

            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)

            self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)

        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
                   for metric in self.metrics}

        self.plot_metrics(results, output_dir)

        return results

def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
    evaluator = EvalThink(**config,)
    results = evaluator.evaluate(output_dir, max_tokens, count)
    print(results)

judge_config = dict(
    api_key='EMPTY',
    base_url='http://0.0.0.0:8801/v1',
    model_name='Qwen2.5-72B-Instruct',
)

distill_qwen_config = dict(
    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
    model_name = 'DeepSeek-R1-Distill-Qwen-7B',
    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    dataset_name = 'math_500',
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

math_qwen_config = dict(
    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
    model_name = 'Qwen2.5-Math-7B-Instruct',
    tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
    dataset_name = 'math_500',
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator'
)

r1_config = dict(
    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
    model_name = 'deepseek-r1',
    tokenizer_path = 'deepseek-ai/DeepSeek-R1',
    dataset_name = 'math_500',
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator'
)

qwq_config = dict(
    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
    model_name = 'qwq-32b-preview',
    tokenizer_path = 'Qwen/QwQ-32B-Preview',
    dataset_name = 'math_500',
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator'
)

if __name__ == '__main__':
    run_task(distill_qwen_config)
    # run_task(math_qwen_config)
    # run_task(r1_config)
    # run_task(qwq_config)
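The module above ships with the author's local report paths hard-coded at the bottom. As a rough usage sketch (not part of the diff), the entry point is run_task with a config dict whose keys mirror EvalThink.__init__; every path, URL, and model name below is a placeholder to substitute with your own:

# Hypothetical invocation of the thinkbench evaluator; paths and the judge
# endpoint are placeholders, not values shipped with the package.
from evalscope.third_party.thinkbench.eval import run_task

my_judge = dict(
    api_key='EMPTY',                      # judge served from an OpenAI-compatible endpoint
    base_url='http://127.0.0.1:8801/v1',  # placeholder URL
    model_name='Qwen2.5-72B-Instruct',
)

my_config = dict(
    report_path='outputs/20250218_180219',  # an existing evalscope report dir (placeholder)
    model_name='DeepSeek-R1-Distill-Qwen-7B',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='keywords',            # 'llm' and 'separator' are the other strategies
    judge_config=my_judge,
)

run_task(my_config, output_dir='outputs', max_tokens=8000, count=50)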
evalscope/third_party/thinkbench/infer.py
@@ -0,0 +1,100 @@
import os

from evalscope import TaskConfig, run_task


def eval_distill_qwen():
    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
    dataset_name = 'math_500'
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']

    task_config = TaskConfig(
        api_url='http://0.0.0.0:8801/v1/chat/completions',
        model=model_name,
        eval_type='service',
        datasets=[dataset_name],
        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
        eval_batch_size=32,
        generation_config={
            'max_tokens': 20000,  # avoid exceed max length
            'temperature': 0.6,
            'top_p': 0.95,
            'n': 1,
        },
    )
    run_task(task_config)


def eval_math_qwen():
    model_name = 'Qwen2.5-Math-7B-Instruct'
    dataset_name = 'math_500'
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']

    task_config = TaskConfig(
        api_url='http://0.0.0.0:8801/v1/chat/completions',
        model=model_name,
        eval_type='service',
        datasets=[dataset_name],
        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
        eval_batch_size=32,
        generation_config={
            'max_tokens': 3000,  # avoid exceed max length
            'temperature': 0.6,
            'top_p': 0.95,
            'n': 3,
        },
    )
    run_task(task_config)

def eval_r1():
    model_name = 'deepseek-r1'
    dataset_name = 'math_500'
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']

    task_config = TaskConfig(
        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
        api_key=os.environ['DASHSCOPE_API_KEY'],
        model=model_name,
        eval_type='service',
        datasets=[dataset_name],
        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
        eval_batch_size=3,
        generation_config={
            'max_tokens': 12000,  # avoid exceed max length
            'temperature': 0.6,
            'top_p': 0.95,
            'n': 1,
        },
        limit=50,
        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
    )
    run_task(task_config)

def eval_qwq():
    model_name = 'qwq-32b-preview'
    dataset_name = 'math_500'
    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']

    task_config = TaskConfig(
        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
        api_key=os.environ['DASHSCOPE_API_KEY'],
        model=model_name,
        eval_type='service',
        datasets=[dataset_name],
        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
        eval_batch_size=32,
        generation_config={
            'max_tokens': 8000,  # avoid exceed max length
            'temperature': 0.6,
            'top_p': 0.95,
            'n': 1,
        },
        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
    )
    run_task(task_config)

if __name__ == '__main__':
    # eval_distill_qwen()
    # eval_math_qwen()
    # eval_r1()
    eval_qwq()
evalscope/third_party/thinkbench/resources/critique_template.txt
@@ -0,0 +1,17 @@
The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

[Math Problem]

{problem}

[Correct Answer]

{answer}

[Solution]

{tagged_response}

Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").

Please put your final answer (i.e., the index) in \boxed{{}}.
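A small formatting note on the template above: the doubled braces in \boxed{{}} are there because eval.py fills the template with str.format, which collapses {{}} back into a literal {}. A toy illustration (not part of the diff, made-up strings):

# Toy illustration of the brace escaping used by the critique template.
template = 'Correct answer: {answer}. Put the index in \\boxed{{}}.'
print(template.format(answer='42'))
# prints: Correct answer: 42. Put the index in \boxed{}.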
evalscope/third_party/thinkbench/resources/reformat_template.txt
@@ -0,0 +1,31 @@
I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:

* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).

- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.

- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.

- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.

- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.

* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.

* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.

* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.

* Reply with the reformatted solution directly.

--------------------------------------------------

Here is the math problem, and the solution that needs to be reformatted:

[Math Problem]

{problem}

[Solution]

{response}
evalscope/third_party/thinkbench/tools/__init__.py
File without changes
evalscope/third_party/thinkbench/tools/llm.py
@@ -0,0 +1,47 @@
import os
from openai import OpenAI


def request_url(llm_config, content):
    try:
        client = OpenAI(
            api_key=llm_config['api_key'],
            base_url=llm_config['base_url'],
        )
        completion = client.chat.completions.create(
            model=llm_config['model_name'],
            messages=[{'role': 'user', 'content': content}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(e)

def request_qwen(content):
    try:
        client = OpenAI(
            api_key=os.getenv('DASHSCOPE_API_KEY'),
            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
        )

        completion = client.chat.completions.create(
            model='qwen-max',
            messages=[{'role': 'user', 'content': content}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(e)


def request_local(content):
    try:
        client = OpenAI(
            api_key='EMPTY',
            base_url='http://0.0.0.0:8801/v1',
        )
        completion = client.chat.completions.create(
            model='Qwen2.5-72B-Instruct',
            messages=[{'role': 'user', 'content': content}]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(e)
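request_url above is the helper the evaluator relies on; request_qwen and request_local are standalone variants. A hedged sketch of calling it directly with a judge config shaped like the one in eval.py (the endpoint and model name are placeholders):

# Hypothetical direct call; the endpoint and model name are placeholders.
from evalscope.third_party.thinkbench.tools.llm import request_url

judge = dict(
    api_key='EMPTY',
    base_url='http://127.0.0.1:8801/v1',  # any OpenAI-compatible server
    model_name='Qwen2.5-72B-Instruct',
)
reply = request_url(judge, 'Reply with the single word: ok')
print(reply)  # None if the request failed, since the helper swallows exceptions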
evalscope/third_party/thinkbench/tools/utils.py
@@ -0,0 +1,13 @@
import re


def extract_answer(solution_text: str):
    boxed_pattern = r'\\boxed\{([^}]*)\}'
    matches = re.findall(boxed_pattern, solution_text)
    if matches:
        last_boxed_content = matches[-1]
        number_pattern = r'-?\d+'
        number_matches = re.findall(number_pattern, last_boxed_content)
        if number_matches:
            return number_matches[-1].strip()
    return None
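extract_answer pulls the last integer out of the last \boxed{...} in a judge reply, which is how get_answer_index in eval.py recovers the paragraph index. A quick sanity check with made-up strings:

from evalscope.third_party.thinkbench.tools.utils import extract_answer

print(extract_answer(r'The earliest correct answer is in paragraph \boxed{3}.'))  # '3' (returned as a string)
print(extract_answer(r'No correct paragraph found, so \boxed{-1}.'))              # '-1'
print(extract_answer('no boxed expression at all'))                               # None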
evalscope/utils/model_utils.py
CHANGED
@@ -1,5 +1,6 @@
+import os
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 if TYPE_CHECKING:
     from transformers import GenerationConfig
@@ -22,3 +23,18 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
         generation_config.temperature = 1.
         generation_config.top_p = 1.
         generation_config.top_k = 50
+
+
+def get_device() -> str:
+    from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+    if is_torch_npu_available():
+        device = 'npu'
+    elif is_torch_mps_available():
+        device = 'mps'
+    elif is_torch_cuda_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    return device
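The new get_device helper probes NPU, MPS, and CUDA availability through transformers and falls back to CPU. A minimal sketch of how a caller might use it (the torch tensor is illustrative, not from the diff):

# Illustrative only: place a tensor on whatever device the helper reports.
import torch
from evalscope.utils.model_utils import get_device

device = get_device()             # one of 'npu', 'mps', 'cuda', 'cpu'
x = torch.zeros(2, 2).to(device)
print(device, x.device)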
evalscope/utils/utils.py
CHANGED
@@ -101,50 +101,50 @@ class ResponseParser:
         options_concat = '|'.join([str(i) for i in options])
 
         patterns = [
-            (44 removed pattern lines not rendered in this view)
+            rf'答案是?\s?([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案是?\s?:([{options_concat}])',
+            rf'答案应该?是\s?([{options_concat}])',
+            rf'答案应该?选\s?([{options_concat}])',
+            rf'答案为\s?([{options_concat}])',
+            rf'答案选\s?([{options_concat}])',
+            rf'选择?\s?([{options_concat}])',
+            rf'故选?\s?([{options_concat}])'
+            rf'只有选?项?\s?([{options_concat}])\s?是?对',
+            rf'只有选?项?\s?([{options_concat}])\s?是?错',
+            rf'只有选?项?\s?([{options_concat}])\s?不?正确',
+            rf'只有选?项?\s?([{options_concat}])\s?错误',
+            rf'说法不?对选?项?的?是\s?([{options_concat}])',
+            rf'说法不?正确选?项?的?是\s?([{options_concat}])',
+            rf'说法错误选?项?的?是\s?([{options_concat}])',
+            rf'([{options_concat}])\s?是正确的',
+            rf'([{options_concat}])\s?是正确答案',
+            rf'选项\s?([{options_concat}])\s?正确',
+            rf'所以答\s?([{options_concat}])',
+            rf'所以\s?([{options_concat}][.。$]?$)',
+            rf'所有\s?([{options_concat}][.。$]?$)',
+            rf'[\s,::,]([{options_concat}])[。,,\.]?$',
+            rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
+            rf'[\s,,::]因此([{options_concat}])[。\.]?$',
+            rf'[是为。]\s?([{options_concat}])[。\.]?$',
+            rf'因此\s?([{options_concat}])[。\.]?$',
+            rf'显然\s?([{options_concat}])[。\.]?$',
+            rf'答案是\s?(\S+)(?:。|$)',
+            rf'答案应该是\s?(\S+)(?:。|$)',
+            rf'答案为\s?(\S+)(?:。|$)',
+            rf'答案是(.*?)[{options_concat}]',
+            rf'答案为(.*?)[{options_concat}]',
+            rf'固选(.*?)[{options_concat}]',
+            rf'答案应该是(.*?)[{options_concat}]',
+            rf'[Tt]he answer is \(?[{options_concat}]\)?',
+            rf'[Tt]he correct answer is [{options_concat}]',
+            rf'[Tt]he correct answer is:\n[{options_concat}]',
+            rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
+            rf'^选项\s?([{options_concat}])',
+            rf'^([{options_concat}])\s?选?项',
+            rf'(\s|^)[{options_concat}][\s。,,::\.$]',
+            rf'(\s|^)[{options_concat}](\s|$)',
+            rf'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -166,8 +166,8 @@ class ResponseParser:
             text: The text to parse.
         """
         patterns = [
-            r'[Aa]nswer:\s*(\w+)',
             r'answer is \(?(\w+)\)?',
+            r'[Aa]nswer:\s*(\w+)',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',
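The first hunk replaces the previous 44 entries with raw f-string patterns for pulling a chosen option out of Chinese and English answers, and the second hunk swaps the order of two English patterns. The surrounding context shows each entry being compiled with re.compile; a stripped-down sketch of how such a pattern list is typically applied in order, first hit wins (illustrative helper, not the actual ResponseParser code, with patterns adapted so each has a single capture group):

import re
from typing import Optional

def first_match(text: str, options: str = 'ABCD') -> Optional[str]:
    # A tiny subset of the patterns above, tried in order; the first match wins.
    patterns = [
        rf'答案是?\s?([{options}])',
        rf'[Tt]he answer is \(?([{options}])\)?',
        rf'(?:\s|^)([{options}])(?:\s|$)',
    ]
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            return m.group(1)
    return None

print(first_match('经过分析,答案是B'))    # B
print(first_match('The answer is (C).'))   # C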
evalscope/version.py
CHANGED