evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
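Among the new files above are adapters for SuperGPQA, LiveCodeBench, SimpleQA and Chinese SimpleQA, plus an LLM-judge metric. The sketch below is only an illustration of how one of these benchmarks could be run through the service-evaluation path that the thinkbench scripts later in this diff also use; the endpoint, API key, model name, and the assumption that the adapter registers under the name 'super_gpqa' are placeholders, not values taken from the release.

# Hypothetical usage sketch; TaskConfig fields mirror those used elsewhere in this diff.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    api_url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',                                      # placeholder key
    model='my-model',                                     # placeholder served model name
    eval_type='service',
    datasets=['super_gpqa'],                              # assumed registry name for the new adapter
    dataset_args={'super_gpqa': {'few_shot_num': 0}},
    generation_config={'max_tokens': 2048, 'temperature': 0.6},
)
run_task(task_cfg)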
evalscope/third_party/thinkbench/eval.py CHANGED
@@ -12,7 +12,7 @@ from typing import List
 
 from evalscope.third_party.thinkbench.tools.llm import request_url
 from evalscope.third_party.thinkbench.tools.utils import extract_answer
-from evalscope.utils.io_utils import dump_jsonl_data
+from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
@@ -28,27 +28,42 @@ class EvalThink:
         self.model_name = model_name
         self.dataset_name = dataset_name
         self.subsets = subsets
-        self.metrics = ['
+        self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens','token_efficiency', 'thought_num', 'accuracy']
         self.split_strategies = split_strategies # split by llm, keywords, separator
         self.judge_config = judge_config
+        self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
+        self.model_parse_dict = self.__init_parse_file()
 
-
-
-
-
+    def __init_parse_file(self):
+        if not os.path.exists(self.model_parse_file_path):
+            return {}
+        else:
+            list_file = jsonl_to_list(self.model_parse_file_path)
+            # convert to dict prompt as key, answer_index as value
+            return {item['prompt']: item['answer_index'] for item in list_file}
+
+    def get_think_part(self, message: dict) -> str:
+        if 'reasoning_content' in message and message['reasoning_content']:
+            return message['reasoning_content']
+        else:
+            text = message['content']
+            last_think_end = text.rfind(self.think_end_token)
+            return text[:last_think_end]
 
     @lru_cache(maxsize=None)
     def cal_tokens(self, text: str):
         return len(self.tokenizer.encode(text, add_special_tokens=False))
 
     def process_choice(self, choice, problem):
-        think_part = self.get_think_part(choice['message']
+        think_part = self.get_think_part(choice['message'])
         answer = choice['review']['gold']
         tokens = self.cal_tokens(think_part)
-        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+        switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
         useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
-
-
+        reflection_tokens = tokens - useful_tokens
+        # score = choice['review']['result']
+        score = 0 if useful_tokens == 0 else 1
+        return tokens, switch_count, useful_tokens, reflection_tokens, score
 
     def process_item(self, item):
         problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
@@ -57,14 +72,15 @@ class EvalThink:
             results.append(self.process_choice(choice, problem))
             break # only process the first choice
 
-
+        total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
 
-        avg_tokens = sum(
+        avg_tokens = sum(total_tokens) / len(total_tokens)
         avg_thought_num = sum(switch_counts) / len(switch_counts)
-        avg_token_efficiency = sum(useful_tokens) / sum(
+        avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
         avg_accuracy = sum(scores) / len(scores)
-
-
+        avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
+        avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
+        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
 
     def split_by_llm(self, response, problem) -> List[str]:
         response = response.replace('\n', ' ') # remove newline characters
@@ -90,12 +106,17 @@ class EvalThink:
         tagged_response = tagged_response.strip()
 
         prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
-
-
-
-
-
-
+        if prompt in self.model_parse_dict:
+            answer_index = self.model_parse_dict[prompt]
+        else:
+            llm_response = request_url(self.judge_config, prompt)
+            if not llm_response:
+                answer_index = -1
+            else:
+                answer_index = extract_answer(llm_response)
+
+            dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                            self.model_parse_file_path, dump_mode='append')
         try:
             answer_index = int(answer_index)
         except Exception:
@@ -119,18 +140,27 @@ class EvalThink:
         return first_correct
 
     def plot_metrics(self, results, output_dir):
-
-
-
-
-
-
+        # Change layout to 2x3
+        fig = make_subplots(rows=2, cols=3,
+                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
+                            shared_xaxes=True, x_title='Subsets',
+                            vertical_spacing=0.1, # Decrease vertical spacing between subplots
+                            horizontal_spacing=0.1) # Decrease horizontal spacing between subplots
+
+        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                         'token_efficiency', 'thought_num', 'accuracy']
+
+        for i, metric in enumerate(metrics_order, start=1):
             y_values = [results[metric][subset] for subset in self.subsets]
+            # Determine row and column for 2x3 layout
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
             fig.add_trace(
                 go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                            mode='lines+markers',
                            name=metric.replace('_', ' ').title()),
-                row=
+                row=row, col=col
             )
             # Add annotations for each data point
            for j, y in enumerate(y_values):
@@ -140,28 +170,34 @@ class EvalThink:
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
-                    row=
-                    col=
+                    row=row,
+                    col=col
                )
 
        fig.update_layout(
-            height=
-            width=
+            height=800, # Adjust height for 2x3 layout
+            width=1200, # Adjust width for 2x3 layout
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )
 
-        for i in range(1, len(
+        for i in range(1, len(metrics_order) + 1):
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
-                row=
+                row=row, col=col
            )
-            fig.update_yaxes(title_text=
+            fig.update_yaxes(title_text=metrics_order[i-1].replace('_', ' ').title(), row=row, col=col)
+
        # Update y-axis ranges
-        fig.update_yaxes(range=[
-        fig.update_yaxes(range=[0,
-        fig.update_yaxes(range=[0,
+        fig.update_yaxes(range=[500, 5000], row=1, col=1) # Reasoning Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=2) # First Correct Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=3) # Reflection Tokens
+        fig.update_yaxes(range=[0, 1], row=2, col=1) # Token Efficiency
+        fig.update_yaxes(range=[0, 13], row=2, col=2) # Thought Num
+        fig.update_yaxes(range=[0, 1], row=2, col=3) # Accuracy
 
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
@@ -179,7 +215,7 @@ class EvalThink:
         return df[bools].head(count)
 
 
-    def evaluate(self, output_dir, max_tokens=8000, count=50):
+    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
         for subset in self.subsets:
             review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
             review_df = pd.read_json(review_path, lines=True)
@@ -191,15 +227,17 @@ class EvalThink:
                 (item for _, item in review_df.iterrows()),
                 desc=f'Evaluating {subset}',
                 total=len(review_df),
-                max_workers=
+                max_workers=workers
             )
 
-            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)
 
-            self.subset_dict[subset]['
+            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
             self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
             self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
             self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
+            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)
 
 
         results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
@@ -207,13 +245,111 @@ class EvalThink:
 
         self.plot_metrics(results, output_dir)
 
+        # save results to json
+        dict_to_json(results, os.path.join(self.report_path, f'think_eval_results.json'))
         return results
 
-def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
     evaluator = EvalThink(**config,)
-    results = evaluator.evaluate(output_dir, max_tokens, count)
+    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
     print(results)
 
+def combine_results(configs: List[dict], output_path: str):
+    """
+    Combine evaluation results from multiple model configs into one plot.
+    All models' results for the same metric will be shown in the same subplot for easy comparison.
+
+    Args:
+        configs: List of model config dicts containing model_name and report_path
+    """
+    # Combine results from different runs
+    combined_results = defaultdict(lambda: defaultdict(dict))
+    for config in configs:
+        model_name = config['model_name']
+        report_path = config['report_path']
+        # Results is a dict with metric as key and subset as value
+        results = json_to_dict(os.path.join(report_path, f'think_eval_results.json'))
+        combined_results[model_name] = results
+
+    # Create a 2x3 subplot layout, one subplot per metric
+    fig = make_subplots(rows=2, cols=3,
+                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
+                        shared_xaxes=True, x_title='Subsets',
+                        vertical_spacing=0.08, # Reduce vertical spacing
+                        horizontal_spacing=0.05) # Reduce horizontal spacing
+
+    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                     'token_efficiency', 'thought_num', 'accuracy']
+
+    # Assign different colors for each model
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
+
+    # Plot each metric in a separate subplot
+    for i, metric in enumerate(metrics_order, start=1):
+        row = (i - 1) // 3 + 1
+        col = (i - 1) % 3 + 1
+
+        # Get subsets from first model (assuming all models have same subsets)
+        subsets = list(next(iter(combined_results.values()))[metric].keys())
+
+        # Add all models' data for this metric to the same subplot
+        for j, (model_name, results) in enumerate(combined_results.items()):
+            y_values = [results[metric][subset] for subset in subsets]
+
+            fig.add_trace(
+                go.Scatter(x=subsets, y=y_values,
+                           mode='lines+markers',
+                           name=model_name, # Just model name since metrics are shown in subplot titles
+                           line=dict(color=colors[j % len(colors)]),
+                           showlegend=(i == 1)), # Only show legend for first metric
+                row=row, col=col
+            )
+
+            # Add value annotations
+            for k, y in enumerate(y_values):
+                fig.add_annotation(
+                    x=subsets[k],
+                    y=y,
+                    text=f'{y:.2f}',
+                    showarrow=False,
+                    yshift=10,
+                    font=dict(size=12, color=colors[j % len(colors)]),
+                    row=row, col=col
+                )
+
+        # Update axis ranges and labels based on metric type
+        # if metric == 'token_efficiency':
+        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
+        # elif metric == 'accuracy':
+        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)
+
+        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)
+
+    # Update layout
+    fig.update_layout(
+        height=1000, # Increase height
+        width=1500, # Increase width
+        title_text=f'Model Comparison Across Evaluation Metrics on MATH-500',
+        title=dict(font=dict(size=22)), # Larger title font
+        font=dict(size=14), # Larger overall font
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1,
+            font=dict(size=14) # Larger legend font
+        )
+    )
+
+    # Save plot
+    os.makedirs('outputs', exist_ok=True)
+    fig.write_image(output_path)
+    print(f'Model comparison plot saved to {output_path}')
+
+    return combined_results
+
 judge_config = dict(
     api_key='EMPTY',
     base_url='http://0.0.0.0:8801/v1',
@@ -221,7 +357,7 @@ judge_config = dict(
 )
 
 distill_qwen_config = dict(
-    report_path = '
+    report_path = './outputs/20250218_180219',
     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
     dataset_name = 'math_500',
@@ -231,34 +367,63 @@ distill_qwen_config = dict(
 )
 
 math_qwen_config = dict(
-    report_path = '
+    report_path = './outputs/20250219_202358',
     model_name = 'Qwen2.5-Math-7B-Instruct',
     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 r1_config = dict(
-    report_path = '
+    report_path = './outputs/20250307_000404',
     model_name = 'deepseek-r1',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
-
-    report_path = '
+qwq_preview_config = dict(
+    report_path = './outputs/20250221_105911',
     model_name = 'qwq-32b-preview',
     tokenizer_path = 'Qwen/QwQ-32B-Preview',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+qwq_config = dict(
+    report_path = './outputs/20250306_181550',
+    model_name = 'QwQ-32B',
+    tokenizer_path = 'Qwen/QwQ-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+distill_qwen_32b = dict(
+    report_path = './outputs/20250306_235951',
+    model_name = 'deepseek-r1-distill-qwen-32b',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 if __name__ == '__main__':
-    run_task(distill_qwen_config)
+    # run_task(distill_qwen_config, count=80)
     # run_task(math_qwen_config)
-    # run_task(
-    # run_task(
+    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
+    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
+    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
+
+    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
+    # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
+    combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
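For orientation, the six metrics reported by the updated EvalThink reduce to simple ratios over per-choice token counts. Below is a minimal standalone sketch of that aggregation with illustrative names, not the adapter's own code; `per_choice` is a hypothetical list of (reasoning_tokens, thought_num, first_correct_tokens) tuples.

# Illustrative aggregation mirroring process_choice()/process_item() above.
def aggregate(per_choice):
    reasoning, thoughts, first_correct = zip(*per_choice)
    reflection = [r - f for r, f in zip(reasoning, first_correct)]
    n = len(per_choice)
    return {
        'reasoning_tokens': sum(reasoning) / n,
        'first_correct_tokens': sum(first_correct) / n,
        'reflection_tokens': sum(reflection) / n,
        'token_efficiency': sum(first_correct) / sum(reasoning),  # useful tokens / total think tokens
        'thought_num': sum(thoughts) / n,
        'accuracy': sum(1 for f in first_correct if f > 0) / n,   # score = 0 when no correct prefix found
    }

print(aggregate([(1200, 5, 400), (900, 3, 0)]))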
evalscope/third_party/thinkbench/infer.py CHANGED
@@ -2,6 +2,7 @@ import os
 
 from evalscope import TaskConfig, run_task
 
+DASHSCOPE_API_KEY = 'sk-723135c241x'
 
 def eval_distill_qwen():
     model_name = 'DeepSeek-R1-Distill-Qwen-7B'
@@ -53,20 +54,48 @@ def eval_r1():
 
     task_config = TaskConfig(
         api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-        api_key=
+        api_key=DASHSCOPE_API_KEY,
         model=model_name,
         eval_type='service',
         datasets=[dataset_name],
         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
-        eval_batch_size=
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000, # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
         generation_config={
             'max_tokens': 12000, # avoid exceed max length
             'temperature': 0.6,
             'top_p': 0.95,
             'n': 1,
         },
-
-
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
     )
     run_task(task_config)
 
@@ -89,12 +118,13 @@ def eval_qwq():
             'top_p': 0.95,
             'n': 1,
         },
-        use_cache='
+        use_cache='./outputs/20250221_105911'
     )
     run_task(task_config)
 
 if __name__ == '__main__':
     # eval_distill_qwen()
     # eval_math_qwen()
-
-    eval_qwq()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,37 +1,67 @@
-
+import os
 from dataclasses import dataclass
-from swift.llm import
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
+from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
+# set the GPU environment variable
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm' # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-
-
-
-
-
-
+        # initialize the model for the chosen infer backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
 
-
-
-
-
-
-
+        # basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False # can be set to True for streaming inference
+        )
 
     def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed as:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "<user question>"},
+        #            {"role": "assistant", "content": "<assistant answer>"}, ...]
+
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
+
+        # append the conversation history to the messages
+        for qa_pair in history:
+            # assumes each element of history looks like ("user input", "model response"); adjust to your data format
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-
+        # extract the returned text (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return
+        return result_text
evalscope/utils/chat_service.py CHANGED
evalscope/utils/filters.py ADDED
@@ -0,0 +1,59 @@
+import re
+from typing import Any, Callable, Dict
+
+
+class Filter:
+    """
+    A base Filter class that implements the registry pattern
+    """
+    _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        """
+        Decorator to register a new filter function
+        """
+
+        def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+            cls._registry[name] = func
+            return func
+
+        return decorator
+
+    @classmethod
+    def get_filter(cls, name: str) -> Callable:
+        """
+        Get a registered filter by name
+        """
+        return cls._registry.get(name)
+
+    @classmethod
+    def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+        """
+        Apply a registered filter to a value
+        """
+        filter_func = cls.get_filter(name)
+        if filter_func is None:
+            raise ValueError(f'Filter {name} not found')
+        return filter_func(value, *args, **kwargs)
+
+
+@Filter.register('remove_until')
+def remove_until(value: str, marker: str) -> str:
+    """
+    Remove everything before the last occurrence of marker
+    """
+    if marker not in value:
+        return value
+    return value[value.rindex(marker) + len(marker):]
+
+
+@Filter.register('extract')
+def extract(value: str, pattern: str) -> str:
+    """
+    Extract content from string using regex pattern
+    """
+    match = re.search(pattern, value)
+    if match:
+        return match.group(0)
+    return ''