evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this version of evalscope has been flagged as a potentially problematic release.

Files changed (85)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +15 -18
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +12 -11
  7. evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  8. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  9. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  10. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  11. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  12. evalscope/benchmarks/data_adapter.py +59 -21
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  22. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  38. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  39. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  40. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  41. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  42. evalscope/benchmarks/super_gpqa/utils.py +85 -0
  43. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  45. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  46. evalscope/benchmarks/utils.py +43 -0
  47. evalscope/collections/evaluator.py +14 -5
  48. evalscope/config.py +15 -2
  49. evalscope/constants.py +14 -0
  50. evalscope/evaluator/evaluator.py +51 -13
  51. evalscope/metrics/llm_judge.py +104 -0
  52. evalscope/metrics/named_metrics.py +1 -0
  53. evalscope/models/__init__.py +2 -1
  54. evalscope/models/base_adapter.py +25 -5
  55. evalscope/models/chat_adapter.py +3 -0
  56. evalscope/models/choice_adapter.py +4 -0
  57. evalscope/models/custom_adapter.py +2 -0
  58. evalscope/models/register.py +28 -0
  59. evalscope/models/server_adapter.py +35 -8
  60. evalscope/perf/arguments.py +13 -7
  61. evalscope/perf/benchmark.py +5 -0
  62. evalscope/perf/http_client.py +15 -5
  63. evalscope/perf/main.py +1 -0
  64. evalscope/perf/utils/analysis_result.py +1 -1
  65. evalscope/report/app.py +3 -0
  66. evalscope/report/combinator.py +2 -2
  67. evalscope/run.py +6 -5
  68. evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope/third_party/thinkbench/eval.py +220 -55
  70. evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  72. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  73. evalscope/utils/chat_service.py +1 -0
  74. evalscope/utils/filters.py +59 -0
  75. evalscope/utils/logger.py +3 -3
  76. evalscope/version.py +2 -2
  77. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
  78. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
  79. tests/cli/test_all.py +144 -0
  80. tests/cli/test_collection.py +28 -2
  81. tests/cli/test_run.py +201 -32
  82. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  83. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  84. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  85. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0

evalscope/third_party/thinkbench/eval.py

@@ -12,7 +12,7 @@ from typing import List
 
 from evalscope.third_party.thinkbench.tools.llm import request_url
 from evalscope.third_party.thinkbench.tools.utils import extract_answer
-from evalscope.utils.io_utils import dump_jsonl_data
+from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 

@@ -28,27 +28,42 @@ class EvalThink:
         self.model_name = model_name
         self.dataset_name = dataset_name
         self.subsets = subsets
-        self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
+        self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens', 'token_efficiency', 'thought_num', 'accuracy']
         self.split_strategies = split_strategies  # split by llm, keywords, separator
         self.judge_config = judge_config
+        self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
+        self.model_parse_dict = self.__init_parse_file()
 
-    @lru_cache(maxsize=None)
-    def get_think_part(self, text):
-        last_think_end = text.rfind(self.think_end_token)
-        return text[:last_think_end].lower()
+    def __init_parse_file(self):
+        if not os.path.exists(self.model_parse_file_path):
+            return {}
+        else:
+            list_file = jsonl_to_list(self.model_parse_file_path)
+            # convert to dict prompt as key, answer_index as value
+            return {item['prompt']: item['answer_index'] for item in list_file}
+
+    def get_think_part(self, message: dict) -> str:
+        if 'reasoning_content' in message and message['reasoning_content']:
+            return message['reasoning_content']
+        else:
+            text = message['content']
+            last_think_end = text.rfind(self.think_end_token)
+            return text[:last_think_end]
 
     @lru_cache(maxsize=None)
     def cal_tokens(self, text: str):
         return len(self.tokenizer.encode(text, add_special_tokens=False))
 
     def process_choice(self, choice, problem):
-        think_part = self.get_think_part(choice['message']['content'])
+        think_part = self.get_think_part(choice['message'])
         answer = choice['review']['gold']
         tokens = self.cal_tokens(think_part)
-        switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+        switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
         useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
-        score = choice['review']['result']
-        return tokens, switch_count, useful_tokens, score
+        reflection_tokens = tokens - useful_tokens
+        # score = choice['review']['result']
+        score = 0 if useful_tokens == 0 else 1
+        return tokens, switch_count, useful_tokens, reflection_tokens, score
 
     def process_item(self, item):
         problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''

@@ -57,14 +72,15 @@ class EvalThink:
             results.append(self.process_choice(choice, problem))
             break  # only process the first choice
 
-        tokens, switch_counts, useful_tokens, scores = zip(*results)
+        total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
 
-        avg_tokens = sum(tokens) / len(tokens)
+        avg_tokens = sum(total_tokens) / len(total_tokens)
         avg_thought_num = sum(switch_counts) / len(switch_counts)
-        avg_token_efficiency = sum(useful_tokens) / sum(tokens)
+        avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
         avg_accuracy = sum(scores) / len(scores)
-
-        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy
+        avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
+        avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
+        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
 
     def split_by_llm(self, response, problem) -> List[str]:
         response = response.replace('\n', ' ')  # remove newline characters
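
The reworked process_choice/process_item above reduces each response to a handful of token counts. A minimal, self-contained sketch of the same per-response arithmetic, with illustrative numbers (the helper below is not part of the package):

    # Illustrative sketch of the per-choice bookkeeping in process_choice (not package code).
    def summarize_choice(reasoning_tokens: int, first_correct_tokens: int) -> dict:
        reflection_tokens = reasoning_tokens - first_correct_tokens  # tokens spent after the first correct answer
        return {
            'reasoning_tokens': reasoning_tokens,
            'first_correct_tokens': first_correct_tokens,
            'reflection_tokens': reflection_tokens,
            'token_efficiency': first_correct_tokens / reasoning_tokens if reasoning_tokens else 0.0,
            'accuracy': 0 if first_correct_tokens == 0 else 1,  # new proxy: correct iff a first correct answer exists
        }

    print(summarize_choice(1200, 400))
    # -> reflection_tokens = 800, token_efficiency ~= 0.33, accuracy = 1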

@@ -90,12 +106,17 @@ class EvalThink:
         tagged_response = tagged_response.strip()
 
         prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
-        llm_response = request_url(self.judge_config, prompt)
-        answer_index = extract_answer(llm_response)
-
-        dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
-                        os.path.join(self.report_path, 'answer_index.jsonl'),
-                        dump_mode='append')
+        if prompt in self.model_parse_dict:
+            answer_index = self.model_parse_dict[prompt]
+        else:
+            llm_response = request_url(self.judge_config, prompt)
+            if not llm_response:
+                answer_index = -1
+            else:
+                answer_index = extract_answer(llm_response)
+
+            dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                            self.model_parse_file_path, dump_mode='append')
         try:
             answer_index = int(answer_index)
         except Exception:

@@ -119,18 +140,27 @@ class EvalThink:
         return first_correct
 
     def plot_metrics(self, results, output_dir):
-        fig = make_subplots(rows=1, cols=len(self.metrics),
-                            subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
-                            shared_xaxes=True, x_title='Subsets')
-
-
-        for i, metric in enumerate(self.metrics, start=1):
+        # Change layout to 2x3
+        fig = make_subplots(rows=2, cols=3,
+                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
+                            shared_xaxes=True, x_title='Subsets',
+                            vertical_spacing=0.1,  # Decrease vertical spacing between subplots
+                            horizontal_spacing=0.1)  # Decrease horizontal spacing between subplots
+
+        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                         'token_efficiency', 'thought_num', 'accuracy']
+
+        for i, metric in enumerate(metrics_order, start=1):
             y_values = [results[metric][subset] for subset in self.subsets]
+            # Determine row and column for 2x3 layout
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
             fig.add_trace(
                 go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                            mode='lines+markers',
                            name=metric.replace('_', ' ').title()),
-                row=1, col=i
+                row=row, col=col
             )
             # Add annotations for each data point
             for j, y in enumerate(y_values):

@@ -140,28 +170,34 @@ class EvalThink:
                     text=f'{y:.2f}',
                     showarrow=False,
                     yshift=10,
-                    row=1,
-                    col=i
+                    row=row,
+                    col=col
                 )
 
         fig.update_layout(
-            height=500,
-            width=1500,
+            height=800,  # Adjust height for 2x3 layout
+            width=1200,  # Adjust width for 2x3 layout
             title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
             legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
         )
 
-        for i in range(1, len(self.metrics) + 1):
+        for i in range(1, len(metrics_order) + 1):
+            row = (i - 1) // 3 + 1
+            col = (i - 1) % 3 + 1
             fig.update_xaxes(
                 ticktext=self.subsets,
                 tickvals=list(range(len(self.subsets))),
-                row=1, col=i
+                row=row, col=col
             )
-            fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
+            fig.update_yaxes(title_text=metrics_order[i-1].replace('_', ' ').title(), row=row, col=col)
+
         # Update y-axis ranges
-        fig.update_yaxes(range=[0, 1], row=1, col=1)  # Token Efficiency
-        fig.update_yaxes(range=[0, 13], row=1, col=3)  # Switch Frequency
-        fig.update_yaxes(range=[0, 1], row=1, col=4)  # Accuracy
+        fig.update_yaxes(range=[500, 5000], row=1, col=1)  # Reasoning Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=2)  # First Correct Tokens
+        fig.update_yaxes(range=[0, 3000], row=1, col=3)  # Reflection Tokens
+        fig.update_yaxes(range=[0, 1], row=2, col=1)  # Token Efficiency
+        fig.update_yaxes(range=[0, 13], row=2, col=2)  # Thought Num
+        fig.update_yaxes(range=[0, 1], row=2, col=3)  # Accuracy
 
         os.makedirs(output_dir, exist_ok=True)
         output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')

@@ -179,7 +215,7 @@ class EvalThink:
         return df[bools].head(count)
 
 
-    def evaluate(self, output_dir, max_tokens=8000, count=50):
+    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
         for subset in self.subsets:
             review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
             review_df = pd.read_json(review_path, lines=True)

@@ -191,15 +227,17 @@ class EvalThink:
                 (item for _, item in review_df.iterrows()),
                 desc=f'Evaluating {subset}',
                 total=len(review_df),
-                max_workers=16
+                max_workers=workers
             )
 
-            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)
 
-            self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
+            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
             self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
             self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
             self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
+            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)
 
 
         results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}

@@ -207,13 +245,111 @@ class EvalThink:
 
         self.plot_metrics(results, output_dir)
 
+        # save results to json
+        dict_to_json(results, os.path.join(self.report_path, f'think_eval_results.json'))
         return results
 
-def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
     evaluator = EvalThink(**config,)
-    results = evaluator.evaluate(output_dir, max_tokens, count)
+    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
     print(results)
 
+def combine_results(configs: List[dict], output_path: str):
+    """
+    Combine evaluation results from multiple model configs into one plot.
+    All models' results for the same metric will be shown in the same subplot for easy comparison.
+
+    Args:
+        configs: List of model config dicts containing model_name and report_path
+    """
+    # Combine results from different runs
+    combined_results = defaultdict(lambda: defaultdict(dict))
+    for config in configs:
+        model_name = config['model_name']
+        report_path = config['report_path']
+        # Results is a dict with metric as key and subset as value
+        results = json_to_dict(os.path.join(report_path, f'think_eval_results.json'))
+        combined_results[model_name] = results
+
+    # Create a 2x3 subplot layout, one subplot per metric
+    fig = make_subplots(rows=2, cols=3,
+                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
+                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
+                        shared_xaxes=True, x_title='Subsets',
+                        vertical_spacing=0.08,  # reduce vertical spacing
+                        horizontal_spacing=0.05)  # reduce horizontal spacing
+
+    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
+                     'token_efficiency', 'thought_num', 'accuracy']
+
+    # Assign different colors for each model
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
+
+    # Plot each metric in a separate subplot
+    for i, metric in enumerate(metrics_order, start=1):
+        row = (i - 1) // 3 + 1
+        col = (i - 1) % 3 + 1
+
+        # Get subsets from first model (assuming all models have same subsets)
+        subsets = list(next(iter(combined_results.values()))[metric].keys())
+
+        # Add all models' data for this metric to the same subplot
+        for j, (model_name, results) in enumerate(combined_results.items()):
+            y_values = [results[metric][subset] for subset in subsets]
+
+            fig.add_trace(
+                go.Scatter(x=subsets, y=y_values,
+                           mode='lines+markers',
+                           name=model_name,  # Just model name since metrics are shown in subplot titles
+                           line=dict(color=colors[j % len(colors)]),
+                           showlegend=(i == 1)),  # Only show legend for first metric
+                row=row, col=col
+            )
+
+            # Add value annotations
+            for k, y in enumerate(y_values):
+                fig.add_annotation(
+                    x=subsets[k],
+                    y=y,
+                    text=f'{y:.2f}',
+                    showarrow=False,
+                    yshift=10,
+                    font=dict(size=12, color=colors[j % len(colors)]),
+                    row=row, col=col
+                )
+
+        # Update axis ranges and labels based on metric type
+        # if metric == 'token_efficiency':
+        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
+        # elif metric == 'accuracy':
+        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)
+
+        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)
+
+    # Update layout
+    fig.update_layout(
+        height=1000,  # increase height
+        width=1500,  # increase width
+        title_text=f'Model Comparison Across Evaluation Metrics on MATH-500',
+        title=dict(font=dict(size=22)),  # increase title font size
+        font=dict(size=14),  # increase overall font size
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1,
+            font=dict(size=14)  # increase legend font size
+        )
+    )
+
+    # Save plot
+    os.makedirs('outputs', exist_ok=True)
+    fig.write_image(output_path)
+    print(f'Model comparison plot saved to {output_path}')
+
+    return combined_results
+
 
 judge_config = dict(
     api_key='EMPTY',

@@ -221,7 +357,7 @@ judge_config = dict(
 )
 
 distill_qwen_config = dict(
-    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
+    report_path = './outputs/20250218_180219',
     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
     dataset_name = 'math_500',

@@ -231,34 +367,63 @@ distill_qwen_config = dict(
 )
 
 math_qwen_config = dict(
-    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
+    report_path = './outputs/20250219_202358',
     model_name = 'Qwen2.5-Math-7B-Instruct',
     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 r1_config = dict(
-    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
+    report_path = './outputs/20250307_000404',
     model_name = 'deepseek-r1',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
-qwq_config = dict(
-    report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
+qwq_preview_config = dict(
+    report_path = './outputs/20250221_105911',
     model_name = 'qwq-32b-preview',
     tokenizer_path = 'Qwen/QwQ-32B-Preview',
     dataset_name = 'math_500',
     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-    split_strategies='separator'
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+qwq_config = dict(
+    report_path = './outputs/20250306_181550',
+    model_name = 'QwQ-32B',
+    tokenizer_path = 'Qwen/QwQ-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
+distill_qwen_32b = dict(
+    report_path = './outputs/20250306_235951',
+    model_name = 'deepseek-r1-distill-qwen-32b',
+    tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
 )
 
 if __name__ == '__main__':
-    run_task(distill_qwen_config)
+    # run_task(distill_qwen_config, count=80)
     # run_task(math_qwen_config)
-    # run_task(r1_config)
-    # run_task(qwq_config)
+    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
+    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
+    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
+
+    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
+    # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
+    combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
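
combine_results stitches together the per-model think_eval_results.json files that evaluate() now writes, keyed first by metric and then by subset. A hypothetical example of that shape, with invented values purely for illustration:

    # Hypothetical contents of <report_path>/think_eval_results.json (illustrative values only)
    example_results = {
        'reasoning_tokens':     {'Level 1': 812.4, 'Level 5': 3420.7},
        'first_correct_tokens': {'Level 1': 401.2, 'Level 5': 2010.3},
        'reflection_tokens':    {'Level 1': 411.2, 'Level 5': 1410.4},
        'token_efficiency':     {'Level 1': 0.49,  'Level 5': 0.59},
        'thought_num':          {'Level 1': 2.1,   'Level 5': 7.8},
        'accuracy':             {'Level 1': 0.98,  'Level 5': 0.82},
    }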

evalscope/third_party/thinkbench/infer.py

@@ -2,6 +2,7 @@ import os
 
 from evalscope import TaskConfig, run_task
 
+DASHSCOPE_API_KEY = 'sk-723135c241x'
 
 def eval_distill_qwen():
     model_name = 'DeepSeek-R1-Distill-Qwen-7B'

@@ -53,20 +54,48 @@ def eval_r1():
 
     task_config = TaskConfig(
         api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-        api_key=os.environ['DASHSCOPE_API_KEY'],
+        api_key=DASHSCOPE_API_KEY,
         model=model_name,
         eval_type='service',
         datasets=[dataset_name],
         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
-        eval_batch_size=3,
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
         generation_config={
             'max_tokens': 12000,  # avoid exceed max length
             'temperature': 0.6,
             'top_p': 0.95,
             'n': 1,
         },
-        limit=50,
-        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
     )
     run_task(task_config)
 

@@ -89,12 +118,13 @@ def eval_qwq():
             'top_p': 0.95,
             'n': 1,
         },
-        use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
+        use_cache='./outputs/20250221_105911'
     )
     run_task(task_config)
 
 if __name__ == '__main__':
     # eval_distill_qwen()
     # eval_math_qwen()
-    # eval_r1()
-    eval_qwq()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()

evalscope/third_party/thinkbench/tools/llm.py

@@ -15,6 +15,7 @@ def request_url(llm_config, content):
         return completion.choices[0].message.content
     except Exception as e:
         print(e)
+        return None
 
 def request_qwen(content):
     try:

evalscope/third_party/toolbench_static/llm/swift_infer.py

@@ -1,37 +1,67 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 from dataclasses import dataclass
-from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
+from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
+# Set GPU environment variable
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-        model_type = args.model_type
-        template_type = get_default_template_type(model_type)
-        model, tokenizer = get_model_tokenizer(
-            model_type, model_id_or_path=args.model_id_or_path, model_kwargs={'device_map': 'auto'})
-        model.generation_config.max_new_tokens = args.max_new_tokens
-        print(f'** Generation config: {model.generation_config}')
+        # Initialize the model for the chosen inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
 
-        template = get_template(template_type, tokenizer)
-        seed_everything(42)
-
-        self.tokenizer = tokenizer
-        self.model = model
-        self.template = template
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
     def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed in as:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "<user question>"},
+        #            {"role": "assistant", "content": "<assistant answer>"}, ...]
+
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
+
+        # Append the conversation history to the messages
+        for qa_pair in history:
+            # Assumes each history element looks like ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user query
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-        response, history = inference(self.model, self.template, query=query, system=system, history=history)
+        # Extract the text result returned by the model (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return response
+        return result_text
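
The rewritten SwiftInfer wraps an ms-swift 3.x inference engine behind a single predict(system, query, history) call. A minimal usage sketch under that assumption; the checkpoint and model_type below are illustrative examples, not values taken from the diff:

    # Usage sketch only; assumes ms-swift 3.x is installed and the classes above are importable.
    args = SwiftInferArgs(
        model_id_or_path='Qwen/Qwen2.5-7B-Instruct',  # example checkpoint
        model_type='qwen2_5',                         # example model type
        infer_backend='pt',                           # avoids the vLLM/LMDeploy dependency
        max_new_tokens=512,
    )
    infer = SwiftInfer(args)
    history = [('What is 2 + 2?', 'It is 4.')]
    print(infer.predict(system='You are a helpful assistant.',
                        query='And what is 4 squared?',
                        history=history))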

evalscope/utils/chat_service.py

@@ -32,6 +32,7 @@ class ModelList(BaseModel):
 class ChatMessage(BaseModel):
     role: Literal['user', 'assistant', 'system']
     content: str
+    reasoning_content: Optional[str] = None
 
 
 class DeltaMessage(BaseModel):

evalscope/utils/filters.py (new file)

@@ -0,0 +1,59 @@
+import re
+from typing import Any, Callable, Dict
+
+
+class Filter:
+    """
+    A base Filter class that implements the registry pattern
+    """
+    _registry: Dict[str, Callable[[str, Any], str]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        """
+        Decorator to register a new filter function
+        """
+
+        def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
+            cls._registry[name] = func
+            return func
+
+        return decorator
+
+    @classmethod
+    def get_filter(cls, name: str) -> Callable:
+        """
+        Get a registered filter by name
+        """
+        return cls._registry.get(name)
+
+    @classmethod
+    def apply(cls, name: str, value: str, *args, **kwargs) -> str:
+        """
+        Apply a registered filter to a value
+        """
+        filter_func = cls.get_filter(name)
+        if filter_func is None:
+            raise ValueError(f'Filter {name} not found')
+        return filter_func(value, *args, **kwargs)
+
+
+@Filter.register('remove_until')
+def remove_until(value: str, marker: str) -> str:
+    """
+    Remove everything before the last occurrence of marker
+    """
+    if marker not in value:
+        return value
+    return value[value.rindex(marker) + len(marker):]
+
+
+@Filter.register('extract')
+def extract(value: str, pattern: str) -> str:
+    """
+    Extract content from string using regex pattern
+    """
+    match = re.search(pattern, value)
+    if match:
+        return match.group(0)
+    return ''
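
The new Filter registry gives benchmark adapters a small, declarative way to post-process model output. A short usage sketch against the code above, assuming the module is importable as evalscope.utils.filters (the sample strings are illustrative):

    from evalscope.utils.filters import Filter

    raw = 'Let me think step by step...</think>The answer is B'
    cleaned = Filter.apply('remove_until', raw, '</think>')  # drop everything up to the last </think>
    choice = Filter.apply('extract', cleaned, r'[A-D]')      # first capital-letter option in what remains
    print(cleaned)  # 'The answer is B'
    print(choice)   # 'B'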