evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (58)
  1. evalscope/arguments.py +2 -0
  2. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  3. evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
  4. evalscope/benchmarks/benchmark.py +3 -1
  5. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
  6. evalscope/benchmarks/data_adapter.py +71 -18
  7. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
  8. evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
  9. evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
  11. evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
  12. evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
  13. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
  14. evalscope/benchmarks/musr/__init__.py +0 -0
  15. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  16. evalscope/benchmarks/process_bench/__init__.py +0 -0
  17. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  18. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  19. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
  20. evalscope/cli/start_app.py +4 -1
  21. evalscope/cli/start_eval.py +4 -3
  22. evalscope/cli/start_perf.py +4 -2
  23. evalscope/collections/evaluator.py +6 -0
  24. evalscope/config.py +3 -1
  25. evalscope/evaluator/evaluator.py +3 -1
  26. evalscope/metrics/__init__.py +2 -1
  27. evalscope/metrics/metrics.py +23 -2
  28. evalscope/models/base_adapter.py +7 -1
  29. evalscope/models/chat_adapter.py +1 -1
  30. evalscope/models/local_model.py +3 -2
  31. evalscope/models/server_adapter.py +79 -28
  32. evalscope/perf/__init__.py +0 -1
  33. evalscope/perf/arguments.py +5 -1
  34. evalscope/perf/http_client.py +2 -2
  35. evalscope/perf/plugin/api/openai_api.py +11 -1
  36. evalscope/perf/utils/benchmark_util.py +6 -2
  37. evalscope/report/app.py +12 -8
  38. evalscope/run.py +1 -1
  39. evalscope/third_party/thinkbench/__init__.py +3 -0
  40. evalscope/third_party/thinkbench/eval.py +264 -0
  41. evalscope/third_party/thinkbench/infer.py +100 -0
  42. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  43. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  44. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  45. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  46. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  47. evalscope/utils/model_utils.py +17 -1
  48. evalscope/utils/utils.py +45 -45
  49. evalscope/version.py +2 -2
  50. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
  51. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
  52. tests/cli/test_run.py +27 -15
  53. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  54. /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
  55. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  56. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  57. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  58. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/third_party/thinkbench/eval.py ADDED
@@ -0,0 +1,264 @@
+ import json
+ import os
+ import pandas as pd
+ import plotly.graph_objects as go
+ import re
+ from collections import defaultdict
+ from functools import lru_cache
+ from modelscope import AutoTokenizer
+ from plotly.subplots import make_subplots
+ from tqdm.contrib.concurrent import thread_map
+ from typing import List
+
+ from evalscope.third_party.thinkbench.tools.llm import request_url
+ from evalscope.third_party.thinkbench.tools.utils import extract_answer
+ from evalscope.utils.io_utils import dump_jsonl_data
+
+ cur_path = os.path.dirname(os.path.abspath(__file__))
+
+ class EvalThink:
+     def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
+         self.report_path = report_path
+         self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
+         self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
+         self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
+         self.subset_dict = defaultdict(lambda: defaultdict(list))
+         self.think_end_token = '</think>'
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+         self.model_name = model_name
+         self.dataset_name = dataset_name
+         self.subsets = subsets
+         self.metrics = ['token_efficiency', 'completion_len', 'thought_num', 'accuracy']
+         self.split_strategies = split_strategies  # split by llm, keywords, separator
+         self.judge_config = judge_config
+
+     @lru_cache(maxsize=None)
+     def get_think_part(self, text):
+         last_think_end = text.rfind(self.think_end_token)
+         return text[:last_think_end].lower()
+
+     @lru_cache(maxsize=None)
+     def cal_tokens(self, text: str):
+         return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+     def process_choice(self, choice, problem):
+         think_part = self.get_think_part(choice['message']['content'])
+         answer = choice['review']['gold']
+         tokens = self.cal_tokens(think_part)
+         switch_count = sum(think_part.count(token) for token in self.switch_tokens)
+         useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
+         score = choice['review']['result']
+         return tokens, switch_count, useful_tokens, score
+
+     def process_item(self, item):
+         problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
+         results = []
+         for choice in item['choices']:
+             results.append(self.process_choice(choice, problem))
+             break  # only process the first choice
+
+         tokens, switch_counts, useful_tokens, scores = zip(*results)
+
+         avg_tokens = sum(tokens) / len(tokens)
+         avg_thought_num = sum(switch_counts) / len(switch_counts)
+         avg_token_efficiency = sum(useful_tokens) / sum(tokens)
+         avg_accuracy = sum(scores) / len(scores)
+
+         return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy
+
+     def split_by_llm(self, response, problem) -> List[str]:
+         response = response.replace('\n', ' ')  # remove newline characters
+         prompt = self.reformat_template.format(problem=problem, response=response)
+         llm_response = request_url(self.judge_config, prompt)
+         return llm_response.split('\n\n')
+
+     def split_by_keywords(self, text) -> List[str]:
+         pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
+         segments = re.split(pattern, text)
+         # remove empty segments
+         segments = [segment.strip() for segment in segments if segment.strip()]
+
+         return segments if segments else [text]
+
+     def split_by_separator(self, text) -> List[str]:
+         return text.split('\n\n')
+
+     def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
+         tagged_response = ''
+         for sdx, step in enumerate(response):
+             tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+         tagged_response = tagged_response.strip()
+
+         prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
+         llm_response = request_url(self.judge_config, prompt)
+         answer_index = extract_answer(llm_response)
+
+         dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
+                         os.path.join(self.report_path, 'answer_index.jsonl'),
+                         dump_mode='append')
+         try:
+             answer_index = int(answer_index)
+         except Exception:
+             answer_index = -1
+         return answer_index
+
+     def get_first_correct(self, response: str, problem: str, answer: str) -> str:
+         if self.split_strategies == 'llm':
+             text_list = self.split_by_llm(response, problem)
+         elif self.split_strategies == 'keywords':
+             text_list = self.split_by_keywords(response)
+         else:
+             text_list = self.split_by_separator(response)
+
+         answer_index = self.get_answer_index(text_list, problem, answer)
+
+         if answer_index == -1:  # no correct answer found
+             first_correct = ''
+         else:
+             first_correct = '\n\n'.join(text_list[: answer_index])
+         return first_correct
+
+     def plot_metrics(self, results, output_dir):
+         fig = make_subplots(rows=1, cols=len(self.metrics),
+                             subplot_titles=('Token Efficiency', 'Completion Length', 'Thought Num', 'Accuracy'),
+                             shared_xaxes=True, x_title='Subsets')
+
+         for i, metric in enumerate(self.metrics, start=1):
+             y_values = [results[metric][subset] for subset in self.subsets]
+             fig.add_trace(
+                 go.Scatter(x=list(range(len(self.subsets))), y=y_values,
+                            mode='lines+markers',
+                            name=metric.replace('_', ' ').title()),
+                 row=1, col=i
+             )
+             # Add annotations for each data point
+             for j, y in enumerate(y_values):
+                 fig.add_annotation(
+                     x=j,
+                     y=y,
+                     text=f'{y:.2f}',
+                     showarrow=False,
+                     yshift=10,
+                     row=1,
+                     col=i
+                 )
+
+         fig.update_layout(
+             height=500,
+             width=1500,
+             title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
+             legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+         )
+
+         for i in range(1, len(self.metrics) + 1):
+             fig.update_xaxes(
+                 ticktext=self.subsets,
+                 tickvals=list(range(len(self.subsets))),
+                 row=1, col=i
+             )
+             fig.update_yaxes(title_text=self.metrics[i-1].replace('_', ' ').title(), row=1, col=i)
+         # Update y-axis ranges
+         fig.update_yaxes(range=[0, 1], row=1, col=1)   # Token Efficiency
+         fig.update_yaxes(range=[0, 13], row=1, col=3)  # Switch Frequency
+         fig.update_yaxes(range=[0, 1], row=1, col=4)   # Accuracy
+
+         os.makedirs(output_dir, exist_ok=True)
+         output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
+         fig.write_image(output_path)
+         print(f'save figure to: {output_path}')
+
+     def filter_df(self, df, response_len: int = 8000, count: int = 10):
+         def is_valid_row(row):
+             return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])
+
+         bools = df.apply(is_valid_row, axis=1)
+
+         return df[bools].head(count)
+
+     def evaluate(self, output_dir, max_tokens=8000, count=50):
+         for subset in self.subsets:
+             review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
+             review_df = pd.read_json(review_path, lines=True)
+
+             review_df = self.filter_df(review_df, response_len=max_tokens, count=count)
+
+             results = thread_map(
+                 self.process_item,
+                 (item for _, item in review_df.iterrows()),
+                 desc=f'Evaluating {subset}',
+                 total=len(review_df),
+                 max_workers=16
+             )
+
+             avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy = zip(*results)
+
+             self.subset_dict[subset]['completion_len'] = sum(avg_tokens) / len(avg_tokens)
+             self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
+             self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
+             self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
+
+         results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
+                    for metric in self.metrics}
+
+         self.plot_metrics(results, output_dir)
+
+         return results
+
+ def run_task(config, output_dir='outputs', max_tokens=8000, count=50):
+     evaluator = EvalThink(**config,)
+     results = evaluator.evaluate(output_dir, max_tokens, count)
+     print(results)
+
+ judge_config = dict(
+     api_key='EMPTY',
+     base_url='http://0.0.0.0:8801/v1',
+     model_name='Qwen2.5-72B-Instruct',
+ )
+
+ distill_qwen_config = dict(
+     report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250218_180219',
+     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
+     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
+     dataset_name = 'math_500',
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
+ math_qwen_config = dict(
+     report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250219_202358',
+     model_name = 'Qwen2.5-Math-7B-Instruct',
+     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
+     dataset_name = 'math_500',
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator'
+ )
+
+ r1_config = dict(
+     report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202',
+     model_name = 'deepseek-r1',
+     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
+     dataset_name = 'math_500',
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator'
+ )
+
+ qwq_config = dict(
+     report_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911',
+     model_name = 'qwq-32b-preview',
+     tokenizer_path = 'Qwen/QwQ-32B-Preview',
+     dataset_name = 'math_500',
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator'
+ )
+
+ if __name__ == '__main__':
+     run_task(distill_qwen_config)
+     # run_task(math_qwen_config)
+     # run_task(r1_config)
+     # run_task(qwq_config)
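
Editor's note: the file is driven by the example configs at its bottom. A minimal sketch of reusing the new evaluator against your own evalscope output directory; the report path, tokenizer id and judge endpoint below are placeholders, not values shipped in this release.

    from evalscope.third_party.thinkbench.eval import run_task

    judge_config = dict(
        api_key='EMPTY',
        base_url='http://127.0.0.1:8801/v1',   # placeholder OpenAI-compatible judge endpoint
        model_name='Qwen2.5-72B-Instruct',
    )

    config = dict(
        report_path='outputs/20250218_180219',  # placeholder: an existing evalscope report directory
        model_name='DeepSeek-R1-Distill-Qwen-7B',
        tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        dataset_name='math_500',
        subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
        split_strategies='separator',           # or 'llm' / 'keywords'
        judge_config=judge_config,              # used by the critique step that locates the first correct answer
    )

    run_task(config, output_dir='outputs', max_tokens=8000, count=50)

Note that token_efficiency depends on a judge model locating the earliest correct step, so the judge endpoint must be reachable even when split_strategies is 'separator' or 'keywords'.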
evalscope/third_party/thinkbench/infer.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ from evalscope import TaskConfig, run_task
+
+
+ def eval_distill_qwen():
+     model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+     dataset_name = 'math_500'
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+     task_config = TaskConfig(
+         api_url='http://0.0.0.0:8801/v1/chat/completions',
+         model=model_name,
+         eval_type='service',
+         datasets=[dataset_name],
+         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+         eval_batch_size=32,
+         generation_config={
+             'max_tokens': 20000,  # avoid exceed max length
+             'temperature': 0.6,
+             'top_p': 0.95,
+             'n': 1,
+         },
+     )
+     run_task(task_config)
+
+
+ def eval_math_qwen():
+     model_name = 'Qwen2.5-Math-7B-Instruct'
+     dataset_name = 'math_500'
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+     task_config = TaskConfig(
+         api_url='http://0.0.0.0:8801/v1/chat/completions',
+         model=model_name,
+         eval_type='service',
+         datasets=[dataset_name],
+         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+         eval_batch_size=32,
+         generation_config={
+             'max_tokens': 3000,  # avoid exceed max length
+             'temperature': 0.6,
+             'top_p': 0.95,
+             'n': 3,
+         },
+     )
+     run_task(task_config)
+
+ def eval_r1():
+     model_name = 'deepseek-r1'
+     dataset_name = 'math_500'
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+     task_config = TaskConfig(
+         api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+         api_key=os.environ['DASHSCOPE_API_KEY'],
+         model=model_name,
+         eval_type='service',
+         datasets=[dataset_name],
+         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+         eval_batch_size=3,
+         generation_config={
+             'max_tokens': 12000,  # avoid exceed max length
+             'temperature': 0.6,
+             'top_p': 0.95,
+             'n': 1,
+         },
+         limit=50,
+         use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_104202'
+     )
+     run_task(task_config)
+
+ def eval_qwq():
+     model_name = 'qwq-32b-preview'
+     dataset_name = 'math_500'
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+     task_config = TaskConfig(
+         api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+         api_key=os.environ['DASHSCOPE_API_KEY'],
+         model=model_name,
+         eval_type='service',
+         datasets=[dataset_name],
+         dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+         eval_batch_size=32,
+         generation_config={
+             'max_tokens': 8000,  # avoid exceed max length
+             'temperature': 0.6,
+             'top_p': 0.95,
+             'n': 1,
+         },
+         use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250221_105911'
+     )
+     run_task(task_config)
+
+ if __name__ == '__main__':
+     # eval_distill_qwen()
+     # eval_math_qwen()
+     # eval_r1()
+     eval_qwq()
evalscope/third_party/thinkbench/resources/critique_template.txt ADDED
@@ -0,0 +1,17 @@
+ The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+ [Math Problem]
+
+ {problem}
+
+ [Correct Answer]
+
+ {answer}
+
+ [Solution]
+
+ {tagged_response}
+
+ Your task is to review and critique the solution paragraph by paragraph. Once you identify an correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+ Please put your final answer (i.e., the index) in \boxed{{}}.
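
Editor's note: this template is consumed by EvalThink.get_answer_index above, which wraps each reasoning step in <paragraph_i> tags, sends the filled prompt to the judge, and reads the index back from \boxed{}. A rough sketch of that round trip (the judge call is omitted and the values are illustrative; the template path is shown relative to the package root):

    from evalscope.third_party.thinkbench.tools.utils import extract_answer

    steps = ['First, note that 2 + 2 = 4.', 'Therefore the answer is 4.']
    tagged = '\n\n'.join(f'<paragraph_{i}>\n{s}\n</paragraph_{i}>' for i, s in enumerate(steps))

    template = open('evalscope/third_party/thinkbench/resources/critique_template.txt').read()
    prompt = template.format(problem='What is 2 + 2?', answer='4', tagged_response=tagged)

    # The judge model replies with a critique; the paragraph index is read from \boxed{...}.
    judge_reply = 'The earliest correct answer appears in paragraph 1, so \\boxed{1}.'
    print(extract_answer(judge_reply))  # -> '1'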
evalscope/third_party/thinkbench/resources/reformat_template.txt ADDED
@@ -0,0 +1,31 @@
+ I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+ * Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+ - Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+ - Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+ - If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+ - Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+ * For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+ * Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+ * Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+ * Reply with the reformatted solution directly.
+
+ --------------------------------------------------
+
+ Here is the math problem, and the solution that needs to be reformatted:
+
+ [Math Problem]
+
+ {problem}
+
+ [Solution]
+
+ {response}
evalscope/third_party/thinkbench/tools/__init__.py (file without changes)
evalscope/third_party/thinkbench/tools/llm.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ from openai import OpenAI
+
+
+ def request_url(llm_config, content):
+     try:
+         client = OpenAI(
+             api_key=llm_config['api_key'],
+             base_url=llm_config['base_url'],
+         )
+         completion = client.chat.completions.create(
+             model=llm_config['model_name'],
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
+
+ def request_qwen(content):
+     try:
+         client = OpenAI(
+             api_key=os.getenv('DASHSCOPE_API_KEY'),
+             base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+         )
+
+         completion = client.chat.completions.create(
+             model='qwen-max',
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
+
+
+ def request_local(content):
+     try:
+         client = OpenAI(
+             api_key='EMPTY',
+             base_url='http://0.0.0.0:8801/v1',
+         )
+         completion = client.chat.completions.create(
+             model='Qwen2.5-72B-Instruct',
+             messages=[{'role': 'user', 'content': content}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         print(e)
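
Editor's note: request_url is the only helper the thinkbench evaluator imports; the other two appear to be convenience variants. A small usage sketch (the endpoint and model name are placeholders). On failure the function only prints the exception and implicitly returns None, so callers may want to check for that:

    from evalscope.third_party.thinkbench.tools.llm import request_url

    judge_config = dict(
        api_key='EMPTY',
        base_url='http://127.0.0.1:8801/v1',   # placeholder OpenAI-compatible endpoint
        model_name='Qwen2.5-72B-Instruct',
    )

    reply = request_url(judge_config, 'Reply with the single word OK.')
    if reply is None:
        raise RuntimeError('judge request failed')
    print(reply)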
evalscope/third_party/thinkbench/tools/utils.py ADDED
@@ -0,0 +1,13 @@
+ import re
+
+
+ def extract_answer(solution_text: str):
+     boxed_pattern = r'\\boxed\{([^}]*)\}'
+     matches = re.findall(boxed_pattern, solution_text)
+     if matches:
+         last_boxed_content = matches[-1]
+         number_pattern = r'-?\d+'
+         number_matches = re.findall(number_pattern, last_boxed_content)
+         if number_matches:
+             return number_matches[-1].strip()
+     return None
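
Editor's note: the helper takes the last \boxed{...} expression in a judge response and keeps only the final integer inside it, so non-numeric boxes yield None. A quick illustration:

    from evalscope.third_party.thinkbench.tools.utils import extract_answer

    print(extract_answer('The first correct step is paragraph 2, so \\boxed{2}.'))  # '2'
    print(extract_answer('No correct answer was found: \\boxed{-1}.'))              # '-1'
    print(extract_answer('The answer is \\boxed{x + 1}.'))                          # '1' (digits only)
    print(extract_answer('No boxed expression here.'))                              # None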
evalscope/utils/model_utils.py CHANGED
@@ -1,5 +1,6 @@
+ import os
  from enum import Enum
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional, Tuple, Union

  if TYPE_CHECKING:
      from transformers import GenerationConfig
@@ -22,3 +23,18 @@ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
      generation_config.temperature = 1.
      generation_config.top_p = 1.
      generation_config.top_k = 50
+
+
+ def get_device() -> str:
+     from transformers.utils import is_torch_cuda_available, is_torch_mps_available, is_torch_npu_available
+
+     if is_torch_npu_available():
+         device = 'npu'
+     elif is_torch_mps_available():
+         device = 'mps'
+     elif is_torch_cuda_available():
+         device = 'cuda'
+     else:
+         device = 'cpu'
+
+     return device
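
Editor's note: get_device gives local model loading a single place to pick an accelerator, preferring NPU, then Apple MPS, then CUDA, with a CPU fallback. A hedged sketch of how a caller might use it (the model id is a placeholder; evalscope's own local_model.py wiring is not reproduced here):

    from evalscope.utils.model_utils import get_device
    from transformers import AutoModelForCausalLM

    device = get_device()  # 'npu' | 'mps' | 'cuda' | 'cpu'
    model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder model id
    model = model.to(device)
    print(f'running on {device}')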
evalscope/utils/utils.py CHANGED
@@ -101,50 +101,50 @@ class ResponseParser:
          options_concat = '|'.join([str(i) for i in options])

          patterns = [
-             f'答案是?\s?([{options_concat}])',
-             f'答案是?\s?:([{options_concat}])',
-             f'答案是?\s?:([{options_concat}])',
-             f'答案应该?是\s?([{options_concat}])',
-             f'答案应该?选\s?([{options_concat}])',
-             f'答案为\s?([{options_concat}])',
-             f'答案选\s?([{options_concat}])',
-             f'选择?\s?([{options_concat}])',
-             f'故选?\s?([{options_concat}])'
-             f'只有选?项?\s?([{options_concat}])\s?是?对',
-             f'只有选?项?\s?([{options_concat}])\s?是?错',
-             f'只有选?项?\s?([{options_concat}])\s?不?正确',
-             f'只有选?项?\s?([{options_concat}])\s?错误',
-             f'说法不?对选?项?的?是\s?([{options_concat}])',
-             f'说法不?正确选?项?的?是\s?([{options_concat}])',
-             f'说法错误选?项?的?是\s?([{options_concat}])',
-             f'([{options_concat}])\s?是正确的',
-             f'([{options_concat}])\s?是正确答案',
-             f'选项\s?([{options_concat}])\s?正确',
-             f'所以答\s?([{options_concat}])',
-             f'所以\s?([{options_concat}][.。$]?$)',
-             f'所有\s?([{options_concat}][.。$]?$)',
-             f'[\s,::,]([{options_concat}])[。,,\.]?$',
-             f'[\s,,::][故即]([{options_concat}])[。\.]?$',
-             f'[\s,,::]因此([{options_concat}])[。\.]?$',
-             f'[是为。]\s?([{options_concat}])[。\.]?$',
-             f'因此\s?([{options_concat}])[。\.]?$',
-             f'显然\s?([{options_concat}])[。\.]?$',
-             f'答案是\s?(\S+)(?:。|$)',
-             f'答案应该是\s?(\S+)(?:。|$)',
-             f'答案为\s?(\S+)(?:。|$)',
-             f'答案是(.*?)[{options_concat}]',
-             f'答案为(.*?)[{options_concat}]',
-             f'固选(.*?)[{options_concat}]',
-             f'答案应该是(.*?)[{options_concat}]',
-             f'[Tt]he answer is \(?[{options_concat}]\)?',
-             f'[Tt]he correct answer is [{options_concat}]',
-             f'[Tt]he correct answer is:\n[{options_concat}]',
-             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-             f'^选项\s?([{options_concat}])',
-             f'^([{options_concat}])\s?选?项',
-             f'(\s|^)[{options_concat}][\s。,,::\.$]',
-             f'(\s|^)[{options_concat}](\s|$)',
-             f'[{options_concat}]',
+             rf'答案是?\s?([{options_concat}])',
+             rf'答案是?\s?:([{options_concat}])',
+             rf'答案是?\s?:([{options_concat}])',
+             rf'答案应该?是\s?([{options_concat}])',
+             rf'答案应该?选\s?([{options_concat}])',
+             rf'答案为\s?([{options_concat}])',
+             rf'答案选\s?([{options_concat}])',
+             rf'选择?\s?([{options_concat}])',
+             rf'故选?\s?([{options_concat}])'
+             rf'只有选?项?\s?([{options_concat}])\s?是?对',
+             rf'只有选?项?\s?([{options_concat}])\s?是?错',
+             rf'只有选?项?\s?([{options_concat}])\s?不?正确',
+             rf'只有选?项?\s?([{options_concat}])\s?错误',
+             rf'说法不?对选?项?的?是\s?([{options_concat}])',
+             rf'说法不?正确选?项?的?是\s?([{options_concat}])',
+             rf'说法错误选?项?的?是\s?([{options_concat}])',
+             rf'([{options_concat}])\s?是正确的',
+             rf'([{options_concat}])\s?是正确答案',
+             rf'选项\s?([{options_concat}])\s?正确',
+             rf'所以答\s?([{options_concat}])',
+             rf'所以\s?([{options_concat}][.。$]?$)',
+             rf'所有\s?([{options_concat}][.。$]?$)',
+             rf'[\s,::,]([{options_concat}])[。,,\.]?$',
+             rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
+             rf'[\s,,::]因此([{options_concat}])[。\.]?$',
+             rf'[是为。]\s?([{options_concat}])[。\.]?$',
+             rf'因此\s?([{options_concat}])[。\.]?$',
+             rf'显然\s?([{options_concat}])[。\.]?$',
+             rf'答案是\s?(\S+)(?:。|$)',
+             rf'答案应该是\s?(\S+)(?:。|$)',
+             rf'答案为\s?(\S+)(?:。|$)',
+             rf'答案是(.*?)[{options_concat}]',
+             rf'答案为(.*?)[{options_concat}]',
+             rf'固选(.*?)[{options_concat}]',
+             rf'答案应该是(.*?)[{options_concat}]',
+             rf'[Tt]he answer is \(?[{options_concat}]\)?',
+             rf'[Tt]he correct answer is [{options_concat}]',
+             rf'[Tt]he correct answer is:\n[{options_concat}]',
+             rf'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
+             rf'^选项\s?([{options_concat}])',
+             rf'^([{options_concat}])\s?选?项',
+             rf'(\s|^)[{options_concat}][\s。,,::\.$]',
+             rf'(\s|^)[{options_concat}](\s|$)',
+             rf'[{options_concat}]',
          ]

          regexes = [re.compile(pattern) for pattern in patterns]
@@ -166,8 +166,8 @@ class ResponseParser:
              text: The text to parse.
          """
          patterns = [
-             r'[Aa]nswer:\s*(\w+)',
              r'answer is \(?(\w+)\)?',
+             r'[Aa]nswer:\s*(\w+)',
              r'[Tt]he correct answer is:\s*(\w+)',
              r'[Tt]he correct answer is:\n\s*(\w+)',
              r'[Tt]he correct answer is:\n\n-\s*(\w+)',
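
Editor's note: the substantive change in the first hunk is the f'…' to rf'…' prefix swap. With a plain f-string, sequences such as \s rely on Python passing unrecognized escapes through unchanged and trigger invalid-escape-sequence warnings on newer interpreters; a raw f-string keeps the backslash literal for re while still interpolating {options_concat}. A minimal illustration of the fixed form:

    import re

    options_concat = '|'.join(['A', 'B', 'C', 'D'])

    # Raw f-string: {options_concat} is interpolated, but \s reaches re unchanged.
    pattern = rf'答案是?\s?([{options_concat}])'
    print(re.findall(pattern, '答案是 B'))  # ['B']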
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.11.0'
- __release_datetime__ = '2025-02-13 12:00:00'
+ __version__ = '0.12.0'
+ __release_datetime__ = '2025-02-27 21:00:00'