evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of evalscope has been flagged by the registry as a potentially problematic release.
- evalscope/arguments.py +2 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
- evalscope/benchmarks/benchmark.py +3 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
- evalscope/benchmarks/data_adapter.py +71 -18
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
- evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
- evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +6 -0
- evalscope/config.py +3 -1
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +1 -1
- evalscope/models/local_model.py +3 -2
- evalscope/models/server_adapter.py +79 -28
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +12 -8
- evalscope/run.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
- tests/cli/test_run.py +27 -15
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -71,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+    parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+    parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
     # yapf: enable


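For context, the two new flags plug into the existing argparse-based CLI. A minimal, self-contained sketch (the parser construction here is illustrative; only the two `add_argument` calls are taken from the diff):

```python
import argparse

# Illustrative parser; in evalscope these calls live inside add_argument(parser).
parser = argparse.ArgumentParser()
parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')

# e.g. `--timeout 60 --stream` enables streaming with a 60-second request timeout.
args = parser.parse_args(['--timeout', '60', '--stream'])
assert args.timeout == 60.0 and args.stream is True
```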
evalscope/benchmarks/aime/aime25_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime25',
+    dataset_id='TIGER-Lab/AIME25',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME25Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['question']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -171,11 +171,6 @@ class BBHAdapter(DataAdapter):
                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
                 res_dict[sub_name].append(prompt_d)
 
-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict
 
     def get_gold_answer(self, input_d: dict) -> str:
evalscope/benchmarks/benchmark.py
CHANGED
@@ -24,6 +24,8 @@ class BenchmarkMeta:
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
     system_prompt: Optional[str] = None
+    query_template: Optional[str] = None
+    pretty_name: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
@@ -59,7 +61,7 @@ class Benchmark:
     @classmethod
     def get(cls, name: str) -> 'BenchmarkMeta':
         if name not in BENCHMARK_MAPPINGS:
-            raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+            raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
         benchmark = BENCHMARK_MAPPINGS[name]
         return benchmark
 
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -23,7 +23,7 @@ logger = get_logger()
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=4,
-    train_split=
+    train_split=None,
     eval_split='test',
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
@@ -43,7 +43,8 @@ class CompetitionMathAdapter(DataAdapter):
     def load(self, **kwargs):
         # default load all levels
         kwargs['subset_list'] = ['default']
-
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level')
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = defaultdict(dict)
@@ -63,21 +64,6 @@ class CompetitionMathAdapter(DataAdapter):
 
         return data_dict
 
-    def gen_prompts(self, data_dict: dict) -> dict:
-        res_dict: dict = defaultdict(list)
-
-        # use level as subset
-        for sub_name, sub_data_dict in data_dict.items():
-            for sample_d in sub_data_dict[self.eval_split]:
-                level = sample_d['level']
-                if level not in self.subset_list:
-                    continue
-                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
-                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
-                res_dict[level].append(prompt_d)
-
-        return res_dict
-
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -2,6 +2,7 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Any, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
@@ -24,6 +25,8 @@ class DataAdapter(ABC):
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
                  system_prompt: Optional[str] = None,
+                 query_template: Optional[str] = None,
+                 pretty_name: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -52,6 +55,8 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.system_prompt = system_prompt
+        self.query_template = query_template
+        self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
 
@@ -59,7 +64,6 @@
              dataset_name_or_path: str = None,
              subset_list: list = None,
              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
              **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -74,22 +78,40 @@
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-
+            data_dict = self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+        if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
+            raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        return data_dict
+
+    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
+        from modelscope.msdatasets import MsDataset
+
+        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
+        split_as_subset: bool = kwargs.pop('split_as_subset', False)
+        # Load dataset from remote
+        logger.info(
+            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
-
-
-
-
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        if len(split_list) == 0:
-            logger.error(f'Got empty split list: {split_list}')
+        data_dict = {}
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        if len(split_list) == 0:
+            logger.error(f'Got empty split list: {split_list}')
 
+        if split_as_subset:
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = MsDataset.load(
+                        dataset_name=dataset_name_or_path,
+                        split=sub_name,  # load subset from split
+                        cache_dir=work_dir,
+                        hub=datasets_hub,
+                        **kwargs)
+                    data_dict[sub_name].update({split: dataset})
+        else:
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
@@ -101,17 +123,48 @@
                         cache_dir=work_dir,
                         hub=datasets_hub,
                         **kwargs)
-
                     data_dict[sub_name].update({split: dataset})
 
         return data_dict
 
-    def load_from_disk(self,
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
         Load the dataset from local disk.
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
+        """
+        from modelscope.msdatasets import MsDataset
+
+        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+            subsets: {subset_list}')
+        data_dict = {}
+        subset_list = subset_list or self.subset_list
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        for sub_name in subset_list:
+            data_dict[sub_name] = {}
+            # e.g. train: few-shot, test: target dataset to evaluate
+            for split in split_list:
+                dataset = MsDataset.load(
+                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
+                data_dict[sub_name].update({split: dataset})
+        return data_dict
+
+    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
+        """
+        Reformat the dataset subset with subset_key and format.
         """
-
+        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
+
+        for sub_name, sub_data_dict in data_dict.items():
+            for split in [self.train_split, self.eval_split]:
+                if split is None:
+                    continue
+                for sample_d in sub_data_dict[split]:
+                    new_subset_name = format.format(sample_d[subset_key])
+                    if new_subset_name not in self.subset_list:
+                        continue
+                    res_dict[new_subset_name][split].append(sample_d)
+        return res_dict
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -138,7 +191,7 @@
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
+            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                 few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                           self.few_shot_num,
@@ -161,7 +214,7 @@
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
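The new `reformat_subset` helper is what lets adapters such as `competition_math`, `math_500`, and `mmlu_pro` load a single `default` subset and then regroup samples by a per-sample field. A standalone sketch of the same regrouping idea on plain dicts (the sample data is invented for illustration):

```python
from collections import defaultdict

# Toy stand-in for a loaded dataset: one 'default' subset with an eval split.
data_dict = {'default': {'test': [
    {'problem': '1+1', 'level': 'Level 1'},
    {'problem': 'x^2=4', 'level': 'Level 2'},
]}}
subset_list = ['Level 1', 'Level 2']

# Same idea as DataAdapter.reformat_subset(data_dict, subset_key='level'):
# group eval samples into new subsets keyed by each sample's 'level' field.
res = defaultdict(lambda: defaultdict(list))
for sub_name, splits in data_dict.items():
    for split, samples in splits.items():
        for sample in samples:
            new_subset = '{}'.format(sample['level'])
            if new_subset in subset_list:
                res[new_subset][split].append(sample)

print({k: len(v['test']) for k, v in res.items()})  # {'Level 1': 1, 'Level 2': 1}
```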
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
CHANGED
@@ -24,7 +24,7 @@ logger = get_logger()
     train_split='dev',
     eval_split='val',
     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
-)
+    query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
@@ -115,15 +115,11 @@
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-
-
-        example = '问题:' + input_d['question']
-        for choice in cls.choices:
-            if choice in input_d:
-                example += f'\n{choice}. {input_d[f"{choice}"]}'
+    def _format_example(self, input_d: dict, include_answer=True):
+        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
 
         if include_answer:
-
+            return self.query_template.format(
+                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
         else:
-
-            return example
+            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
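The new `query_template` drives how `_format_example` renders a sample, with and without the gold answer. A quick sketch of that formatting, reusing the template from the registration above (the sample dict is invented for illustration):

```python
# Template taken from the @Benchmark.register call above; the sample row is made up.
query_template = '问题:{question}\n{choices}\n答案: {answer}\n\n'

input_d = {'question': '1 + 1 = ?', 'A': '1', 'B': '2', 'answer': 'B'}
choices = ['A', 'B', 'C', 'D']
choices_str = '\n'.join(f'{c}. {input_d[c]}' for c in choices if c in input_d)

# include_answer=True renders a few-shot example; False renders the question to answer.
with_answer = query_template.format(question=input_d['question'], choices=choices_str, answer=input_d['answer'])
without_answer = query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
print(with_answer)
print(without_answer)
```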
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -22,6 +22,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
+    prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num
@@ -62,10 +63,8 @@
             logger.warning('The history is not included in the prompt for GeneralQA. \
                 To be supported in the future.')
 
-
-
-        # if len(history) > 0:
-        #     prompt = '\n'.join(history) + '\n' + prompt
+        query = input_d.get('question', '') or input_d.get('query', '')
+        prompt = self.prompt_template.format(query=query)
         return {'data': [prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
@@ -107,7 +106,7 @@
         res.update(bleu_dict)
         return res
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
evalscope/benchmarks/gpqa/gpqa_adapter.py
CHANGED
@@ -15,7 +15,7 @@ from evalscope.models import ChatGenerationModelAdapter
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
-    train_split=
+    train_split=None,
     eval_split='train',  # only have train split
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
evalscope/benchmarks/ifeval/ifeval_adapter.py
CHANGED
@@ -47,7 +47,7 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
evalscope/benchmarks/math_500/math_500_adapter.py
CHANGED
@@ -1,4 +1,7 @@
+from collections import defaultdict
+
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
@@ -12,7 +15,7 @@ logger = get_logger()
     name='math_500',
     dataset_id='AI-ModelScope/MATH-500',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
@@ -24,6 +27,12 @@ class Math500Adapter(DataAdapter):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -15,7 +15,7 @@ SUBSET_LIST = [
 
 @Benchmark.register(
     name='mmlu_pro',
-    dataset_id='modelscope/
+    dataset_id='modelscope/MMLU-Pro',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -35,41 +35,25 @@ class MMLUProAdapter(DataAdapter):
     def load(self, **kwargs):
         # default load all data
         kwargs['subset_list'] = ['default']
-
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category')
 
-    def
-
-
-
-
-
-
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            subset_name = entry['category']
-            if subset_name not in self.subset_list:
-                continue
-            prefix = fewshot_prompts[subset_name]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-            prompt_d = {'data': [full_prompt], 'system_prompt': self.system_prompt, AnswerKeys.RAW_INPUT: entry}
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['options']) + '\n'
 
-
-        return
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
-    def
+    def format_fewshot_examples(self, few_shot_list):
         # load few-shot prompts for each category
-        prompts =
-        for index, d in enumerate(
-
-            break
-        prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
             self.__form_options(d['options']) + '\n' + \
             d['cot_content'] + '\n\n'
         return prompts
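With this rewrite, few-shot examples reach `gen_prompt` through `few_shot_list` and are flattened into a "Q: ... A: ..." prefix by `format_fewshot_examples`. A minimal sketch of that prefix construction (the example record is invented, and `form_options` below is only a rough stand-in for the adapter's private `__form_options` helper):

```python
# Invented few-shot record mirroring the MMLU-Pro fields used in the diff.
few_shot_list = [
    {'question': 'What is 2 + 2?',
     'options': ['3', '4', '5'],
     'cot_content': "A: Let's think step by step. 2 + 2 = 4. The answer is (B)."},
]

def form_options(options):
    # Rough stand-in for MMLUProAdapter.__form_options: label options as (A), (B), ...
    return '\n'.join(f'({chr(ord("A") + i)}) {opt}' for i, opt in enumerate(options))

prompts = ''
for d in few_shot_list:
    prompts += 'Q: ' + d['question'] + '\n' + form_options(d['options']) + '\n' + d['cot_content'] + '\n\n'

print(prompts)
```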
evalscope/benchmarks/musr/__init__.py
File without changes
evalscope/benchmarks/musr/musr_adapter.py
ADDED
@@ -0,0 +1,68 @@
+import ast
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='musr',
+    pretty_name='MuSR',
+    dataset_id='AI-ModelScope/MuSR',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.',  # noqa: E501
+)
+class MuSRAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        choices = self.format_choice(ast.literal_eval(input_d['choices']))
+
+        full_prompt = self.prompt_template.format(
+            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_choice(self, options: list):
+        option_str = ''
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return self.choices[input_d['answer_index']]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
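One detail worth noting: in the MuSR dataset the `choices` field arrives as a string-encoded Python list, which is why `gen_prompt` runs it through `ast.literal_eval` before lettering the options. A small sketch of that step (the sample row is invented):

```python
import ast

# Invented row; in the dataset 'choices' is a string-encoded list.
row = {'choices': "['The gardener', 'The butler']", 'answer_index': 1}
letters = ['A', 'B', 'C', 'D', 'E', 'F']

options = ast.literal_eval(row['choices'])  # -> ['The gardener', 'The butler']
choices_block = ''.join(f'({letter}): {opt}\n' for opt, letter in zip(options, letters))
gold = letters[row['answer_index']]         # -> 'B'

print(choices_block, gold)
```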
evalscope/benchmarks/process_bench/__init__.py
File without changes
evalscope/benchmarks/process_bench/critique_template.txt
ADDED
@@ -0,0 +1,13 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
evalscope/benchmarks/process_bench/process_bench_adapter.py
ADDED
@@ -0,0 +1,96 @@
+import os
+import re
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
+from evalscope.models import ChatGenerationModelAdapter
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
+@Benchmark.register(
+    name='process_bench',
+    pretty_name='ProcessBench',
+    dataset_id='Qwen/ProcessBench',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ProcessBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+
+        # register metrics
+        metric_registry.register(Metric(name='error_acc', object=mean))
+        metric_registry.register(Metric(name='correct_acc', object=mean))
+        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        problem = input_d['problem']
+        steps = input_d['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return int(input_d['label'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        pred = ProcessBenchAdapter.extract_answer(result)
+        try:
+            pred = int(pred)
+        except Exception:
+            pred = None
+        return pred
+
+    def match(self, gold: int, pred: int) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return gold == pred
+
+    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
+        reviews_list = kwargs['reviews_list']
+        error_data = []
+        correct_data = []
+        for res, raw in zip(review_res_list, reviews_list):
+            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
+                correct_data.append(res)
+            else:
+                error_data.append(res)
+        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        return super().compute_metric(data)
+
+    @staticmethod
+    def extract_answer(solution_text: str):
+        boxed_pattern = r'\\boxed\{([^}]*)\}'
+        matches = re.findall(boxed_pattern, solution_text)
+        if matches:
+            return matches[-1].strip()
+        return None