evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged by the registry as a potentially problematic release.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

     # Cache and working directory arguments
     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
@@ -70,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+    parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+    parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
     # yapf: enable

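The three new arguments (--eval-batch-size, --timeout, --stream) are plain argparse additions. As a quick illustration of how they parse, here is a standalone sketch, not code from the package:

# Standalone sketch mirroring the three flags added to evalscope/arguments.py above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')

args = parser.parse_args(['--eval-batch-size', '8', '--timeout', '120', '--stream'])
print(args.eval_batch_size, args.timeout, args.stream)  # 8 120.0 True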
evalscope/benchmarks/aime/__init__.py
File without changes (new, empty file)
evalscope/benchmarks/aime/aime24_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime24',
+    dataset_id='HuggingFaceH4/aime_2024',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME24Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
evalscope/benchmarks/aime/aime25_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime25',
+    dataset_id='TIGER-Lab/AIME25',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME25Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['question']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
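Both AIME adapters route all answer handling through the new evalscope/metrics/math_parser.py module (extract_answer, strip_answer_string, math_equal). The snippet below is a simplified stand-in for that flow, not the package implementation: pull the last \boxed{...} value out of a completion, normalize it, and compare it numerically against the gold answer.

import re

def toy_extract_answer(text: str) -> str:
    # Take the content of the last \boxed{...} in the completion, if any (simplified stand-in).
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1] if matches else text.strip()

def toy_strip_answer_string(ans: str) -> str:
    # Light normalization: trim whitespace, dollar signs and a trailing period (simplified stand-in).
    return ans.strip().strip('$').rstrip('.')

def toy_math_equal(pred: str, gold: str) -> float:
    # Numeric-aware comparison so '033' and '33' count as equal (simplified stand-in).
    try:
        return float(float(pred) == float(gold))
    except ValueError:
        return float(pred == gold)

completion = 'Summing the three cases gives 25 + 8 + 0, so the final answer is \\boxed{033}.'
pred = toy_strip_answer_string(toy_extract_answer(completion))
print(toy_math_equal(pred, '33'))  # 1.0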
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -5,7 +5,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

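The ARC adapter (and the C-Eval/CMMLU adapters below) now always goes through ResponseParser.parse_first_option_with_choices for service and custom evaluation. The real implementation lives in evalscope/utils; the following is only a simplified stand-in to illustrate what "parse the first option" means here.

import re

def toy_parse_first_option(text: str, options: list) -> str:
    # Return the first option letter that appears as a standalone token in the response (simplified stand-in).
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else ''

print(toy_parse_first_option('The correct answer is B, because oxygen is required.', ['A', 'B', 'C', 'D']))  # B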
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -7,7 +7,7 @@ import re

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template="Q: {query}\nA: Let's think step by step.",
 )
 class BBHAdapter(DataAdapter):
     """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
             {'data': ['xxx']}
         """
         # few_shot_list: should be ['xxxx']
-
-
+        if len(few_shot_list) > 0:
+            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+        else:
+            cot_prompts = ''
+        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -168,18 +171,15 @@ class BBHAdapter(DataAdapter):
                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
                 res_dict[sub_name].append(prompt_d)

-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
-        gold = input_d.get('target')
+        gold = input_d.get('target', '')
+        # remove brackets
         if gold is None:
             logger.error(f'BBHAdapter: gold is None.')
+        gold = gold.replace('(', '').replace(')', '')
         return gold

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +228,11 @@ class BBHAdapter(DataAdapter):
         """
         Extract the answer from the model output for Free-form task.
         """
-
-
+        pattern = r'answer is\s+(.*?)\.'
+
+        match = re.search(pattern, ans)
+        if match:
+            res = match.group(1)
             return res

         ans_line = ans.split('answer is ')
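For BBH free-form tasks, answer extraction now tries the non-greedy regex shown in the hunk above before falling back to splitting on 'answer is '. A standalone check of that regex on an invented model output:

import re

pattern = r'answer is\s+(.*?)\.'
output = 'We count six objects in total. So the answer is 6. Let me know if anything is unclear.'
match = re.search(pattern, output)
print(match.group(1) if match else None)  # 6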
evalscope/benchmarks/benchmark.py
CHANGED
@@ -17,12 +17,15 @@ class BenchmarkMeta:
     data_adapter: 'DataAdapter'
     model_adapter: BaseModelAdapter
     subset_list: List[str] = field(default_factory=list)
-    metric_list: List[
+    metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
+    system_prompt: Optional[str] = None
+    query_template: Optional[str] = None
+    pretty_name: Optional[str] = None

     def _update(self, args: dict):
         if args.get('local_path'):
@@ -40,7 +43,6 @@ class BenchmarkMeta:
         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
         del cur_dict['model_adapter']
-        del cur_dict['metric_list']
         return cur_dict

     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -59,7 +61,7 @@ class Benchmark:
     @classmethod
     def get(cls, name: str) -> 'BenchmarkMeta':
         if name not in BENCHMARK_MAPPINGS:
-            raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+            raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
         benchmark = BENCHMARK_MAPPINGS[name]
         return benchmark

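BenchmarkMeta picks up three new optional fields (system_prompt, query_template, pretty_name), and metric_list is now a list of metric names rather than metric objects, which is why the adapter diffs above switch from [AverageAccuracy] to ['AverageAccuracy']. A hypothetical registration using the new fields might look like the sketch below; the benchmark name, dataset id, and adapter class are invented, and it assumes Benchmark.register forwards these keyword arguments to BenchmarkMeta, as the adapters in this release do.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                      # invented benchmark name
    pretty_name='My-QA',               # new BenchmarkMeta field
    dataset_id='my-org/my_qa',         # invented dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=['AverageAccuracy'],   # metric names, no longer metric objects
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='{query}',
    system_prompt='You are a helpful assistant.',  # new BenchmarkMeta field
)
class MyQAAdapter(DataAdapter):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)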
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -4,10 +4,9 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.metrics.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
+    prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
 )
 class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
         else:
             context = ''

-
+        query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-        full_prompt =
+        full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -5,9 +5,9 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
+    prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
 )
 class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}

         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context

-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt':
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

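Both the C-Eval and CMMLU adapters now build the final prompt from the registered prompt_template with {subset_name} and {query} placeholders instead of hard-coded prefixes. A standalone formatting check with the CMMLU template (the subject and question text here are invented placeholders):

# Format the CMMLU prompt template from the diff above with invented inputs.
template = '以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}'
query = '下列哪一项是中国的首都?\nA. 上海\nB. 北京\nC. 广州\nD. 深圳\n答案:'
print(template.format(subset_name='示例学科', query=query))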
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -3,10 +3,11 @@
 import glob
 import json
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
     name='competition_math',
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['
-    metric_list=[
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
     few_shot_num=4,
-    train_split=
+    train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -39,8 +40,14 @@ class CompetitionMathAdapter(DataAdapter):

         super().__init__(**kwargs)

+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level')
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -53,10 +60,7 @@ class CompetitionMathAdapter(DataAdapter):
                     if os.path.exists(file_path):
                         with open(file_path, 'r') as f:
                             split_data.append(json.load(f))
-
-                    data_dict[subset_name].update({split_name: split_data})
-                else:
-                    data_dict[subset_name] = {split_name: split_data}
+                data_dict[subset_name][split_name] = split_data

         return data_dict

@@ -75,13 +79,13 @@ class CompetitionMathAdapter(DataAdapter):
             {'data': [prompt]}
         """
         use_fewshot = self.few_shot_num > 0
-
-
-        return {'data': [full_prompt], 'system_prompt': self.
+        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+        full_prompt = self.prompt_template.format(query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
-        return
+        return strip_answer_string(extract_answer(input_d['solution']))

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
@@ -96,18 +100,11 @@ class CompetitionMathAdapter(DataAdapter):
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
         # Note: Use same extraction method for both of checkpoint/service/custom
-
-            result = remove_boxed(last_boxed_only_string(result))
-        except Exception:
-            return None
+        result = strip_answer_string(extract_answer(result))
         return result

     def match(self, gold: str, pred: str) -> float:
-
-        if is_equiv(pred, gold):
-            res = 1
-
-        return res
+        return math_equal(pred, gold)

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: