evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +30 -15
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

     # Cache and working directory arguments
     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
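The only functional change here is the new --eval-batch-size flag. A minimal standalone sketch of how the parsed value behaves (plain argparse only; this is not evalscope's full CLI):

import argparse

# Sketch: just the flag added in this hunk, wired into a bare parser.
parser = argparse.ArgumentParser()
parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

args = parser.parse_args(['--eval-batch-size', '8'])
print(args.eval_batch_size)  # 8 -- argparse exposes the dashed flag as eval_batch_size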
evalscope/benchmarks/aime24/__init__.py
File without changes
evalscope/benchmarks/aime24/aime24_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime24',
+    dataset_id='HuggingFaceH4/aime_2024',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME24Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
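Since gen_prompt fills prompt_template with str.format, the literal \boxed{} in the instruction is written with doubled braces. A quick standalone check of the template string (no evalscope imports; the query text is a made-up example):

# Same template string as registered above.
prompt_template = '{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.'
print(prompt_template.format(query='Find the remainder when 7**2024 is divided by 100.'))
# -> the question followed by the fixed instruction, ending with a literal \boxed{}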
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -5,7 +5,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -7,7 +7,7 @@ import re

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template="Q: {query}\nA: Let's think step by step.",
 )
 class BBHAdapter(DataAdapter):
     """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
             {'data': ['xxx']}
         """
         # few_shot_list: should be ['xxxx']
-
-
+        if len(few_shot_list) > 0:
+            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+        else:
+            cot_prompts = ''
+        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -177,9 +180,11 @@ class BBHAdapter(DataAdapter):

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
-        gold = input_d.get('target')
+        gold = input_d.get('target', '')
+        # remove brackets
         if gold is None:
             logger.error(f'BBHAdapter: gold is None.')
+        gold = gold.replace('(', '').replace(')', '')
         return gold

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +233,11 @@ class BBHAdapter(DataAdapter):
         """
         Extract the answer from the model output for Free-form task.
         """
-
-
+        pattern = r'answer is\s+(.*?)\.'
+
+        match = re.search(pattern, ans)
+        if match:
+            res = match.group(1)
             return res

         ans_line = ans.split('answer is ')
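The reworked free-form extraction first tries the regex above and only then falls back to splitting on 'answer is '. A standalone check of that pattern with plain re (the sample strings are invented):

import re

# Same pattern as in the hunk: lazily capture whatever sits between 'answer is' and the next period.
pattern = r'answer is\s+(.*?)\.'

print(re.search(pattern, 'So the answer is (A). Thanks.').group(1))  # (A)
print(re.search(pattern, 'Thus the answer is 42. Done.').group(1))   # 42
print(re.search(pattern, 'no closing phrase here'))                  # None -> falls through to the split-based path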
evalscope/benchmarks/benchmark.py
CHANGED
@@ -17,12 +17,13 @@ class BenchmarkMeta:
     data_adapter: 'DataAdapter'
     model_adapter: BaseModelAdapter
     subset_list: List[str] = field(default_factory=list)
-    metric_list: List[
+    metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
+    system_prompt: Optional[str] = None

     def _update(self, args: dict):
         if args.get('local_path'):
@@ -40,7 +41,6 @@ class BenchmarkMeta:
         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
         del cur_dict['model_adapter']
-        del cur_dict['metric_list']
         return cur_dict

     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
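metric_list is now a list of metric-name strings with a field(default_factory=list) default, the standard dataclass idiom for mutable defaults, and system_prompt joins prompt_template as an optional field. An illustrative mini-dataclass with the same field shapes (not the real BenchmarkMeta):

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class MiniMeta:  # illustration only; mirrors the changed fields, nothing else
    metric_list: List[str] = field(default_factory=list)
    prompt_template: Optional[str] = None
    system_prompt: Optional[str] = None

a, b = MiniMeta(), MiniMeta()
a.metric_list.append('AverageAccuracy')
print(b.metric_list)  # [] -- default_factory gives each instance its own list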
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -4,10 +4,9 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.metrics.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
+    prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
 )
 class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
         else:
             context = ''

-
+        query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-        full_prompt =
+        full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

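The C-Eval prompt is now built by formatting the registered template with a subject name and the assembled multiple-choice block. A standalone rendering with invented subject and question text:

# Template copied from the registration above; subset_name and query are dummy values.
prompt_template = '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}'
query = '1+1等于几?\nA. 1\nB. 2\nC. 3\nD. 4\n答案:'
print(prompt_template.format(subset_name='计算机网络', query=query))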
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -5,9 +5,9 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
+    prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
 )
 class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}

         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context

-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt':
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -3,10 +3,11 @@
 import glob
 import json
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
     name='competition_math',
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['
-    metric_list=[
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
-    prompt_template='
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -39,8 +40,13 @@ class CompetitionMathAdapter(DataAdapter):

         super().__init__(**kwargs)

+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        return super().load(**kwargs)
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -53,13 +59,25 @@ class CompetitionMathAdapter(DataAdapter):
                     if os.path.exists(file_path):
                         with open(file_path, 'r') as f:
                             split_data.append(json.load(f))
-
-                    data_dict[subset_name].update({split_name: split_data})
-                else:
-                    data_dict[subset_name] = {split_name: split_data}
+                data_dict[subset_name][split_name] = split_data

         return data_dict

+    def gen_prompts(self, data_dict: dict) -> dict:
+        res_dict: dict = defaultdict(list)
+
+        # use level as subset
+        for sub_name, sub_data_dict in data_dict.items():
+            for sample_d in sub_data_dict[self.eval_split]:
+                level = sample_d['level']
+                if level not in self.subset_list:
+                    continue
+                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
+                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+                res_dict[level].append(prompt_d)
+
+        return res_dict
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
@@ -75,13 +93,13 @@ class CompetitionMathAdapter(DataAdapter):
             {'data': [prompt]}
         """
         use_fewshot = self.few_shot_num > 0
-
-
-        return {'data': [full_prompt], 'system_prompt': self.
+        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+        full_prompt = self.prompt_template.format(query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
-        return
+        return strip_answer_string(extract_answer(input_d['solution']))

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
@@ -96,18 +114,11 @@ class CompetitionMathAdapter(DataAdapter):
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
         # Note: Use same extraction method for both of checkpoint/service/custom
-
-        result = remove_boxed(last_boxed_only_string(result))
-        except Exception:
-            return None
+        result = strip_answer_string(extract_answer(result))
         return result

     def match(self, gold: str, pred: str) -> float:
-
-        if is_equiv(pred, gold):
-            res = 1
-
-        return res
+        return math_equal(pred, gold)

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
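The new gen_prompts buckets MATH samples by their 'level' field so that each difficulty level becomes its own subset, using a defaultdict(list). A standalone sketch of that grouping step with dummy samples:

from collections import defaultdict

samples = [  # dummy records with the same 'level' field the adapter reads
    {'problem': 'p1', 'level': 'Level 1'},
    {'problem': 'p2', 'level': 'Level 5'},
    {'problem': 'p3', 'level': 'Level 1'},
]
subset_list = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']

res_dict = defaultdict(list)
for sample in samples:
    if sample['level'] in subset_list:
        res_dict[sample['level']].append(sample)

print({level: len(items) for level, items in res_dict.items()})  # {'Level 1': 2, 'Level 5': 1}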
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -2,10 +2,10 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union

 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics import
+from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

@@ -16,12 +16,14 @@ class DataAdapter(ABC):

     def __init__(self,
                  name: str,
+                 dataset_id: str,
                  subset_list: list,
-                 metric_list: List[
+                 metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
+                 system_prompt: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -31,6 +33,7 @@ class DataAdapter(ABC):
         - match
         Args:
             name: str, the name of the benchmark.
+            dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0
@@ -41,17 +44,19 @@
             the form of A or B or C or D, do not output explanation:`
         """
         self.name = name
+        self.dataset_id = dataset_id
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
         self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})

     def load(self,
-             dataset_name_or_path: str,
+             dataset_name_or_path: str = None,
              subset_list: list = None,
              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
              datasets_hub: str = HubType.MODELSCOPE,
@@ -64,7 +69,7 @@
             train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.

         """
-        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
         subset_list = subset_list or self.subset_list

         # Try to load dataset from local disk
@@ -156,7 +161,7 @@
         else:
             return data_list[:k]

-    def compute_metric(self, review_res_list: list) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
         """
         Compute evaluation result by specific metrics.

@@ -170,14 +175,15 @@
             raise ValueError('No metric list found for the benchmark.')

         res_list = []
-        for
+        for metric_str in self.metric_list:
+            metric = metric_registry.get(metric_str)
             metric_name = metric.name
             metric_func = metric.object
-
-
-
-
-            })
+            if isinstance(review_res_list, dict):
+                review_res = review_res_list.get(metric_name, [])
+            else:
+                review_res = review_res_list
+            res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list

     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
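compute_metric now resolves metrics by name through metric_registry and accepts either a flat list of review results or a dict keyed by metric name. A rough standalone mock of that dispatch shape (the registry below is invented for illustration and is not evalscope's named_metrics implementation):

from dataclasses import dataclass
from typing import Callable, Dict, List, Union

@dataclass
class Metric:  # stand-in: only the .name/.object shape from the diff is mirrored
    name: str
    object: Callable[[list], float]

registry: Dict[str, Metric] = {'AverageAccuracy': Metric('AverageAccuracy', lambda xs: sum(xs) / len(xs))}

def compute_metric(metric_list: List[str], review_res_list: Union[dict, list]) -> List[dict]:
    res_list = []
    for metric_str in metric_list:
        metric = registry[metric_str]
        review_res = review_res_list.get(metric.name, []) if isinstance(review_res_list, dict) else review_res_list
        res_list.append({'metric_name': metric.name, 'score': metric.object(review_res), 'num': len(review_res)})
    return res_list

print(compute_metric(['AverageAccuracy'], [1, 0, 1, 1]))  # score 0.75 over 4 reviews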
evalscope/benchmarks/data_collection/__init__.py
File without changes
evalscope/benchmarks/data_collection/data_collection_adapter.py
ADDED
@@ -0,0 +1,71 @@
+import math
+import os
+import re
+from typing import Any, Optional
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='data_collection',
+    dataset_id='',  # dataset_id need to be set
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
+class DataCollectionAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        """
+        Data adapter for collection dataset.
+        """
+        super().__init__(**kwargs)
+
+    def load(self,
+             dataset_name_or_path: str = None,
+             subset_list: list = None,
+             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+             datasets_hub: str = HubType.MODELSCOPE,
+             **kwargs) -> dict:
+        """
+        Load the dataset. Remote and local datasets are supported.
+        """
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
+        subset_list = subset_list or self.subset_list
+
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset = jsonl_to_list(dataset_name_or_path)
+            if len(dataset) == 0:
+                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        else:
+            from modelscope.msdatasets import MsDataset
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
+
+            dataset = MsDataset.load(dataset_name=dataset_name_or_path, cache_dir=work_dir, hub=datasets_hub, **kwargs)
+
+            dataset = dataset[self.eval_split].to_list()
+
+        return dataset
+
+    def get_gold_answer(self, input_d: Any) -> Any:
+        return super().get_gold_answer(input_d)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        return super().match(gold, pred)
+
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        return super().parse_pred_result(result, raw_input_d, eval_type)
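For the local-path branch, jsonl_to_list presumably parses one JSON object per line; a minimal reader with that assumed behavior (an assumption about the helper, not its actual code), plus the kind of file it would consume:

import json
import tempfile

def read_jsonl(path: str) -> list:
    # Assumed behavior of evalscope.utils.io_utils.jsonl_to_list: one JSON object per non-empty line.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

with tempfile.NamedTemporaryFile('w', suffix='.jsonl', delete=False) as tmp:
    tmp.write('{"prompt": "1+1=?", "answer": "2"}\n{"prompt": "2+2=?", "answer": "4"}\n')
    path = tmp.name

print(read_jsonl(path))  # [{'prompt': '1+1=?', 'answer': '2'}, {'prompt': '2+2=?', 'answer': '4'}]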
evalscope/benchmarks/general_mcq/__init__.py
File without changes