evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -33,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # yapf: disable
     # Model-related arguments
     parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+    parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
     parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
 
     # Template-related arguments
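The only CLI change in this chunk is the new `--model-id` flag, which names the model in the generated report independently of the `--model` path. A minimal usage sketch through evalscope's Python entrypoint follows; `run_task` and the `datasets`/`limit` keys follow evalscope's documented usage rather than this diff, and the `model_id` config key is an assumption inferred from the new argument.

# Hypothetical usage sketch -- not taken from the diff above.
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',  # ModelScope model id or local dir (--model)
    'model_id': 'qwen2.5-0.5b-baseline',    # assumed key: display name used in the report (--model-id)
    'datasets': ['gsm8k'],
    'limit': 10,
}
run_task(task_cfg=task_cfg)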
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -109,12 +109,10 @@ class ARCAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts)
 
-        context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -7,7 +7,7 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,7 +63,7 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
@@ -122,7 +122,7 @@ class BBHAdapter(DataAdapter):
         cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
         full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
evalscope/benchmarks/benchmark.py
CHANGED
@@ -22,7 +22,7 @@ class BenchmarkMeta:
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
-    prompt_template: str =
+    prompt_template: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.metrics import exact_match, weighted_mean
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
@@ -130,7 +130,7 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
@@ -145,9 +145,10 @@ class CEVALAdapter(DataAdapter):
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
             kwargs['few_shot_num'] = 5
-
         super().__init__(**kwargs)
 
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -206,7 +207,7 @@ class CEVALAdapter(DataAdapter):
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -236,84 +237,6 @@ class CEVALAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"C-Eval",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'ceval',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _format_example(cls, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
@@ -106,7 +106,7 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
@@ -116,9 +116,10 @@ class CMMLUAdapter(DataAdapter):
     choices = ['A', 'B', 'C', 'D']
 
     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)
 
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -173,7 +174,7 @@ class CMMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -203,81 +204,6 @@ class CMMLUAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: the user-defined report name. Default: None
-
-        Returns:
-        {
-            "name":"CMMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'cmmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
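In both the C-Eval and CMMLU adapters, the bespoke gen_report implementations are deleted and replaced by a single category_map attribute; the shared DataAdapter.gen_report (see the evalscope/benchmarks/data_adapter.py chunk below) forwards that map to the new ReportGenerator. The comprehension keeps only the last element of each SUBJECT_MAPPING value, i.e. the domain/category name that the removed code previously looked up by index. A tiny illustration; the sample SUBJECT_MAPPING entry is hypothetical (shaped after the indices the removed code used), not copied from the real mapping:

# Hypothetical SUBJECT_MAPPING entry; the real values live in the adapter modules.
SUBJECT_MAPPING = {
    'computer_network': ['Computer Network', '计算机网络', 'STEM'],
}

# The new attribute set in CEVALAdapter.__init__ / CMMLUAdapter.__init__:
category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
print(category_map)  # {'computer_network': 'STEM'}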
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -5,7 +5,7 @@ import json
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
@@ -20,11 +20,11 @@ logger = get_logger()
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='Put the final answer in \\boxed{}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -77,7 +77,7 @@ class CompetitionMathAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
         full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
 
-        return {'data': [full_prompt], 'system_prompt':
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -2,10 +2,11 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, List, Optional
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.
+from evalscope.metrics import Metric
+from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -14,12 +15,13 @@ logger = get_logger()
 class DataAdapter(ABC):
 
     def __init__(self,
+                 name: str,
                  subset_list: list,
-                 metric_list:
+                 metric_list: List[Metric],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
-                 prompt_template: str =
+                 prompt_template: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -28,6 +30,7 @@ class DataAdapter(ABC):
             - parse_pred_result
            - match
         Args:
+            name: str, the name of the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0
@@ -37,6 +40,7 @@ class DataAdapter(ABC):
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                 the form of A or B or C or D, do not output explanation:`
         """
+        self.name = name
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -44,6 +48,7 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.config_kwargs = kwargs
+        self.category_map = kwargs.get('category_map', {})
 
     def load(self,
              dataset_name_or_path: str,
@@ -142,59 +147,6 @@ class DataAdapter(ABC):
 
         return res_dict
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389,
-                            "num": 100
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """  # noqa: E501
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score),
-            'num': num
-        } for subset_name, (score, num) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'DEFAULT',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
 
         if k > len(data_list):
@@ -204,28 +156,75 @@ class DataAdapter(ABC):
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: list) ->
+    def compute_metric(self, review_res_list: list) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
         Args:
             review_res_list: list, the review result list, each item of which is match result for gold and pred.
 
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
         Returns:
-            Metric results.
+            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
         """
         if len(self.metric_list) == 0:
             raise ValueError('No metric list found for the benchmark.')
-
-
-
-
-
-
+
+        res_list = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            metric_func = metric.object
+            res_list.append({
+                'metric_name': metric_name,
+                'score': metric_func(review_res_list),
+                'num': len(review_res_list)
+            })
+        return res_list
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report.
+
+        Here is a format example for gsm8k:
+        {
+            "name": "qwen2.5_gsm8k",
+            "metrics": [
+                {
+                    "name": "AverageAccuracy",
+                    "categories": [
+                        {
+                            "name": "default",
+                            "subsets": [
+                                {
+                                    "name": "main",
+                                    "score": 0.0,
+                                    "num": 2
+                                }
+                            ],
+                            "num": 2,
+                            "score": 0.0,
+                            "macro_score": 0.0
+                        }
+                    ],
+                    "num": 2,
+                    "score": 0.0,
+                    "macro_score": 0.0
+                }
+            ],
+            "dataset_name": "gsm8k",
+            "model_name": "qwen2.5"
+        }
+        """  # noqa: E501
+        kwargs['category_map'] = self.category_map
+        kwargs['metric_list'] = self.metric_list
+        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
@@ -276,7 +275,7 @@
         raise NotImplementedError
 
     @abstractmethod
-    def match(self, gold: Any, pred: Any) ->
+    def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
 
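Across the adapters above, metric_list entries become named metric objects (AverageAccuracy, exported by the new evalscope/metrics/named_metrics.py), and the base compute_metric now iterates over them instead of a metric_func_map. A minimal sketch of how such an object is consumed; the Metric and mean definitions here are stand-ins inferred from the attributes the diff uses (metric.name, metric.object), not the actual implementations:

# Sketch of how the new metric objects plug into DataAdapter.compute_metric.
from dataclasses import dataclass
from typing import Callable, List


@dataclass
class Metric:
    # Stand-in: mirrors the two attributes accessed in compute_metric.
    name: str
    object: Callable[[list], float]


def mean(scores: list) -> float:
    # Stand-in aggregation function.
    return sum(scores) / len(scores) if scores else 0.0


AverageAccuracy = Metric(name='AverageAccuracy', object=mean)

# What compute_metric now returns for a list of per-sample match results:
review_res_list = [1.0, 0.0, 1.0, 1.0]
res: List[dict] = [{
    'metric_name': m.name,
    'score': m.object(review_res_list),
    'num': len(review_res_list),
} for m in [AverageAccuracy]]
print(res)  # [{'metric_name': 'AverageAccuracy', 'score': 0.75, 'num': 4}]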