evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ifeval/utils.py (new file)

@@ -0,0 +1,134 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from evalscope.benchmarks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and 'prompt' in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split('\n')
+    response_remove_first = '\n'.join(r[1:]).strip()
+    response_remove_last = '\n'.join(r[:-1]).strip()
+    response_remove_both = '\n'.join(r[1:-1]).strip()
+    revised_response = response.replace('*', '')
+    revised_response_remove_first = response_remove_first.replace('*', '')
+    revised_response_remove_last = response_remove_last.replace('*', '')
+    revised_response_remove_both = response_remove_both.replace('*', '')
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and 'prompt' in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    inp = InputExample(
+        key=doc['key'],
+        instruction_id_list=doc['instruction_id_list'],
+        prompt=doc['prompt'],
+        kwargs=doc['kwargs'],
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        'prompt_level_strict_acc': out_strict.follow_all_instructions,
+        'inst_level_strict_acc': out_strict.follow_instruction_list,
+        'prompt_level_loose_acc': out_loose.follow_all_instructions,
+        'inst_level_loose_acc': out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
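To show how these helpers fit together, here is a minimal usage sketch. The sample record is invented for illustration, and the instruction id `punctuation:no_comma` is assumed to be present in `instructions_registry.INSTRUCTION_DICT`, as in the upstream IFEval registry this module mirrors.

```python
from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results

# Invented IFEval-style record: one prompt constrained by a single instruction.
doc = {
    'key': 1000,
    'prompt': 'Write a short note about tea. Do not use any commas.',
    'instruction_id_list': ['punctuation:no_comma'],
    'kwargs': [{}],
}
model_response = 'Tea is a soothing drink enjoyed around the world.'

scores = process_results(doc, [model_response])
# Prompt-level accuracy is True only if every instruction is followed.
print(scores['prompt_level_strict_acc'], scores['prompt_level_loose_acc'])

# Instruction-level accuracy is aggregated over the boolean lists of many samples.
print(agg_inst_level_acc([scores['inst_level_strict_acc']]))
```

The loose variant only upgrades a failure to a pass when one of the stripped response variants (first or last line removed, asterisks dropped) satisfies the checker, so it gives an upper bound on the strict score.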
evalscope/benchmarks/iquiz/__init__.py: File without changes
evalscope/benchmarks/iquiz/iquiz_adapter.py (new file)

@@ -0,0 +1,63 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='iquiz',
+    dataset_id='AI-ModelScope/IQuiz',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['IQ', 'EQ'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
+)
+class IQuizAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        example:
+        {
+            "question":"天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
+            "choices":["星期一","星期二","星期三","星期四"],
+            "answer":"D",
+            "level":1
+        }
+        """
+        prompt = f"问题: {input_d['question']}\n"
+        prompt += self.__form_options(input_d['choices'])
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+    def __form_options(self, options: list):
+        option_str = '选项:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
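Once an adapter is registered under a name such as `iquiz`, the benchmark can be selected by that name at run time. The sketch below assumes the `TaskConfig`/`run_task` entry points and uses a hypothetical model id; it is illustrative rather than a recipe taken verbatim from this diff.

```python
from evalscope.config import TaskConfig
from evalscope.run import run_task

# Assumed configuration fields; 'iquiz' must match the name= passed to
# @Benchmark.register above.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # hypothetical model id
    datasets=['iquiz'],
    limit=5,  # evaluate only a few samples as a smoke test
)
run_task(task_cfg=task_cfg)
```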
evalscope/benchmarks/mmlu/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
-from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter
-from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -2,8 +2,10 @@
 import csv
 import os
 
-from evalscope.benchmarks
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
@@ -134,40 +136,31 @@ SUBJECT_MAPPING = {
 }
 
 
+@Benchmark.register(
+    name='mmlu',
+    dataset_id='modelscope/mmlu',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class MMLUAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 5-shot by default
-            logger.info(f'Set 5-shot examples by system for MMLU.')
-            few_shot_num = 5
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 5)
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
-            few_shot_num = 5
+            kwargs['few_shot_num'] = 5
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
+
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -175,11 +168,11 @@ class MMLUAdapter(DataAdapter):
             data_dict[subset_name] = {}
 
             for split_name in [self.train_split, self.eval_split]:
-                if
+                if split_name == 'train':
                     split_name_suffix = 'dev'
-                elif
+                elif split_name == 'test':
                     split_name_suffix = 'test'
-                elif
+                elif split_name == 'validation':
                     split_name_suffix = 'val'
                 else:
                     raise ValueError(f'Invalid split name: {split_name}')
@@ -225,7 +218,7 @@
                      'target': 'A'}
 
         Returns:
-            {'data': [
+            {'data': [full_prompt], 'multi_choices': self.choices}
 
         """
         prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
@@ -238,13 +231,13 @@ class MMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('target', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -256,109 +249,18 @@ class MMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type ==
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"MMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'mmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
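The new `category_map` attribute collapses each MMLU subject to its domain, taken as the last element of its `SUBJECT_MAPPING` entry. A small sketch with illustrative entries (the real mapping is defined earlier in mmlu_adapter.py and is not reproduced here):

```python
# Illustrative entries only; each value ends with the subject's domain,
# which is what the adapter keeps for per-category reporting.
SUBJECT_MAPPING = {
    'computer_network': ['Computer Network', 'computer science', 'STEM'],
    'philosophy': ['Philosophy', 'humanities', 'Humanities'],
}

category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
print(category_map)  # {'computer_network': 'STEM', 'philosophy': 'Humanities'}
```

The removed `gen_report` method performed the same grouping with `SUBJECT_MAPPING.get(subset_name)[2]`; per-domain report assembly appears to move to the new evalscope/report package added in this release.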
evalscope/benchmarks/mmlu_pro/__init__.py: File without changes
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py (new file)

@@ -0,0 +1,110 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='mmlu_pro',
+    dataset_id='modelscope/mmlu-pro',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='validation',
+    eval_split='test',
+    prompt_template=
+    'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.',  # noqa: E501
+)
+class MMLUProAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.categories = [
+            'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+            'philosophy', 'economics', 'other', 'psychology', 'history'
+        ]
+
+    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
+        """
+        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
+        Return a dict with category as key and list of prompts as value.
+        """
+
+        data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
+        fewshot_prompts = self.get_fewshot_examples(data_dict)
+
+        # Use the category as key to group the prompts
+        res_dict = defaultdict(list)
+        # generate prompts for each test sample
+        for entry in data_dict[self.eval_split]:
+            prefix = fewshot_prompts[entry['category']]
+            query = prefix + 'Q: ' + entry['question'] + '\n' + \
+                self.__form_options(entry['options']) + '\n'
+
+            prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
+
+            res_dict[entry['category']].append(prompt_d)
+        return res_dict
+
+    def get_fewshot_examples(self, data_dict: dict):
+        # load 5-shot prompts for each category
+        prompts = {c: '' for c in self.categories}
+        for d in data_dict[self.train_split]:
+            prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
+                self.__form_options(d['options']) + '\n' + \
+                d['cot_content'] + '\n\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
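To make the few-shot assembly concrete, here is a standalone sketch that mirrors (but does not import) the adapter's private helpers. The two records are fabricated and only follow the field names used above: `category`, `question`, `options`, `cot_content`.

```python
def form_options(options, choices='ABCDEFGHIJ'):
    # Mirrors MMLUProAdapter.__form_options for illustration.
    out = 'Options are:\n'
    for opt, choice in zip(options, choices):
        out += f'({choice}): {opt}\n'
    return out


# Fabricated validation (few-shot) and test rows.
train_row = {
    'category': 'math',
    'question': 'What is 2 + 2?',
    'options': ['3', '4', '5'],
    'cot_content': "A: Let's think step by step. 2 + 2 = 4. The answer is (B).",
}
test_row = {'category': 'math', 'question': 'What is 3 + 3?', 'options': ['5', '6', '7']}

# Few-shot prefix for the category, then the test question is appended, so the
# model is steered to end with `The answer is (X)`, which parse_first_option extracts.
prefix = ('Q: ' + train_row['question'] + '\n' + form_options(train_row['options']) + '\n'
          + train_row['cot_content'] + '\n\n')
query = prefix + 'Q: ' + test_row['question'] + '\n' + form_options(test_row['options']) + '\n'
print(query)
```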
evalscope/benchmarks/race/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
-from evalscope.benchmarks.race.race_adapter import RACEAdapter
-from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
|