evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ifeval/utils.py
ADDED
@@ -0,0 +1,134 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from evalscope.benchmarks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and 'prompt' in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        if response.strip() and instruction.check_following(response):
+            is_following_list.append(True)
+        else:
+            is_following_list.append(False)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def test_instruction_following_loose(
+    inp,
+    response,
+):
+    """Tests response for an upper bound for following instructions."""
+    r = response.split('\n')
+    response_remove_first = '\n'.join(r[1:]).strip()
+    response_remove_last = '\n'.join(r[:-1]).strip()
+    response_remove_both = '\n'.join(r[1:-1]).strip()
+    revised_response = response.replace('*', '')
+    revised_response_remove_first = response_remove_first.replace('*', '')
+    revised_response_remove_last = response_remove_last.replace('*', '')
+    revised_response_remove_both = response_remove_both.replace('*', '')
+    all_responses = [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        revised_response_remove_first,
+        revised_response_remove_last,
+        revised_response_remove_both,
+    ]
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+        kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+        instruction.build_description(**kwargs)
+        args = instruction.get_instruction_args()
+        if args and 'prompt' in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        is_following = False
+        for r in all_responses:
+            if r.strip() and instruction.check_following(r):
+                is_following = True
+                break
+
+        is_following_list.append(is_following)
+
+    return OutputExample(
+        instruction_id_list=inp.instruction_id_list,
+        prompt=inp.prompt,
+        response=response,
+        follow_all_instructions=all(is_following_list),
+        follow_instruction_list=is_following_list,
+    )
+
+
+def process_results(doc, results):
+    inp = InputExample(
+        key=doc['key'],
+        instruction_id_list=doc['instruction_id_list'],
+        prompt=doc['prompt'],
+        kwargs=doc['kwargs'],
+    )
+    response = results[0]
+
+    out_strict = test_instruction_following_strict(inp, response)
+    out_loose = test_instruction_following_loose(inp, response)
+
+    return {
+        'prompt_level_strict_acc': out_strict.follow_all_instructions,
+        'inst_level_strict_acc': out_strict.follow_instruction_list,
+        'prompt_level_loose_acc': out_loose.follow_all_instructions,
+        'inst_level_loose_acc': out_loose.follow_instruction_list,
+    }
+
+
+def agg_inst_level_acc(items):
+    flat_items = [item for sublist in items for item in sublist]
+    inst_level_acc = sum(flat_items) / len(flat_items)
+    return inst_level_acc
evalscope/benchmarks/iquiz/__init__.py
File without changes
evalscope/benchmarks/iquiz/iquiz_adapter.py
ADDED
@@ -0,0 +1,63 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='iquiz',
+    dataset_id='AI-ModelScope/IQuiz',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['IQ', 'EQ'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。', # noqa: E501
+)
+class IQuizAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        example:
+        {
+            "question":"天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
+            "choices":["星期一","星期二","星期三","星期四"],
+            "answer":"D",
+            "level":1
+        }
+        """
+        prompt = f"问题: {input_d['question']}\n"
+        prompt += self.__form_options(input_d['choices'])
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+    def __form_options(self, options: list):
+        option_str = '选项:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
@@ -141,7 +141,7 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/mmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=5,
     train_split='train',
     eval_split='test',
@@ -160,17 +160,19 @@ class MMLUAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
             data_dict[subset_name] = {}
 
             for split_name in [self.train_split, self.eval_split]:
-                if
+                if split_name == 'train':
                     split_name_suffix = 'dev'
-                elif
+                elif split_name == 'test':
                     split_name_suffix = 'test'
-                elif
+                elif split_name == 'validation':
                     split_name_suffix = 'val'
                 else:
                     raise ValueError(f'Invalid split name: {split_name}')
@@ -229,7 +231,7 @@ class MMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -259,84 +261,6 @@ class MMLUAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"MMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'mmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
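The category_map added in __init__ keeps only the per-subset category, presumably for the new report code under evalscope/report/ now that the adapter's own gen_report was removed. A tiny hypothetical sketch of what it holds, assuming (as the removed `[2]` index and the new `v[-1]` suggest) that each SUBJECT_MAPPING value is a list whose last element is the category; the entries below are illustrative only.

# Hypothetical, abbreviated SUBJECT_MAPPING entries for illustration only.
SUBJECT_MAPPING = {
    'computer_network': ['Computer Network', 'computer science', 'STEM'],
    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
}
category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
# -> {'computer_network': 'STEM', 'philosophy': 'Humanities'}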
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
@@ -13,7 +13,7 @@ from evalscope.utils.utils import ResponseParser
     dataset_id='modelscope/mmlu-pro',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=5,
     train_split='validation',
     eval_split='test',
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -4,9 +4,9 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -14,15 +14,13 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-
 
 @Benchmark.register(
     name='race',
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -84,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -114,73 +112,6 @@ class RACEAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"RACE",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"High",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"high",
-                            "score":0.2528
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                normalize_score(score=domain_weighted_avg_acc),
-                'subset': [{
-                    'name': subset_name,
-                    'score': subset_score
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'race',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -1,15 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
 import csv
-import numpy as np
 import os
-from typing import List
 
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import exact_match
+from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 
@@ -23,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
@@ -104,7 +101,7 @@ class TriviaQaAdapter(DataAdapter):
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context
 
-        return {'data': [full_prompt], 'system_prompt': prompt}
+        return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -9,8 +9,7 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import weighted_mean
+from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils import get_logger, normalize_score
 
@@ -26,7 +25,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -260,7 +259,7 @@
 
         return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-    def compute_metric(self, review_res_list: List[dict]) ->
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
        """
         Compute evaluation result by specific metric for each subset.
 
@@ -285,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
             logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-
-
-
-
-
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TruthfulQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.2527,
-                    "subset":[
-                        {
-                            "name":"multiple_choice",
-                            "score":0.3157
-                        },
-                        # {
-                        #     "name":"generation",
-                        #     "score":0.2631
-                        # }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'truthful_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{
+            'metric_name': self.metric_list[0].name,
+            'score': self.metric_list[0].object(mc2_list),
+            'num': len(mc2_list)
+        }]
evalscope/cli/cli.py
CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 
+from evalscope.cli.start_app import StartAppCMD
 from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
 
@@ -12,6 +13,7 @@ def run_cmd():
 
     PerfBenchCMD.define_args(subparsers)
     EvalCMD.define_args(subparsers)
+    StartAppCMD.define_args(subparsers)
 
     args = parser.parse_args()
 
evalscope/cli/start_app.py
ADDED
@@ -0,0 +1,29 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from argparse import ArgumentParser
+
+from evalscope.cli.base import CLICommand
+from evalscope.report.app import create_app
+
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return StartAppCMD(args)
+
+
+class StartAppCMD(CLICommand):
+    name = 'app'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(StartAppCMD.name)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        create_app()