evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +31 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +11 -2
- evalscope/config.py +10 -2
- evalscope/constants.py +7 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/http_client.py +6 -4
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +5 -4
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +135 -28
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
evalscope/benchmarks/aime/aime24_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import OutputType
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()

 @Benchmark.register(
     name='aime24',
+    pretty_name='AIME-2024',
     dataset_id='HuggingFaceH4/aime_2024',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME24Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/aime/aime25_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import OutputType
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()

 @Benchmark.register(
     name='aime25',
+    pretty_name='AIME-2025',
     dataset_id='TIGER-Lab/AIME25',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME25Adapter(DataAdapter):
         problem = input_d['question']
         full_prompt = self.prompt_template.format(query=problem)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
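Taken together, the two AIME hunks show the adapter pattern this release moves to: the model-adapter class import disappears, a pretty_name is registered for reporting, and gen_prompt now ends with return self.gen_prompt_data(full_prompt). A minimal sketch of that pattern, with hypothetical benchmark and dataset names:

from evalscope.benchmarks import Benchmark, DataAdapter


@Benchmark.register(
    name='my_math_bench',            # hypothetical benchmark name
    pretty_name='My-Math-Bench',     # new in 0.12.1: human-readable name for reports
    dataset_id='org/my_math_bench',  # hypothetical dataset id
    subset_list=['default'],
    metric_list=['AveragePass@1'],
    few_shot_num=0,
)
class MyMathBenchAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs):
        full_prompt = self.prompt_template.format(query=input_d['problem'])
        # 0.12.1: let the base class package the prompt dict
        return self.gen_prompt_data(full_prompt)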
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -4,9 +4,8 @@ import json
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -17,19 +16,20 @@ logger = get_logger()

 @Benchmark.register(
     name='arc',
+    pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
-    prompt_template=
+    prompt_template=
+    'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n{query}',
 )
 class ARCAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         few_shot_num = kwargs.get('few_shot_num', None)
         if few_shot_num is None:
@@ -42,6 +42,8 @@ class ARCAdapter(DataAdapter):

         super().__init__(**kwargs)

+        self.choices = ['A', 'B', 'C', 'D']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
         Load the dataset from local disk.
@@ -60,7 +62,7 @@ class ARCAdapter(DataAdapter):
             for split_name in ['Train', 'Test']:
                 split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
                 if os.path.exists(split_path):
-                    with open(split_path, 'r', errors='ignore') as in_f:
+                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
                         rows = []
                         for line in in_f:
                             item = json.loads(line.strip())
@@ -107,12 +109,11 @@ class ARCAdapter(DataAdapter):
             {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
         """
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context
+        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)

-
-        full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
+        full_prompt = self.prompt_template.format(query=context)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -130,14 +131,10 @@ class ARCAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
-
+            return ResponseParser.parse_first_capital(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
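The ARC hunk above (and the C-Eval, C-MMLU and General MCQ hunks below) replaces the eval_type-based branching in parse_pred_result with a check on the adapter's own output mode. A condensed, self-contained sketch of that switch; the class and constructor are stand-ins and the method signature is assumed from the hunk context (ARC uses parse_first_capital, the other MCQ adapters use parse_first_option_with_choices):

from evalscope.constants import OutputType
from evalscope.utils import ResponseParser


class McqParseSketch:
    """Stand-in for ARCAdapter-style answer parsing; not the real class."""

    def __init__(self, model_adapter: str, choices: list):
        self.model_adapter = model_adapter  # e.g. OutputType.MULTIPLE_CHOICE or OutputType.GENERATION
        self.choices = choices              # e.g. ['A', 'B', 'C', 'D']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = None) -> str:
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            # choice mode already yields a bare option letter
            return result
        # generation mode: recover the option letter from free-form text
        return ResponseParser.parse_first_capital(text=result, options=self.choices)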
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -8,8 +8,6 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics import exact_match
-from evalscope.models.chat_adapter import ChatGenerationModelAdapter
-from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -60,8 +58,8 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST

 @Benchmark.register(
     name='bbh',
+    pretty_name='BBH',
     dataset_id='modelscope/bbh',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=3,
@@ -94,7 +92,7 @@ class BBHAdapter(DataAdapter):
             else:
                 file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
                 if os.path.exists(file_path):
-                    with open(file_path, 'r') as f:
+                    with open(file_path, 'r', encoding='utf-8') as f:
                         examples = json.load(f)['examples']
                         if subset_name in data_dict:
                             data_dict[subset_name].update({split_name: examples})
@@ -125,7 +123,7 @@ class BBHAdapter(DataAdapter):
             cot_prompts = ''
         full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

-        return
+        return self.gen_prompt_data(full_prompt)

     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -153,7 +151,9 @@ class BBHAdapter(DataAdapter):
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
             if self.few_shot_num > 0:
-                with open(
+                with open(
+                        os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
+                        encoding='utf-8') as f:
                     cot_prompt_str = f.read()
                 few_shot_data = [cot_prompt_str]

evalscope/benchmarks/benchmark.py
CHANGED
@@ -1,12 +1,13 @@
 import copy
+from collections import OrderedDict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional

+from evalscope.constants import OutputType
+
 if TYPE_CHECKING:
     from evalscope.benchmarks import DataAdapter

-    from evalscope.models import BaseModelAdapter
-
 BENCHMARK_MAPPINGS = {}


@@ -15,8 +16,9 @@ class BenchmarkMeta:
     name: str
     dataset_id: str
     data_adapter: 'DataAdapter'
-    model_adapter:
-
+    model_adapter: Optional[str] = OutputType.GENERATION
+    output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
     metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
@@ -26,6 +28,7 @@ class BenchmarkMeta:
     system_prompt: Optional[str] = None
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
+    filters: Optional[OrderedDict] = None

     def _update(self, args: dict):
         if args.get('local_path'):
@@ -39,10 +42,7 @@ class BenchmarkMeta:
     def to_string_dict(self) -> dict:
         cur_dict = copy.deepcopy(self.__dict__)
         # cur_dict['data_adapter'] = self.data_adapter.__name__
-        # cur_dict['model_adapter'] = self.model_adapter.__name__
-        # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
-        del cur_dict['model_adapter']
         return cur_dict

     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -66,13 +66,13 @@ class Benchmark:
         return benchmark

     @classmethod
-    def register(cls, name: str, dataset_id: str,
+    def register(cls, name: str, dataset_id: str, **kwargs):

         def register_wrapper(data_adapter):
             if name in BENCHMARK_MAPPINGS:
                 raise Exception(f'Benchmark {name} already registered')
             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
-                name=name, data_adapter=data_adapter,
+                name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
             return data_adapter

         return register_wrapper
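With register(cls, name, dataset_id, **kwargs) forwarding everything into BenchmarkMeta, the new metadata fields (a string model_adapter defaulting to OutputType.GENERATION, output_types, filters) can be supplied straight through the decorator. A hedged sketch of such a call; the benchmark name, dataset id and the empty filters mapping are illustrative, since the expected filter entries are not shown in this diff:

from collections import OrderedDict

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import OutputType


@Benchmark.register(
    name='demo_bench',                    # hypothetical
    dataset_id='org/demo_bench',          # hypothetical
    model_adapter=OutputType.GENERATION,  # a string constant now, not an adapter class
    output_types=[OutputType.GENERATION],
    filters=OrderedDict(),                # new optional field; contents illustrative
    metric_list=['AverageAccuracy'],
)
class DemoAdapter(DataAdapter):
    """Placeholder; a real adapter implements gen_prompt, get_gold_answer, parse_pred_result and match."""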
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -3,9 +3,8 @@ import csv
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -126,8 +125,10 @@ SUBJECT_MAPPING = {

 @Benchmark.register(
     name='ceval',
+    pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -137,8 +138,6 @@ SUBJECT_MAPPING = {
 )
 class CEVALAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):

         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -148,6 +147,7 @@ class CEVALAdapter(DataAdapter):
         super().__init__(**kwargs)

         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -207,7 +207,7 @@ class CEVALAdapter(DataAdapter):
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -225,22 +225,17 @@ class CEVALAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-
-    def _format_example(cls, input_d: dict, include_answer=True):
+    def _format_example(self, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
-        for choice in
+        for choice in self.choices:
             example += f'\n{choice}. {input_d[f"{choice}"]}'

         if include_answer:
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -4,9 +4,8 @@ import csv
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -103,8 +102,10 @@ SUBJECT_MAPPING = {

 @Benchmark.register(
     name='cmmlu',
+    pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -114,12 +115,11 @@ SUBJECT_MAPPING = {
 )
 class CMMLUAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -172,7 +172,7 @@ class CMMLUAdapter(DataAdapter):

         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -190,26 +190,21 @@ class CMMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:

         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]

         example: str = input_d['Question']
-        for j in range(len(
-            example += '\n{}. {}'.format(
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])

         example += '\nAnswer:'
         if include_answer:
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -18,8 +18,8 @@ logger = get_logger()

 @Benchmark.register(
     name='competition_math',
+    pretty_name='MATH',
     dataset_id='modelscope/competition_math',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=4,
@@ -58,7 +58,7 @@ class CompetitionMathAdapter(DataAdapter):
             split_data = []
             for file_path in split_files:
                 if os.path.exists(file_path):
-                    with open(file_path, 'r') as f:
+                    with open(file_path, 'r', encoding='utf-8') as f:
                         split_data.append(json.load(f))
             data_dict[subset_name][split_name] = split_data

@@ -81,7 +81,7 @@ class CompetitionMathAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
         full_prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, List, Optional, Union

+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -18,6 +19,7 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
@@ -48,6 +50,7 @@ class DataAdapter(ABC):
         """
         self.name = name
         self.dataset_id = dataset_id
+        self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -59,6 +62,15 @@ class DataAdapter(ABC):
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
+        self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)

     def load(self,
              dataset_name_or_path: str = None,
@@ -78,11 +90,15 @@ class DataAdapter(ABC):

         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=False, **kwargs)
         else:
-
-
-
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=True, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
         return data_dict

     def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
@@ -91,8 +107,7 @@ class DataAdapter(ABC):
         datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
         split_as_subset: bool = kwargs.pop('split_as_subset', False)
         # Load dataset from remote
-        logger.info(
-            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')

         data_dict = {}
         split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
@@ -133,21 +148,7 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-
-
-        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                    subsets: {subset_list}')
-        data_dict = {}
-        subset_list = subset_list or self.subset_list
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        for sub_name in subset_list:
-            data_dict[sub_name] = {}
-            # e.g. train: few-shot, test: target dataset to evaluate
-            for split in split_list:
-                dataset = MsDataset.load(
-                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
-                data_dict[sub_name].update({split: dataset})
-        return data_dict
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)

     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
@@ -285,6 +286,12 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)

+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
@@ -348,3 +355,6 @@ class DataAdapter(ABC):
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
+
+    def llm_match(self, *args, **kwargs):
+        pass
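Two new hooks in data_adapter.py are worth reading together: gen_prompt_data wraps a prompt into a PromptData record (data list, multi_choices, system_prompt), and __init_subclass__ rewraps every subclass's parse_pred_result with preprocess_decorator from evalscope/benchmarks/utils.py, presumably so the configured preprocessing/filters run before the adapter's own parsing. A sketch of a subclass that relies on both; the input_d field name is hypothetical and the decorator's exact behaviour is not shown in this diff:

from evalscope.benchmarks import DataAdapter


class SketchQAAdapter(DataAdapter):  # hypothetical subclass, for illustration only

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs):
        # gen_prompt_data builds PromptData(data=[prompt], multi_choices=self.choices,
        # system_prompt=self.system_prompt) and returns it as a dict
        return self.gen_prompt_data(self.prompt_template.format(query=input_d['query']))

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = None) -> str:
        # __init_subclass__ has already rewrapped this method with preprocess_decorator,
        # so any registered filters see `result` before this body does (assumed behaviour)
        return result.strip()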
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
CHANGED
@@ -3,9 +3,8 @@ import csv
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -16,8 +15,10 @@ logger = get_logger()

 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -27,11 +28,11 @@ logger = get_logger()
     query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):

         full_prompt = self.prompt_template.format(query=context)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,14 +104,10 @@ class GeneralMCQAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)