evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +31 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +11 -2
- evalscope/config.py +10 -2
- evalscope/constants.py +7 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/http_client.py +6 -4
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +5 -4
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +135 -28
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
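The release adds two new benchmark packages (simple_qa and super_gpqa) alongside a new model-adapter registry (evalscope/models/register.py). Below is a minimal, hypothetical sketch of how the new datasets might be invoked through evalscope's documented TaskConfig/run_task entry point; the dataset name 'simple_qa' is taken from the registration shown further down, while 'super_gpqa' and the model id are assumptions for illustration only.

# Hypothetical smoke-test run of the benchmarks added in 0.12.1.
# 'simple_qa' matches the registration in simple_qa_adapter.py shown below;
# 'super_gpqa' and the model id are assumed for illustration only.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['simple_qa', 'super_gpqa'],
    limit=10,  # evaluate only a few samples per dataset
)
run_task(task_cfg=task_cfg)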
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -16,9 +16,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -31,18 +30,31 @@ class GeneralQAAdapter(DataAdapter):

         super().__init__(**kwargs)

-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list

-
+        data_file_dict = defaultdict(str)
         data_list = []

+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}

         return data_dict

@@ -65,7 +77,7 @@ class GeneralQAAdapter(DataAdapter):

         query = input_d.get('question', '') or input_d.get('query', '')
         prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -100,10 +112,12 @@ class GeneralQAAdapter(DataAdapter):

         """
         res = dict()
-
-
-
-
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
         return res

     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
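Note on the general_qa change above: load now accepts a local path and resolves it to per-subset JSONL files, with a directory mapping each name in subset_list to <dir>/<subset>.jsonl and a single file becoming one subset named after the file. A standalone sketch of that resolution step (the paths below are made up):

# Minimal sketch of the path resolution added to GeneralQAAdapter.load;
# the example paths are hypothetical.
import os
from collections import defaultdict

def resolve_data_files(dataset_name_or_path: str, subset_list: list) -> dict:
    data_file_dict = defaultdict(str)
    if os.path.isdir(dataset_name_or_path):
        # a directory: one JSONL file per requested subset
        for subset_name in subset_list:
            data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
    elif os.path.isfile(dataset_name_or_path):
        # a single file: its basename (without extension) becomes the subset name
        data_file_dict[os.path.basename(dataset_name_or_path).split('.')[0]] = dataset_name_or_path
    else:
        raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
    return dict(data_file_dict)

# resolve_data_files('data/general_qa', ['default'])  -> {'default': 'data/general_qa/default.jsonl'}
# resolve_data_files('data/my_qa.jsonl', ['default']) -> {'my_qa': 'data/my_qa.jsonl'}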
evalscope/benchmarks/gpqa/gpqa_adapter.py

@@ -3,15 +3,16 @@ import random
 import re

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter


 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n' # noqa: E501
-            self.prompt_prefix += open(
-
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'

@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}" # noqa: E501

         prompt = self.prompt_template.format(query=query)
-        return
+        return self.gen_prompt_data(prompt)

     def __process_input(self, input_d: dict) -> dict:

@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)

     def match(self, gold: str, pred: str) -> float:
         """
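The GPQA hunks above illustrate the pattern applied across the adapters in this release: the concrete model-adapter classes are dropped from the registrations in favour of OutputType constants plus an output_types list, and parse_pred_result now branches on self.model_adapter rather than on eval_type. A schematic sketch of that pattern, using only names that appear in the diff (the class itself is illustrative, not the real GPQAAdapter):

# Schematic sketch of the OutputType-based parsing pattern introduced in 0.12.1.
# OutputType members and ResponseParser come from the diff; the adapter body is
# illustrative only.
from evalscope.constants import OutputType
from evalscope.utils.utils import ResponseParser

class ExampleAdapter:  # hypothetical, not the real GPQAAdapter
    def __init__(self, model_adapter: str = OutputType.GENERATION, choices=None):
        self.model_adapter = model_adapter
        self.choices = choices or ['A', 'B', 'C', 'D']

    def parse_pred_result(self, result: str, *args, **kwargs) -> str:
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            # choice/logits adapters already return the option letter
            return result
        # free-form generation: extract the first option letter from the text
        return ResponseParser.parse_first_option_with_choices(result, self.choices)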
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -6,7 +6,6 @@ import os
 import re

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

@@ -15,8 +14,8 @@ logger = get_logger()

 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):

         full_prompt = context + self.prompt_template.format(query=input_d['question'])

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -4,9 +4,8 @@ import os
 import re

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,8 +17,10 @@ logger = get_logger()

 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=
+    model_adapter=OutputType.CONTINUOUS,
+    output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -30,8 +31,6 @@ logger = get_logger()
 )
 class HellaSwagAdapter(DataAdapter):

-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):

         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +39,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0

         super().__init__(**kwargs)
+        self.choices = ['0', '1', '2', '3']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -89,7 +89,7 @@ class HellaSwagAdapter(DataAdapter):

         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]

-        return
+        return self.gen_prompt_data(ctx_continuation_pair_list)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -107,7 +107,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.CONTINUOUS:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -115,12 +115,8 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)

             return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
         else:
-
+            return ResponseParser.parse_first_option(result)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
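For HellaSwag under OutputType.CONTINUOUS, the model returns one log-likelihood per candidate ending, and the adapter picks the ending with the highest length-normalized score. completion_len is computed outside the shown hunk; character length is assumed in the sketch below, and the numbers are invented:

# Length-normalized continuation scoring, mirroring HellaSwagAdapter.parse_pred_result
# for OutputType.CONTINUOUS. Scores and endings are made-up examples; using
# character length for completion_len is an assumption.
import numpy as np

endings = ['walks away.', 'keeps running down the street toward the finish line.']
loglikelihoods = np.array([-14.2, -31.0])            # summed log-probs per ending
completion_len = np.array([float(len(e)) for e in endings])

best_choice_idx = np.argmax(loglikelihoods / completion_len)
print(str(best_choice_idx))                          # index of the selected ending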
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -13,8 +13,8 @@ logger = get_logger()

 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
@@ -66,7 +66,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)

-        return
+        return self.gen_prompt_data(full_prompt)

     @classmethod
     def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter


 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return
+        return self.gen_prompt_data(input_d['prompt'])

     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
evalscope/benchmarks/iquiz/iquiz_adapter.py

@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser


 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return
+        return self.gen_prompt_data(prompt)

     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/math_500/math_500_adapter.py

@@ -1,9 +1,5 @@
-from collections import defaultdict
-
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -13,8 +9,8 @@ logger = get_logger()

 @Benchmark.register(
     name='math_500',
+    pretty_name='MATH-500',
     dataset_id='AI-ModelScope/MATH-500',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -40,7 +36,7 @@ class Math500Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -3,9 +3,8 @@ import csv
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -136,8 +135,10 @@ SUBJECT_MAPPING = {

 @Benchmark.register(
     name='mmlu',
+    pretty_name='MMLU',
     dataset_id='modelscope/mmlu',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -147,8 +148,6 @@ SUBJECT_MAPPING = {
 )
 class MMLUAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):

         few_shot_num = kwargs.get('few_shot_num', 5)
@@ -159,6 +158,7 @@ class MMLUAdapter(DataAdapter):
         super().__init__(**kwargs)

         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -227,7 +227,7 @@ class MMLUAdapter(DataAdapter):

         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -245,26 +245,21 @@ class MMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-
+            return ResponseParser.parse_first_option(result, self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:

         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]

         example: str = input_d['input']
-        for j in range(len(
-            example += '\n{}. {}'.format(
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])

         example += '\nAnswer:'
         if include_answer:
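_generate_prompt in MMLUAdapter is now an instance method driven by self.choices. The short sketch below reconstructs the prompt text it builds for one item; the question and options are invented, and the exact formatting appended when include_answer=True is not part of the shown hunk:

# Illustrative reconstruction of the MMLU-style prompt built above.
# The question and options are invented for this example.
choices = ['A', 'B', 'C', 'D']
input_d = {
    'input': "Which gas makes up most of Earth's atmosphere?",
    'A': 'Oxygen', 'B': 'Nitrogen', 'C': 'Carbon dioxide', 'D': 'Argon',
}

example = input_d['input']
for j in range(len(choices)):
    example += '\n{}. {}'.format(choices[j], input_d[choices[j]])
example += '\nAnswer:'
print(example)
# Which gas makes up most of Earth's atmosphere?
# A. Oxygen
# B. Nitrogen
# C. Carbon dioxide
# D. Argon
# Answer: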
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py

@@ -2,9 +2,8 @@ from collections import defaultdict
 from typing import Any, Dict

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser

 SUBSET_LIST = [
@@ -15,8 +14,10 @@ SUBSET_LIST = [

 @Benchmark.register(
     name='mmlu_pro',
+    pretty_name='MMLU-Pro',
     dataset_id='modelscope/MMLU-Pro',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -47,7 +48,7 @@ class MMLUProAdapter(DataAdapter):
             self.__form_options(input_d['options']) + '\n'

         full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-        return
+        return self.gen_prompt_data(full_prompt)

     def format_fewshot_examples(self, few_shot_list):
         # load few-shot prompts for each category
@@ -88,7 +89,10 @@ class MMLUProAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/musr/musr_adapter.py

@@ -2,9 +2,8 @@ import ast
 from typing import Any

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser


@@ -12,7 +11,8 @@ from evalscope.utils.utils import ResponseParser
     name='musr',
     pretty_name='MuSR',
     dataset_id='AI-ModelScope/MuSR',
-    model_adapter=
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -41,7 +41,7 @@ class MuSRAdapter(DataAdapter):
         full_prompt = self.prompt_template.format(
             narrative=input_d['narrative'], question=input_d['question'], choices=choices)

-        return
+        return self.gen_prompt_data(full_prompt)

     def format_choice(self, options: list):
         option_str = ''
@@ -59,7 +59,10 @@ class MuSRAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)

     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/process_bench/process_bench_adapter.py

@@ -5,7 +5,6 @@ from typing import Any, List
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
 from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
-from evalscope.models import ChatGenerationModelAdapter

 cur_path = os.path.dirname(os.path.abspath(__file__))

@@ -14,7 +13,6 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
     name='process_bench',
     pretty_name='ProcessBench',
     dataset_id='Qwen/ProcessBench',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
     few_shot_num=0,
@@ -26,7 +24,7 @@ class ProcessBenchAdapter(DataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt'), encoding='utf-8').read()

         # register metrics
         metric_registry.register(Metric(name='error_acc', object=mean))
@@ -50,7 +48,7 @@ class ProcessBenchAdapter(DataAdapter):

         full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -84,7 +82,12 @@ class ProcessBenchAdapter(DataAdapter):
                 correct_data.append(res)
             else:
                 error_data.append(res)
-        data = {
+        data = {}
+        if len(correct_data) != 0:
+            data.update({'correct_acc': correct_data})
+        if len(error_data) != 0:
+            data.update({'error_acc': error_data})
+        data.update({'simple_f1_score': (correct_data, error_data)})
         return super().compute_metric(data)

     @staticmethod
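compute_metric in ProcessBenchAdapter now regroups the per-sample review results by metric name before delegating to the base class: one list for reviews of solutions judged correct, one for reviews of solutions containing a labelled error, and both lists together for the F1 metric. The sketch below shows one way the grouped inputs could be reduced, assuming each review result is a 0/1 score and that simple_f1_score amounts to the harmonic mean of the two per-subset accuracies (an assumption; its implementation is not part of this diff):

# Sketch of reducing the regrouped ProcessBench metric inputs.
# Treating simple_f1_score as the harmonic mean of the two accuracies is an
# assumption about the library; the per-sample scores below are made up.
def mean(xs):
    return sum(xs) / len(xs) if xs else 0.0

def harmonic_f1(correct_data, error_data):
    correct_acc = mean(correct_data)
    error_acc = mean(error_data)
    if correct_acc + error_acc == 0:
        return 0.0
    return 2 * correct_acc * error_acc / (correct_acc + error_acc)

correct_data = [1, 0, 1, 1]   # reviews of solutions with no labelled error
error_data = [1, 1, 0]        # reviews of solutions with a labelled error step
print(harmonic_f1(correct_data, error_data))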
evalscope/benchmarks/race/race_adapter.py

@@ -3,9 +3,8 @@
 import os

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -17,8 +16,10 @@ logger = get_logger()

 @Benchmark.register(
     name='race',
+    pretty_name='RACE',
     dataset_id='modelscope/race',
-    model_adapter=
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['high', 'middle'],
     metric_list=['AverageAccuracy'],
     few_shot_num=3,
@@ -27,8 +28,6 @@ logger = get_logger()
 )
 class RACEAdapter(DataAdapter):

-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         few_shot_num = kwargs.get('few_shot_num', 3)
         if few_shot_num > 3:
@@ -37,6 +36,8 @@ class RACEAdapter(DataAdapter):

         super().__init__(**kwargs)

+        self.choices = ['A', 'B', 'C', 'D']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -82,7 +83,7 @@ class RACEAdapter(DataAdapter):

         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-        return
+        return self.gen_prompt_data(full_prompt)

     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -100,26 +101,21 @@ class RACEAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
         else:
-
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:

         input_choices: list = input_d['options']

         example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
-        for j in range(len(
-            example += '\n{}. {}'.format(
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])

         example += '\nAnswer:'
         if include_answer:
evalscope/benchmarks/simple_qa/__init__.py: file without changes
evalscope/benchmarks/simple_qa/simple_qa_adapter.py (new file)

@@ -0,0 +1,20 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='simple_qa',
+    pretty_name='SimpleQA',
+    dataset_id='AI-ModelScope/SimpleQA',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test')
+class SimpleQAAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
evalscope/benchmarks/super_gpqa/__init__.py: file without changes