evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -2,7 +2,6 @@
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Pass1
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
@@ -17,11 +16,11 @@ logger = get_logger()
     dataset_id='modelscope/humaneval',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
-    metric_list=[
+    metric_list=['Pass@1'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='',
+    prompt_template='Complete the following python code:\n{query}',
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -64,10 +63,10 @@ class HumanevalAdapter(DataAdapter):
            input_d (dict): The raw input. A single data format of the Humaneval:
            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
        """
-
-        full_prompt =
+        query = input_d['prompt']
+        full_prompt = self.prompt_template.format(query=query)
 
-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
    @classmethod
    def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/ifeval_adapter.py
CHANGED
@@ -2,9 +2,9 @@ from collections import defaultdict
 from typing import Any, Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import
+from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
-from evalscope.metrics import Metric, mean
+from evalscope.metrics import Metric, mean, metric_registry
 from evalscope.models import ChatGenerationModelAdapter
 
 
@@ -14,10 +14,10 @@ from evalscope.models import ChatGenerationModelAdapter
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
-
-
-
-
+        'prompt_level_strict_acc',
+        'inst_level_strict_acc',
+        'prompt_level_loose_acc',
+        'inst_level_loose_acc',
     ],
     few_shot_num=0,
     train_split=None,
@@ -29,8 +29,14 @@ class IFEvalAdapter(DataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        # register metrics
+        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.
+        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
@@ -41,16 +47,11 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
             for k, v in res.items():
                 res_dict[k].append(v)
 
-
-        for metric in self.metric_list:
-            metric_name = metric.name
-            pred_value = res_dict[metric_name]
-            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
-        return metrics
+        return super().compute_metric(res_dict)
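Note on the change above: IFEval now registers its four accuracy metrics by name at adapter construction time and delegates aggregation to the base DataAdapter. A minimal standalone sketch of that registration pattern follows; the imports are exactly those shown in the diff, while the toy score list and the direct call to mean() are illustrative assumptions rather than package code.

# Sketch of the metric-registration pattern introduced above (assumptions noted inline).
from evalscope.metrics import Metric, mean, metric_registry

# Register a named metric backed by a simple aggregation function, as the IFEval adapter does.
metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))

# Per-sample review scores for that metric (toy values, not real evaluation output).
scores = [1.0, 0.0, 1.0, 1.0]
print('prompt_level_strict_acc:', mean(scores))  # 0.75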
evalscope/benchmarks/iquiz/iquiz_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import
-from evalscope.metrics import
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
@@ -10,11 +10,11 @@ from evalscope.utils.utils import ResponseParser
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['IQ', 'EQ'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-
+    system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
 )
 class IQuizAdapter(DataAdapter):
 
@@ -36,7 +36,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
evalscope/benchmarks/math_500/__init__.py
File without changes

evalscope/benchmarks/math_500/math_500_adapter.py
ADDED
@@ -0,0 +1,58 @@
+from collections import defaultdict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='math_500',
+    dataset_id='AI-ModelScope/MATH-500',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class Math500Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
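For context, a minimal sketch of how the newly added math_500 benchmark might be run once this release is installed. run_task is referenced elsewhere in this diff (evalscope/cli/start_eval.py); TaskConfig is assumed to live in evalscope.config, and the model id and limit below are placeholders, not values taken from the package.

# Illustrative sketch only; model id and limit are placeholders.
from evalscope.config import TaskConfig  # assumed location of TaskConfig
from evalscope.run import run_task       # referenced in this diff (evalscope/cli/start_eval.py)

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Math-7B-Instruct',  # placeholder model id
    datasets=['math_500'],                  # benchmark name registered by the adapter above
    limit=5,                                # evaluate only a handful of samples
)
run_task(task_cfg)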
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -4,17 +4,15 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/mmlu'
-
 SUBSET_LIST = [
     'high_school_european_history',
     'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/mmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
 )
 class MMLUAdapter(DataAdapter):
 
@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
            {'data': [full_prompt], 'multi_choices': self.choices}
 
        """
-        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name))
        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
        context: str = '\n'.join(few_shot_prompts) + '\n'
        context += self._generate_prompt(input_d=input_d, include_answer=False)
-
+        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -3,22 +3,27 @@ from typing import Any, Dict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
+SUBSET_LIST = [
+    'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+    'philosophy', 'economics', 'other', 'psychology', 'history'
+]
+
 
 @Benchmark.register(
     name='mmlu_pro',
-    dataset_id='modelscope/
+    dataset_id='modelscope/MMLU-Pro',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=
-    metric_list=[AverageAccuracy],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='validation',
     eval_split='test',
     prompt_template=
-    '
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
 )
 class MMLUProAdapter(DataAdapter):
 
@@ -26,38 +31,29 @@ class MMLUProAdapter(DataAdapter):
         super().__init__(**kwargs)
 
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.categories = [
-            'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-            'philosophy', 'economics', 'other', 'psychology', 'history'
-        ]
-
-    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
-        Return a dict with category as key and list of prompts as value.
-        """
-
-        data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            prefix = fewshot_prompts[entry['category']]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
-
-            res_dict[entry['category']].append(prompt_d)
-        return res_dict
 
-    def
-        # load
-
-
-
+    def load(self, **kwargs):
+        # default load all data
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category')
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['options']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
                 self.__form_options(d['options']) + '\n' + \
                 d['cot_content'] + '\n\n'
         return prompts
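The rewritten MMLU-Pro adapter, like math_500 above, loads the dataset's single 'default' subset and then regroups it with reformat_subset (here keyed by 'category'). A toy, dependency-free illustration of that regroup-by-key idea follows; it is not the library's implementation, whose internals are not shown in this diff.

# Toy illustration (not library code) of regrouping a single 'default' subset by a key,
# mirroring reformat_subset(data_dict, subset_key='category') from the diff above.
from collections import defaultdict

data_dict = {'default': {'test': [
    {'question': 'q1', 'category': 'math'},
    {'question': 'q2', 'category': 'law'},
    {'question': 'q3', 'category': 'math'},
]}}

regrouped = defaultdict(lambda: {'test': []})
for entry in data_dict['default']['test']:
    regrouped[entry['category']]['test'].append(entry)

print(sorted(regrouped))  # ['law', 'math']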
evalscope/benchmarks/musr/__init__.py
File without changes

evalscope/benchmarks/musr/musr_adapter.py
ADDED
@@ -0,0 +1,68 @@
+import ast
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='musr',
+    pretty_name='MuSR',
+    dataset_id='AI-ModelScope/MuSR',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.',  # noqa: E501
+)
+class MuSRAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        choices = self.format_choice(ast.literal_eval(input_d['choices']))
+
+        full_prompt = self.prompt_template.format(
+            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_choice(self, options: list):
+        option_str = ''
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return self.choices[input_d['answer_index']]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
evalscope/benchmarks/process_bench/__init__.py
File without changes

evalscope/benchmarks/process_bench/critique_template.txt
ADDED
@@ -0,0 +1,13 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.

evalscope/benchmarks/process_bench/process_bench_adapter.py
ADDED
@@ -0,0 +1,96 @@
+import os
+import re
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
+from evalscope.models import ChatGenerationModelAdapter
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
+@Benchmark.register(
+    name='process_bench',
+    pretty_name='ProcessBench',
+    dataset_id='Qwen/ProcessBench',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ProcessBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+
+        # register metrics
+        metric_registry.register(Metric(name='error_acc', object=mean))
+        metric_registry.register(Metric(name='correct_acc', object=mean))
+        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        problem = input_d['problem']
+        steps = input_d['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return int(input_d['label'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        pred = ProcessBenchAdapter.extract_answer(result)
+        try:
+            pred = int(pred)
+        except Exception:
+            pred = None
+        return pred
+
+    def match(self, gold: int, pred: int) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return gold == pred
+
+    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
+        reviews_list = kwargs['reviews_list']
+        error_data = []
+        correct_data = []
+        for res, raw in zip(review_res_list, reviews_list):
+            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
+                correct_data.append(res)
+            else:
+                error_data.append(res)
+        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        return super().compute_metric(data)
+
+    @staticmethod
+    def extract_answer(solution_text: str):
+        boxed_pattern = r'\\boxed\{([^}]*)\}'
+        matches = re.findall(boxed_pattern, solution_text)
+        if matches:
+            return matches[-1].strip()
+        return None
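ProcessBench scores erroneous and error-free samples separately (error_acc, correct_acc) and combines them through simple_f1_score. Assuming that combination is the harmonic mean of the two accuracies, as in the ProcessBench evaluation protocol, it can be sketched as below; this is an assumption about simple_f1_score's behavior, not code from the package.

# Assumed behavior of the F1-style combination used by ProcessBench (not package code).
def harmonic_f1(correct_acc: float, error_acc: float) -> float:
    # Harmonic mean of accuracy on error-free samples and accuracy on erroneous samples.
    if correct_acc + error_acc == 0:
        return 0.0
    return 2 * correct_acc * error_acc / (correct_acc + error_acc)

print(harmonic_f1(0.8, 0.6))  # ~0.686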
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 
@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -259,7 +258,7 @@ class TruthfulQaAdapter(DataAdapter):
 
        return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
        """
        Compute evaluation result by specific metric for each subset.
 
@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                logger.error(f'** Unknown review_res: {review_res_d}')
 
        # To get mc2 score
-        return [{
-
-
-
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)
evalscope/cli/start_app.py
CHANGED
@@ -3,7 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.report.app import add_argument, create_app
 
 
 def subparser_func(args):
@@ -22,9 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.report.app import add_argument
+
         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.report.app import create_app
+
         create_app(self.args)
evalscope/cli/start_eval.py
CHANGED
@@ -1,10 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from argparse import ArgumentParser
 
-from evalscope.arguments import add_argument
 from evalscope.cli.base import CLICommand
-from evalscope.run import run_task
 
 
 def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.arguments import add_argument
+
         parser = parsers.add_parser(EvalCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.run import run_task
+
         run_task(self.args)