evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (58)
  1. evalscope/arguments.py +2 -0
  2. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  3. evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
  4. evalscope/benchmarks/benchmark.py +3 -1
  5. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
  6. evalscope/benchmarks/data_adapter.py +71 -18
  7. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
  8. evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
  9. evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
  11. evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
  12. evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
  13. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
  14. evalscope/benchmarks/musr/__init__.py +0 -0
  15. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  16. evalscope/benchmarks/process_bench/__init__.py +0 -0
  17. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  18. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  19. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
  20. evalscope/cli/start_app.py +4 -1
  21. evalscope/cli/start_eval.py +4 -3
  22. evalscope/cli/start_perf.py +4 -2
  23. evalscope/collections/evaluator.py +6 -0
  24. evalscope/config.py +3 -1
  25. evalscope/evaluator/evaluator.py +3 -1
  26. evalscope/metrics/__init__.py +2 -1
  27. evalscope/metrics/metrics.py +23 -2
  28. evalscope/models/base_adapter.py +7 -1
  29. evalscope/models/chat_adapter.py +1 -1
  30. evalscope/models/local_model.py +3 -2
  31. evalscope/models/server_adapter.py +79 -28
  32. evalscope/perf/__init__.py +0 -1
  33. evalscope/perf/arguments.py +5 -1
  34. evalscope/perf/http_client.py +2 -2
  35. evalscope/perf/plugin/api/openai_api.py +11 -1
  36. evalscope/perf/utils/benchmark_util.py +6 -2
  37. evalscope/report/app.py +12 -8
  38. evalscope/run.py +1 -1
  39. evalscope/third_party/thinkbench/__init__.py +3 -0
  40. evalscope/third_party/thinkbench/eval.py +264 -0
  41. evalscope/third_party/thinkbench/infer.py +100 -0
  42. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  43. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  44. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  45. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  46. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  47. evalscope/utils/model_utils.py +17 -1
  48. evalscope/utils/utils.py +45 -45
  49. evalscope/version.py +2 -2
  50. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
  51. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
  52. tests/cli/test_run.py +27 -15
  53. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  54. /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
  55. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  56. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  57. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  58. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -71,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+    parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+    parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
     # yapf: enable
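The two new flags extend the existing remote-API options. A hypothetical invocation against an OpenAI-compatible endpoint might look like the following sketch (the evalscope eval subcommand and the --model/--datasets flags are assumed from the existing CLI; the model name and URL are placeholders):

    evalscope eval \
        --model my-served-model \
        --api-url http://127.0.0.1:8801/v1/chat/completions \
        --api-key EMPTY \
        --datasets gsm8k \
        --timeout 600 \
        --stream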
evalscope/benchmarks/aime/aime25_adapter.py ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime25',
+    dataset_id='TIGER-Lab/AIME25',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME25Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['question']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
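A minimal sketch of driving the new benchmark from Python, assuming the package's TaskConfig/run_task entry points and an OpenAI-compatible endpoint (the model id and URL are placeholders):

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    # 'aime25' is the benchmark name registered above; endpoint settings are hypothetical.
    task = TaskConfig(
        model='my-served-model',
        api_url='http://127.0.0.1:8801/v1/chat/completions',
        api_key='EMPTY',
        eval_type='service',
        datasets=['aime25'],
    )
    run_task(task)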
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -171,11 +171,6 @@ class BBHAdapter(DataAdapter):
                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
                 res_dict[sub_name].append(prompt_d)
 
-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict
 
     def get_gold_answer(self, input_d: dict) -> str:
evalscope/benchmarks/benchmark.py CHANGED
@@ -24,6 +24,8 @@ class BenchmarkMeta:
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
     system_prompt: Optional[str] = None
+    query_template: Optional[str] = None
+    pretty_name: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
@@ -59,7 +61,7 @@ class Benchmark:
     @classmethod
     def get(cls, name: str) -> 'BenchmarkMeta':
         if name not in BENCHMARK_MAPPINGS:
-            raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+            raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
         benchmark = BENCHMARK_MAPPINGS[name]
         return benchmark
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -23,7 +23,7 @@ logger = get_logger()
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=4,
-    train_split='train',
+    train_split=None,
     eval_split='test',
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
@@ -43,7 +43,8 @@ class CompetitionMathAdapter(DataAdapter):
     def load(self, **kwargs):
         # default load all levels
         kwargs['subset_list'] = ['default']
-        return super().load(**kwargs)
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level')
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = defaultdict(dict)
@@ -63,21 +64,6 @@ class CompetitionMathAdapter(DataAdapter):
 
         return data_dict
 
-    def gen_prompts(self, data_dict: dict) -> dict:
-        res_dict: dict = defaultdict(list)
-
-        # use level as subset
-        for sub_name, sub_data_dict in data_dict.items():
-            for sample_d in sub_data_dict[self.eval_split]:
-                level = sample_d['level']
-                if level not in self.subset_list:
-                    continue
-                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
-                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
-                res_dict[level].append(prompt_d)
-
-        return res_dict
-
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
evalscope/benchmarks/data_adapter.py CHANGED
@@ -2,6 +2,7 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Any, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
@@ -24,6 +25,8 @@ class DataAdapter(ABC):
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
                  system_prompt: Optional[str] = None,
+                 query_template: Optional[str] = None,
+                 pretty_name: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -52,6 +55,8 @@
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.system_prompt = system_prompt
+        self.query_template = query_template
+        self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
 
@@ -59,7 +64,6 @@
              dataset_name_or_path: str = None,
              subset_list: list = None,
              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
              **kwargs) -> dict:
         """
         Load the dataset. Remote and local datasets are supported.
@@ -74,22 +78,40 @@
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-            from modelscope.msdatasets import MsDataset
+            data_dict = self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+        if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
+            raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        return data_dict
+
+    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
+        from modelscope.msdatasets import MsDataset
+
+        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
+        split_as_subset: bool = kwargs.pop('split_as_subset', False)
+        # Load dataset from remote
+        logger.info(
+            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
-            # Load dataset from remote
-            logger.info(
-                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
-            data_dict = {}
-            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-            if len(split_list) == 0:
-                logger.error(f'Got empty split list: {split_list}')
+        data_dict = {}
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        if len(split_list) == 0:
+            logger.error(f'Got empty split list: {split_list}')
 
+        if split_as_subset:
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = MsDataset.load(
+                        dataset_name=dataset_name_or_path,
+                        split=sub_name,  # load subset from split
+                        cache_dir=work_dir,
+                        hub=datasets_hub,
+                        **kwargs)
+                    data_dict[sub_name].update({split: dataset})
+        else:
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
@@ -101,17 +123,48 @@
                         cache_dir=work_dir,
                         hub=datasets_hub,
                         **kwargs)
-
                     data_dict[sub_name].update({split: dataset})
 
         return data_dict
 
-    def load_from_disk(self, *args, **kwargs) -> dict:
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
         Load the dataset from local disk.
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
+        """
+        from modelscope.msdatasets import MsDataset
+
+        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+            subsets: {subset_list}')
+        data_dict = {}
+        subset_list = subset_list or self.subset_list
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        for sub_name in subset_list:
+            data_dict[sub_name] = {}
+            # e.g. train: few-shot, test: target dataset to evaluate
+            for split in split_list:
+                dataset = MsDataset.load(
+                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
+                data_dict[sub_name].update({split: dataset})
+        return data_dict
+
+    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
+        """
+        Reformat the dataset subset with subset_key and format.
         """
-        return {}
+        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
+
+        for sub_name, sub_data_dict in data_dict.items():
+            for split in [self.train_split, self.eval_split]:
+                if split is None:
+                    continue
+                for sample_d in sub_data_dict[split]:
+                    new_subset_name = format.format(sample_d[subset_key])
+                    if new_subset_name not in self.subset_list:
+                        continue
+                    res_dict[new_subset_name][split].append(sample_d)
+        return res_dict
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -138,7 +191,7 @@
 
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
+            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                          self.few_shot_num,
@@ -161,7 +214,7 @@
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
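Conceptually, the new reformat_subset helper regroups an already-loaded data_dict by a per-sample field, so a dataset published as a single config can still report per-subset scores. A toy illustration of the transformation (invented data, not from the package):

    # before: one 'default' subset keyed by split
    data_dict = {'default': {'test': [
        {'level': 'Level 1', 'problem': 'p1'},
        {'level': 'Level 2', 'problem': 'p2'},
    ]}}
    # after adapter.reformat_subset(data_dict, subset_key='level'),
    # with adapter.subset_list = ['Level 1', 'Level 2']:
    # {'Level 1': {'test': [{'level': 'Level 1', 'problem': 'p1'}]},
    #  'Level 2': {'test': [{'level': 'Level 2', 'problem': 'p2'}]}}

The complementary split_as_subset=True flag in load_from_hub covers the opposite case, where the hub dataset stores its logical subsets as splits (as the MuSR and ProcessBench adapters below do).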
evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -24,7 +24,7 @@ logger = get_logger()
     train_split='dev',
     eval_split='val',
     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
-)
+    query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
@@ -115,15 +115,11 @@ class GeneralMCQAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _format_example(cls, input_d: dict, include_answer=True):
-        example = '问题:' + input_d['question']
-        for choice in cls.choices:
-            if choice in input_d:
-                example += f'\n{choice}. {input_d[f"{choice}"]}'
+    def _format_example(self, input_d: dict, include_answer=True):
+        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
 
         if include_answer:
-            example += '\n答案: ' + input_d['answer'] + '\n\n'
+            return self.query_template.format(
+                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
         else:
-            example += '\n答案: '
-        return example
+            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -22,6 +22,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
+    prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num
@@ -62,10 +63,8 @@ class GeneralQAAdapter(DataAdapter):
             logger.warning('The history is not included in the prompt for GeneralQA. \
                 To be supported in the future.')
 
-        prompt = input_d.get('question', '') or input_d.get('query', '')
-
-        # if len(history) > 0:
-        #     prompt = '\n'.join(history) + '\n' + prompt
+        query = input_d.get('question', '') or input_d.get('query', '')
+        prompt = self.prompt_template.format(query=query)
         return {'data': [prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
@@ -107,7 +106,7 @@ class GeneralQAAdapter(DataAdapter):
         res.update(bleu_dict)
         return res
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
evalscope/benchmarks/gpqa/gpqa_adapter.py CHANGED
@@ -15,7 +15,7 @@ from evalscope.models import ChatGenerationModelAdapter
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
-    train_split='train',
+    train_split=None,
     eval_split='train',  # only have train split
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
evalscope/benchmarks/gsm8k/gsm8k_adapter.py CHANGED
@@ -20,7 +20,7 @@ logger = get_logger()
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
-    train_split='train',
+    train_split=None,
     eval_split='test',
     prompt_template="Question: {query}\nLet's think step by step\nAnswer:",
 )
evalscope/benchmarks/ifeval/ifeval_adapter.py CHANGED
@@ -47,7 +47,7 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
evalscope/benchmarks/math_500/math_500_adapter.py CHANGED
@@ -1,4 +1,7 @@
+from collections import defaultdict
+
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
@@ -12,7 +15,7 @@ logger = get_logger()
     name='math_500',
     dataset_id='AI-ModelScope/MATH-500',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['default'],
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
@@ -24,6 +27,12 @@ class Math500Adapter(DataAdapter):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -15,7 +15,7 @@ SUBSET_LIST = [
 
 @Benchmark.register(
     name='mmlu_pro',
-    dataset_id='modelscope/mmlu-pro',
+    dataset_id='modelscope/MMLU-Pro',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -35,41 +35,25 @@ class MMLUProAdapter(DataAdapter):
     def load(self, **kwargs):
         # default load all data
         kwargs['subset_list'] = ['default']
-        return super().load(**kwargs)
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category')
 
-    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
-        Return a dict with category as key and list of prompts as value.
-        """
-
-        data_dict = data_dict['default']  # Only one subset for MMLU-Pro
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            subset_name = entry['category']
-            if subset_name not in self.subset_list:
-                continue
-            prefix = fewshot_prompts[subset_name]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-            prompt_d = {'data': [full_prompt], 'system_prompt': self.system_prompt, AnswerKeys.RAW_INPUT: entry}
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['options']) + '\n'
 
-            res_dict[subset_name].append(prompt_d)
-        return res_dict
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
-    def get_fewshot_examples(self, data_dict: dict):
+    def format_fewshot_examples(self, few_shot_list):
         # load few-shot prompts for each category
-        prompts = {c: '' for c in self.subset_list}
-        for index, d in enumerate(data_dict[self.train_split]):
-            if index >= self.few_shot_num:
-                break
-            prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
                 self.__form_options(d['options']) + '\n' + \
                 d['cot_content'] + '\n\n'
         return prompts
evalscope/benchmarks/musr/__init__.py ADDED
File without changes
evalscope/benchmarks/musr/musr_adapter.py ADDED
@@ -0,0 +1,68 @@
+import ast
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='musr',
+    pretty_name='MuSR',
+    dataset_id='AI-ModelScope/MuSR',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.',  # noqa: E501
+)
+class MuSRAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        choices = self.format_choice(ast.literal_eval(input_d['choices']))
+
+        full_prompt = self.prompt_template.format(
+            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_choice(self, options: list):
+        option_str = ''
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return self.choices[input_d['answer_index']]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
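For orientation, MuSR stores its choices as a stringified Python list, which gen_prompt parses with ast.literal_eval and renders via format_choice; a toy example of the rendering (option texts invented for illustration):

    options = ['The gardener', 'The butler', 'The chef']
    # format_choice(options) produces:
    # (A): The gardener
    # (B): The butler
    # (C): The chef
    # parse_pred_result then relies on ResponseParser.parse_first_option to pull
    # the chosen letter back out of the model's completion.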
evalscope/benchmarks/process_bench/__init__.py ADDED
File without changes
evalscope/benchmarks/process_bench/critique_template.txt ADDED
@@ -0,0 +1,13 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
evalscope/benchmarks/process_bench/process_bench_adapter.py ADDED
@@ -0,0 +1,96 @@
+import os
+import re
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
+from evalscope.models import ChatGenerationModelAdapter
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
+@Benchmark.register(
+    name='process_bench',
+    pretty_name='ProcessBench',
+    dataset_id='Qwen/ProcessBench',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ProcessBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+
+        # register metrics
+        metric_registry.register(Metric(name='error_acc', object=mean))
+        metric_registry.register(Metric(name='correct_acc', object=mean))
+        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        problem = input_d['problem']
+        steps = input_d['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return int(input_d['label'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        pred = ProcessBenchAdapter.extract_answer(result)
+        try:
+            pred = int(pred)
+        except Exception:
+            pred = None
+        return pred
+
+    def match(self, gold: int, pred: int) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return gold == pred
+
+    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
+        reviews_list = kwargs['reviews_list']
+        error_data = []
+        correct_data = []
+        for res, raw in zip(review_res_list, reviews_list):
+            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
+                correct_data.append(res)
+            else:
+                error_data.append(res)
+        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        return super().compute_metric(data)
+
+    @staticmethod
+    def extract_answer(solution_text: str):
+        boxed_pattern = r'\\boxed\{([^}]*)\}'
+        matches = re.findall(boxed_pattern, solution_text)
+        if matches:
+            return matches[-1].strip()
+        return None
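For reference, extract_answer simply takes the last \boxed{...} occurrence in the model output, mirroring the critique template's answer format; a couple of toy calls (strings invented for illustration):

    # The regex matches a literal \boxed{...}; the last occurrence wins.
    ProcessBenchAdapter.extract_answer('the earliest error is \\boxed{3}')   # -> '3'
    ProcessBenchAdapter.extract_answer('no error found, so \\boxed{-1}')     # -> '-1'
    ProcessBenchAdapter.extract_answer('no boxed answer here')               # -> None

The paragraph index (or -1 for a fully correct solution) is then compared against the gold label, with error_acc averaged over samples whose label marks an erroneous step, correct_acc over label == -1 samples, and simple_f1_score combining the two groups.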