evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (81)
  1. evalscope/arguments.py +3 -0
  2. evalscope/benchmarks/aime/__init__.py +0 -0
  3. evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  5. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  7. evalscope/benchmarks/benchmark.py +5 -3
  8. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  11. evalscope/benchmarks/data_adapter.py +88 -29
  12. evalscope/benchmarks/data_collection/__init__.py +0 -0
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  15. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  16. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  17. evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  18. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  19. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  20. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  21. evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  22. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  23. evalscope/benchmarks/math_500/__init__.py +0 -0
  24. evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  25. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  26. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. evalscope/benchmarks/race/race_adapter.py +3 -3
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. evalscope/cli/start_app.py +4 -1
  36. evalscope/cli/start_eval.py +4 -3
  37. evalscope/cli/start_perf.py +4 -2
  38. evalscope/collections/evaluator.py +109 -39
  39. evalscope/collections/sampler.py +2 -1
  40. evalscope/collections/schema.py +1 -2
  41. evalscope/config.py +4 -1
  42. evalscope/evaluator/evaluator.py +81 -65
  43. evalscope/metrics/__init__.py +2 -1
  44. evalscope/metrics/math_parser.py +526 -0
  45. evalscope/metrics/metrics.py +39 -3
  46. evalscope/metrics/named_metrics.py +31 -7
  47. evalscope/models/base_adapter.py +7 -1
  48. evalscope/models/chat_adapter.py +69 -49
  49. evalscope/models/choice_adapter.py +52 -45
  50. evalscope/models/custom_adapter.py +2 -2
  51. evalscope/models/local_model.py +7 -2
  52. evalscope/models/server_adapter.py +106 -61
  53. evalscope/perf/__init__.py +0 -1
  54. evalscope/perf/arguments.py +5 -1
  55. evalscope/perf/http_client.py +2 -2
  56. evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope/perf/utils/benchmark_util.py +6 -2
  58. evalscope/report/app.py +42 -23
  59. evalscope/run.py +11 -8
  60. evalscope/third_party/thinkbench/__init__.py +3 -0
  61. evalscope/third_party/thinkbench/eval.py +264 -0
  62. evalscope/third_party/thinkbench/infer.py +100 -0
  63. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  64. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  65. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  66. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  67. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  68. evalscope/utils/chat_service.py +2 -2
  69. evalscope/utils/io_utils.py +1 -1
  70. evalscope/utils/model_utils.py +17 -1
  71. evalscope/utils/utils.py +45 -45
  72. evalscope/version.py +2 -2
  73. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
  74. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
  75. tests/cli/test_run.py +108 -19
  76. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  77. evalscope/metrics/math_accuracy.py +0 -200
  78. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  79. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  80. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  81. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                          choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

      # Cache and working directory arguments
      parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
@@ -70,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
      parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
      parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+     parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+     parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
      # yapf: enable
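For orientation, the new --eval-batch-size, --timeout and --stream arguments can also be exercised from Python. A minimal sketch, assuming the CLI flags map onto same-named TaskConfig fields; the model name, endpoint and dataset choice below are placeholders, not part of this diff:

# Sketch only: assumes the flags above map to same-named TaskConfig fields.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task = TaskConfig(
    model='my-served-model',                              # placeholder model name
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',
    eval_type='service',        # assumption: evaluating a remote API model
    datasets=['aime24'],        # new benchmark registered in this release
    eval_batch_size=8,          # new in 0.12.0: concurrent evaluation requests
    timeout=60,                 # new in 0.12.0: per-request timeout for the API model
    stream=True,                # new in 0.12.0: stream mode
)
run_task(task_cfg=task)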
evalscope/benchmarks/aime/__init__.py ADDED
File without changes
evalscope/benchmarks/aime/aime24_adapter.py ADDED
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='aime24',
+     dataset_id='HuggingFaceH4/aime_2024',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',  # Only train set is available
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME24Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['problem']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
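Both AIME adapters score answers the same way. A minimal sketch of that path, assuming the new math_parser helpers behave as their names suggest (extract_answer pulls the final \boxed{...} expression, strip_answer_string normalizes it, math_equal checks mathematical equivalence); the model output below is invented for illustration:

# Minimal sketch of the AIME scoring path; helper behavior is inferred from the
# adapter code above, not verified here.
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

model_output = 'Step-by-step reasoning ... so the final answer is \\boxed{204}.'
gold = strip_answer_string('204')                         # dataset 'answer' field
pred = strip_answer_string(extract_answer(model_output))  # expected: '204'
score = float(math_equal(pred, gold))                     # expected: 1.0 (AveragePass@1)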
evalscope/benchmarks/aime/aime25_adapter.py ADDED
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='aime25',
+     dataset_id='TIGER-Lab/AIME25',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',  # Only train set is available
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME25Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['question']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
      dataset_id='modelscope/ai2_arc',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=['ARC-Easy', 'ARC-Challenge'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='train',
      eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
          # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
          full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(
-                 text=result, options=self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(
-                 text=result, options=self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
      dataset_id='modelscope/bbh',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=3,
      train_split=None,
      eval_split='test',
-     prompt_template='',
+     prompt_template="Q: {query}\nA: Let's think step by step.",
  )
  class BBHAdapter(DataAdapter):
      """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
          {'data': ['xxx']}
          """
          # few_shot_list: should be ['xxxx']
-         cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
-         full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
+         if len(few_shot_list) > 0:
+             cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+         else:
+             cot_prompts = ''
+         full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}

      def gen_prompts(self, data_dict: dict) -> dict:
          """
@@ -168,18 +171,15 @@ class BBHAdapter(DataAdapter):
              prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
              res_dict[sub_name].append(prompt_d)

-         rnd = random.Random()
-         rnd.seed(42)
-         for k, v in res_dict.items():
-             rnd.shuffle(v)
-
          return res_dict

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
-         gold = input_d.get('target')
+         gold = input_d.get('target', '')
+         # remove brackets
          if gold is None:
              logger.error(f'BBHAdapter: gold is None.')
+         gold = gold.replace('(', '').replace(')', '')
          return gold

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +228,11 @@ class BBHAdapter(DataAdapter):
          """
          Extract the answer from the model output for Free-form task.
          """
-         res = ResponseParser.parse_first_option(ans)
-         if res:
+         pattern = r'answer is\s+(.*?)\.'
+
+         match = re.search(pattern, ans)
+         if match:
+             res = match.group(1)
              return res

          ans_line = ans.split('answer is ')
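The free-form extraction now uses a plain regular expression over the phrase "answer is ..." instead of the option parser. A standalone illustration (the sample response is invented):

import re

# Same pattern the updated BBHAdapter applies to free-form answers.
pattern = r'answer is\s+(.*?)\.'
response = "Let's think step by step. The trip takes 2 + 4 = 6 hours. So the answer is 6 hours."
match = re.search(pattern, response)
print(match.group(1) if match else None)  # prints: 6 hours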
evalscope/benchmarks/benchmark.py CHANGED
@@ -17,12 +17,15 @@ class BenchmarkMeta:
      data_adapter: 'DataAdapter'
      model_adapter: BaseModelAdapter
      subset_list: List[str] = field(default_factory=list)
-     metric_list: List[dict] = field(default_factory=list)
+     metric_list: List[str] = field(default_factory=list)
      few_shot_num: int = 0
      few_shot_random: bool = False
      train_split: Optional[str] = None
      eval_split: Optional[str] = None
      prompt_template: Optional[str] = None
+     system_prompt: Optional[str] = None
+     query_template: Optional[str] = None
+     pretty_name: Optional[str] = None

      def _update(self, args: dict):
          if args.get('local_path'):
@@ -40,7 +43,6 @@ class BenchmarkMeta:
          # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
          del cur_dict['data_adapter']
          del cur_dict['model_adapter']
-         del cur_dict['metric_list']
          return cur_dict

      def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -59,7 +61,7 @@ class Benchmark:
      @classmethod
      def get(cls, name: str) -> 'BenchmarkMeta':
          if name not in BENCHMARK_MAPPINGS:
-             raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+             raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
          benchmark = BENCHMARK_MAPPINGS[name]
          return benchmark
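For reference, the new BenchmarkMeta fields are consumed through the same @Benchmark.register decorator used by the adapters above. A hypothetical registration sketch; everything named 'my_*' is a placeholder, and the remaining abstract DataAdapter methods are omitted:

# Hypothetical registration showing the 0.12.0-era fields; names and prompts
# are placeholders, not part of the release.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_bench',                               # placeholder
    pretty_name='My Benchmark',                    # new field: display name
    dataset_id='my-org/my-bench',                  # placeholder
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=['AverageAccuracy'],               # metrics are now referenced by name (str), not object
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='{query}',
    system_prompt='You are a helpful assistant.',  # new field: separate from prompt_template
)
class MyBenchAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        full_prompt = self.prompt_template.format(query=input_d['question'])
        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

    # get_gold_answer / parse_pred_result / match omitted for brevity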
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -4,10 +4,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.metrics.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/ceval-exam',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='dev',
      eval_split='val',
+     prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
          else:
              context = ''

-         full_prompt: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+         query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

          subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
+         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -5,9 +5,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/cmmlu',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='dev',
      eval_split='test',
+     prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
          {'data': [(context, continuation), ...]}

          """
-         prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         context: str = '\n'.join(few_shot_prompts) + '\n'
+         context = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         context = prompt + context

-         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
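The C-Eval and CMMLU adapters above now build their prompts from the registered prompt_template via str.format instead of a hard-coded instruction string. A small illustration of that formatting step; subject_name and query are placeholders standing in for the SUBJECT_MAPPING entry and the formatted few-shot context plus question:

# Illustration of the templated prompt assembly; values are placeholders.
prompt_template = '以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}'
subject_name = 'computer science'                           # placeholder subject
query = '<few-shot examples>\n<question with choices A-D>'  # placeholder query
full_prompt = prompt_template.format(subset_name=subject_name, query=query)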
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -3,10 +3,11 @@
  import glob
  import json
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
+ from evalscope.constants import AnswerKeys
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
      name='competition_math',
      dataset_id='modelscope/competition_math',
      model_adapter=ChatGenerationModelAdapter,
-     subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     metric_list=['AveragePass@1'],
      few_shot_num=4,
-     train_split='train',
+     train_split=None,
      eval_split='test',
-     prompt_template='Put the final answer in \\boxed{}.',
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class CompetitionMathAdapter(DataAdapter):
      """ To be tested for all models. """
@@ -39,8 +40,14 @@ class CompetitionMathAdapter(DataAdapter):

          super().__init__(**kwargs)

+     def load(self, **kwargs):
+         # default load all levels
+         kwargs['subset_list'] = ['default']
+         data_dict = super().load(**kwargs)
+         return self.reformat_subset(data_dict, subset_key='level')
+
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict: dict = {}
+         data_dict = defaultdict(dict)
          for subset_name in subset_list:
              for split_name in [self.train_split, self.eval_split]:
                  if os.path.exists(dataset_name_or_path):
@@ -53,10 +60,7 @@ class CompetitionMathAdapter(DataAdapter):
                  if os.path.exists(file_path):
                      with open(file_path, 'r') as f:
                          split_data.append(json.load(f))
-                 if subset_name in data_dict:
-                     data_dict[subset_name].update({split_name: split_data})
-                 else:
-                     data_dict[subset_name] = {split_name: split_data}
+                 data_dict[subset_name][split_name] = split_data

          return data_dict

@@ -75,13 +79,13 @@ class CompetitionMathAdapter(DataAdapter):
          {'data': [prompt]}
          """
          use_fewshot = self.few_shot_num > 0
-         full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+         full_prompt = self.prompt_template.format(query=query)
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Extract the gold answer from the input dict.
-         return remove_boxed(last_boxed_only_string(input_d['solution']))
+         return strip_answer_string(extract_answer(input_d['solution']))

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
          """
@@ -96,18 +100,11 @@ class CompetitionMathAdapter(DataAdapter):
          The parsed answer. Depending on the dataset. Usually a string for chat.
          """
          # Note: Use same extraction method for both of checkpoint/service/custom
-         try:
-             result = remove_boxed(last_boxed_only_string(result))
-         except Exception:
-             return None
+         result = strip_answer_string(extract_answer(result))
          return result

      def match(self, gold: str, pred: str) -> float:
-         res = 0
-         if is_equiv(pred, gold):
-             res = 1
-
-         return res
+         return math_equal(pred, gold)

      @classmethod
      def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
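The load_from_disk change above replaces the manual check-and-merge branch with collections.defaultdict. A standalone illustration of why the two forms are equivalent (the sample subset, split, and data are invented):

from collections import defaultdict

subset_name, split_name, split_data = 'Level 1', 'test', [{'problem': '...'}]

# Before: explicit check-and-merge per subset
old_dict = {}
if subset_name in old_dict:
    old_dict[subset_name].update({split_name: split_data})
else:
    old_dict[subset_name] = {split_name: split_data}

# After: defaultdict(dict) creates the inner dict on first access
new_dict = defaultdict(dict)
new_dict[subset_name][split_name] = split_data

assert dict(new_dict) == old_dict  # both yield {'Level 1': {'test': [...]}}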