evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  22. evalscope/benchmarks/math_500/__init__.py +0 -0
  23. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  26. evalscope/benchmarks/race/race_adapter.py +3 -3
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  29. evalscope/collections/evaluator.py +103 -39
  30. evalscope/collections/sampler.py +2 -1
  31. evalscope/collections/schema.py +1 -2
  32. evalscope/config.py +1 -0
  33. evalscope/evaluator/evaluator.py +78 -64
  34. evalscope/metrics/math_parser.py +526 -0
  35. evalscope/metrics/metrics.py +16 -1
  36. evalscope/metrics/named_metrics.py +31 -7
  37. evalscope/models/chat_adapter.py +69 -49
  38. evalscope/models/choice_adapter.py +52 -45
  39. evalscope/models/custom_adapter.py +2 -2
  40. evalscope/models/local_model.py +4 -0
  41. evalscope/models/server_adapter.py +28 -34
  42. evalscope/report/app.py +30 -15
  43. evalscope/run.py +10 -7
  44. evalscope/utils/chat_service.py +2 -2
  45. evalscope/utils/io_utils.py +1 -1
  46. evalscope/version.py +2 -2
  47. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
  48. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
  49. tests/cli/test_run.py +93 -16
  50. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  51. evalscope/metrics/math_accuracy.py +0 -200
  52. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  53. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  54. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  55. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py (new file)
@@ -0,0 +1,129 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import csv
+ import os
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics.metrics import exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='general_mcq',
+     dataset_id='general_mcq',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['default'],
+     metric_list=['AverageAccuracy'],
+     few_shot_num=0,
+     train_split='dev',
+     eval_split='val',
+     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
+ )
+ class GeneralMCQAdapter(DataAdapter):
+
+     choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+         data_dict = {}
+         for subset_name in subset_list:
+             for split_name in [self.train_split, self.eval_split]:
+                 if os.path.exists(dataset_name_or_path):
+                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
+                 else:
+                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
+                 if os.path.exists(file_path):
+                     with open(file_path, encoding='utf-8') as f:
+                         rows = []
+                         reader = csv.reader(f)
+                         header = next(reader)
+                         for row in reader:
+                             item = dict(zip(header, row))
+                             rows.append(item)
+
+                         if subset_name in data_dict:
+                             data_dict[subset_name].update({split_name: rows})
+                         else:
+                             data_dict[subset_name] = {split_name: rows}
+
+         return data_dict
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
+
+         Args:
+             input_d (dict): The raw input. A single data format of the C-Eval:
+
+             {'id': 0,
+              'question': '下列关于税法基本原则的表述中,不正确的是____。',
+              'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
+              'B': '税收公平原则源于法律上的平等性原则',
+              'C': '税收效率原则包含经济效率和行政效率两个方面',
+              'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
+              'answer': 'D'}
+
+         Returns:
+             {'data': ['prompt ...']}
+         """
+
+         few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+         if len(few_shot_prompts) > 0:
+             context: str = '\n'.join(few_shot_prompts) + '\n'
+         else:
+             context = ''
+         context = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+
+         full_prompt = self.prompt_template.format(query=context)
+
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Get the gold choice
+         return input_d.get('answer', '')
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+
+         Args:
+             result: Predicted answer from the model. Usually a string for chat.
+             raw_input_d (dict): The raw input. Depending on the dataset.
+             eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+
+         Returns:
+             The parsed answer. Depending on the dataset. Usually a string for chat.
+         """
+         if eval_type == EvalType.CHECKPOINT:
+             return result
+         elif eval_type == EvalType.SERVICE:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
+         elif eval_type == EvalType.CUSTOM:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
+         else:
+             raise ValueError(f'Invalid eval_type: {eval_type}')
+
+     def match(self, gold: str, pred: str) -> float:
+         return exact_match(gold=gold, pred=pred)
+
+     @classmethod
+     def _format_example(cls, input_d: dict, include_answer=True):
+         example = '问题:' + input_d['question']
+         for choice in cls.choices:
+             if choice in input_d:
+                 example += f'\n{choice}. {input_d[f"{choice}"]}'
+
+         if include_answer:
+             example += '\n答案: ' + input_d['answer'] + '\n\n'
+         else:
+             example += '\n答案: '
+         return example
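The new general_mcq adapter reads per-subset CSV files named {subset}_{split}.csv (a dev split for few-shot examples, a val split for evaluation) whose header provides question, one column per choice letter, and answer. A minimal sketch of how one loaded row becomes a prompt, mirroring _format_example and the registered prompt_template above; the row values are made up:

    # One row as load_from_disk() would produce it from e.g. default_val.csv
    # (column names follow the adapter code above; the values are illustrative).
    row = {'id': '1', 'question': 'Which planet is known as the Red Planet?',
           'A': 'Venus', 'B': 'Mars', 'C': 'Jupiter', 'D': 'Saturn', 'answer': 'B'}

    # Mirror GeneralMCQAdapter._format_example(include_answer=False)
    example = '问题:' + row['question']
    for choice in 'ABCD':
        example += f'\n{choice}. {row[choice]}'
    example += '\n答案: '

    # Apply the registered prompt_template
    print('请回答问题,并选出其中的正确答案\n{query}'.format(query=example))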
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -5,7 +5,7 @@ from collections import defaultdict
  from typing import List
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
+ from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -18,7 +18,7 @@ logger = get_logger()
      dataset_id='general_qa',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
-     metric_list=[AverageBLEU],
+     metric_list=['AverageBLEU'],
      few_shot_num=0,
      train_split=None,
      eval_split='test',
@@ -30,16 +30,16 @@ class GeneralQAAdapter(DataAdapter):
 
          super().__init__(**kwargs)
 
-     def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:
+     def load(self, **kwargs) -> dict:
 
-         data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
+         data_file_list = glob.glob(os.path.join(self.dataset_id, '*.jsonl'))
          data_list = []
 
          try:
             for file_path in data_file_list:
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
-             raise ValueError(f'Failed to load data from {dataset_name_or_path}, got error: {e}')
+             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
          data_dict = {'default': {'test': data_list}}
 
@@ -66,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):
 
          # if len(history) > 0:
          #     prompt = '\n'.join(history) + '\n' + prompt
-         return {'data': [prompt], 'system_prompt': self.prompt_template}
+         return {'data': [prompt], 'system_prompt': self.system_prompt}
 
      def get_gold_answer(self, input_d: dict) -> str:
          """
evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -3,10 +3,9 @@ import random
  import re
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import Pass1, exact_match
+ from evalscope.constants import EvalType
+ from evalscope.metrics import exact_match
  from evalscope.models import ChatGenerationModelAdapter
- from evalscope.utils.utils import ResponseParser
 
 
  @Benchmark.register(
@@ -14,11 +13,11 @@ from evalscope.utils.utils import ResponseParser
      dataset_id='modelscope/gpqa',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
-     metric_list=[Pass1],
+     metric_list=['AveragePass@1'],
      few_shot_num=5,
      train_split='train',
      eval_split='train',  # only have train split
-     prompt_template='',
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class GPQAAdapter(DataAdapter):
 
@@ -48,9 +47,10 @@ class GPQAAdapter(DataAdapter):
          """  # noqa: E501
          processed_input_d = self.__process_input(input_d)
          input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
-         prompt = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}Let's think step by step: "  # noqa: E501
+         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
-         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         prompt = self.prompt_template.format(query=query)
+         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
      def __process_input(self, input_d: dict) -> dict:
 
@@ -94,10 +94,28 @@ class GPQAAdapter(DataAdapter):
          """
          Parse the predicted result and extract proper answer.
          """
-         return ResponseParser.parse_first_option_with_choices(result, self.choices)
+         return GPQAAdapter.get_multiple_choice_answer(result)
 
      def match(self, gold: str, pred: str) -> float:
          """
          Match the gold answer and the predicted answer.
          """
          return exact_match(gold=gold, pred=pred)
+
+     @staticmethod
+     def get_multiple_choice_answer(pred: str):
+         tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
+         if tmp:
+             pred = tmp
+         else:
+             pred = [pred.strip().strip('.')]
+
+         if len(pred) == 0:
+             pred = ''
+         else:
+             pred = pred[-1]
+
+         # Remove the period at the end, again!
+         pred = pred.rstrip('.').rstrip('/')
+
+         return pred
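The GPQA adapter now extracts the predicted letter with its own static method instead of ResponseParser. A condensed sketch of the same logic, run on a few made-up model outputs, shows that the last standalone A-D token wins and non-matching text falls through unchanged:

    import re

    def get_multiple_choice_answer(pred: str) -> str:
        # Condensed version of GPQAAdapter.get_multiple_choice_answer above.
        tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
        pred = tmp if tmp else [pred.strip().strip('.')]
        return (pred[-1] if pred else '').rstrip('.').rstrip('/')

    print(get_multiple_choice_answer('The answer is (C).'))       # -> C
    print(get_multiple_choice_answer('Maybe B, but I choose D'))  # -> D (last match)
    print(get_multiple_choice_answer('no idea'))                  # -> no idea (unchanged)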
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,6 @@ import os
  import re
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -19,11 +18,11 @@ logger = get_logger()
      dataset_id='modelscope/gsm8k',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['main'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=4,
      train_split='train',
      eval_split='test',
-     prompt_template='',
+     prompt_template="Question: {query}\nLet's think step by step\nAnswer:",
  )
  class GSM8KAdapter(DataAdapter):
 
@@ -73,10 +72,11 @@ class GSM8KAdapter(DataAdapter):
          }
          """
          use_fewshot = self.few_shot_num > 0
+         context = self._generate_prompt(use_fewshot=use_fewshot)
 
-         full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
+         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
      def get_gold_answer(self, input_d: dict) -> str:
          # Extract the gold answer from the input dict.
@@ -123,7 +123,7 @@ class GSM8KAdapter(DataAdapter):
          return number_equal(gold_ans=gold, pred_ans=pred)
 
      @classmethod
-     def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
+     def _generate_prompt(cls, use_fewshot: bool = True) -> str:
          if use_fewshot:
              # Use 4-shot examples by system
              context = (
@@ -135,14 +135,9 @@ class GSM8KAdapter(DataAdapter):
              "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
              "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
              'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n'
-             f"Question: {input_d['question']}\nLet's think step by step\nAnswer:")
-             # context = input_d['question']
-             # fewshot_prompts = ['Question: ' + item_d['question'] + '\nAnswer: ' + item_d['answer'] for item_d in few_shot_list]
-             # fewshot_prompts = fewshot_prompts + ['Question: ' + context + '\nAnswer:']
-             # context = '\n\n'.join(fewshot_prompts)
+             )
          else:
-             context = input_d['question']
-             context = 'Question: ' + context + '\nAnswer:'
+             context = ''
          return context
 
      @staticmethod
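GSM8K prompt construction is now split in two: _generate_prompt() returns only the fixed few-shot context (empty when few_shot_num is 0), and the registered prompt_template is applied to the current question. A sketch of the assembly with a made-up question; the real context holds the four worked examples hard-coded above:

    prompt_template = "Question: {query}\nLet's think step by step\nAnswer:"

    # Stand-in for the hard-coded 4-shot context returned by _generate_prompt().
    few_shot_context = 'Question: ...worked example 1...\n\n...worked example 4...\n\n'
    question = 'A farm has 12 cows and each cow gives 8 liters of milk a day. How many liters per day?'

    full_prompt = few_shot_context + prompt_template.format(query=question)
    print(full_prompt)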
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -5,7 +5,7 @@ import re
 
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import ContinuationLogitsModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -21,7 +21,7 @@ logger = get_logger()
      dataset_id='modelscope/hellaswag',
      model_adapter=ContinuationLogitsModelAdapter,
      subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='train',
      eval_split='validation',
@@ -89,11 +89,7 @@
 
          ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-         return {
-             'data': ctx_continuation_pair_list,
-             'multi_choices': self.choices,
-             'system_prompt': self.prompt_template
-         }
+         return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -2,7 +2,6 @@
  import re
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Pass1
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
 
@@ -17,11 +16,11 @@ logger = get_logger()
      dataset_id='modelscope/humaneval',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['openai_humaneval'],
-     metric_list=[Pass1],
+     metric_list=['Pass@1'],
      few_shot_num=0,
      train_split=None,
      eval_split='test',
-     prompt_template='',
+     prompt_template='Complete the following python code:\n{query}',
  )
  class HumanevalAdapter(DataAdapter):
      """
@@ -64,10 +63,10 @@ class HumanevalAdapter(DataAdapter):
              input_d (dict): The raw input. A single data format of the Humaneval:
              {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
          """
-         full_prompt = input_d['prompt']
-         full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
+         query = input_d['prompt']
+         full_prompt = self.prompt_template.format(query=query)
 
-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
      @classmethod
      def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -2,9 +2,9 @@ from collections import defaultdict
  from typing import Any, Dict, List
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+ from evalscope.benchmarks.ifeval.utils import process_results
  from evalscope.constants import EvalType
- from evalscope.metrics import Metric, mean
+ from evalscope.metrics import Metric, mean, metric_registry
  from evalscope.models import ChatGenerationModelAdapter
 
 
@@ -14,10 +14,10 @@ from evalscope.models import ChatGenerationModelAdapter
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
      metric_list=[
-         Metric(name='prompt_level_strict_acc', object=mean),
-         Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
-         Metric(name='prompt_level_loose_acc', object=mean),
-         Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+         'prompt_level_strict_acc',
+         'inst_level_strict_acc',
+         'prompt_level_loose_acc',
+         'inst_level_loose_acc',
      ],
      few_shot_num=0,
      train_split=None,
@@ -29,8 +29,14 @@ class IFEvalAdapter(DataAdapter):
      def __init__(self, **kwargs):
          super().__init__(**kwargs)
 
+         # register metrics
+         metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
+         metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
+         metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
+         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+
      def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-         return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+         return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
 
      def get_gold_answer(self, input_d: dict) -> str:
          return input_d
@@ -48,9 +54,4 @@ class IFEvalAdapter(DataAdapter):
          for k, v in res.items():
              res_dict[k].append(v)
 
-         metrics = []
-         for metric in self.metric_list:
-             metric_name = metric.name
-             pred_value = res_dict[metric_name]
-             metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
-         return metrics
+         return super().compute_metric(res_dict)
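Benchmarks now list metrics by name and resolve them through the new metric_registry (see evalscope/metrics/named_metrics.py in the file list); IFEvalAdapter registers its four metrics at construction time and delegates aggregation to DataAdapter.compute_metric. A sketch of registering an additional metric the same way; the metric name and aggregation function below are illustrative, not part of this release:

    from evalscope.metrics import Metric, mean, metric_registry

    def trimmed_mean(values):
        # Illustrative aggregation: drop the best and worst value, then average.
        values = sorted(values)
        return mean(values[1:-1]) if len(values) > 2 else mean(values)

    metric_registry.register(Metric(name='trimmed_mean_acc', object=trimmed_mean))
    # A benchmark could then reference it by name, e.g. metric_list=['trimmed_mean_acc'].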
evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.constants import EvalType
+ from evalscope.metrics import exact_match
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.utils import ResponseParser
 
@@ -10,11 +10,11 @@ from evalscope.utils.utils import ResponseParser
      dataset_id='AI-ModelScope/IQuiz',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['IQ', 'EQ'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split=None,
      eval_split='test',
-     prompt_template='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
+     system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
  )
  class IQuizAdapter(DataAdapter):
 
@@ -36,7 +36,7 @@ class IQuizAdapter(DataAdapter):
          """
          prompt = f"问题: {input_d['question']}\n"
          prompt += self.__form_options(input_d['choices'])
-         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
      def __form_options(self, options: list):
          option_str = '选项:\n'
evalscope/benchmarks/math_500/math_500_adapter.py (new file)
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='math_500',
+     dataset_id='AI-ModelScope/MATH-500',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class Math500Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['problem']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
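Math500Adapter scores a completion by pulling the final answer out of the step-by-step response, normalizing it, and comparing it against the normalized gold string, all via the new evalscope/metrics/math_parser.py helpers. A sketch of that path on a made-up response; the expected output assumes extract_answer picks up the \boxed{} value, as the prompt template requests:

    from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

    response = 'First compute 3 * 4 = 12, then add 2 to get 14.\nThe final answer is \\boxed{14}.'
    gold = strip_answer_string('14')

    pred = strip_answer_string(extract_answer(response))
    print(math_equal(pred, gold))  # expected: True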
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -4,17 +4,15 @@ import os
 
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
 
  logger = get_logger()
 
- DATASET_ID = 'modelscope/mmlu'
-
  SUBSET_LIST = [
      'high_school_european_history',
      'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/mmlu',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='train',
      eval_split='test',
-     prompt_template='',
+     prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
  )
  class MMLUAdapter(DataAdapter):
 
@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
          {'data': [full_prompt], 'multi_choices': self.choices}
 
          """
-         prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-             self._format_subject(subset_name))
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
          context: str = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         context = prompt + context
+         query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
 
-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
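For MMLU the subject header is no longer concatenated by hand in gen_prompt; the few-shot block plus the current question become query, and the registered prompt_template injects the formatted subject name. A short sketch with made-up content:

    prompt_template = 'The following are multiple choice questions (with answers) about {subset_name}. \n{query}'

    few_shot_block = 'What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6\nAnswer: B\n'
    current_question = 'What is 3 * 3?\nA. 6\nB. 8\nC. 9\nD. 12\nAnswer:'

    query = few_shot_block + current_question
    full_prompt = prompt_template.format(subset_name='elementary mathematics', query=query)
    print(full_prompt)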
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
@@ -3,22 +3,27 @@ from typing import Any, Dict
 
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.utils import ResponseParser
 
+ SUBSET_LIST = [
+     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+     'philosophy', 'economics', 'other', 'psychology', 'history'
+ ]
+
 
  @Benchmark.register(
      name='mmlu_pro',
      dataset_id='modelscope/mmlu-pro',
      model_adapter=ChatGenerationModelAdapter,
-     subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     subset_list=SUBSET_LIST,
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='validation',
      eval_split='test',
      prompt_template=
-     'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.',  # noqa: E501
+     'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
  )
  class MMLUProAdapter(DataAdapter):
 
@@ -26,10 +31,11 @@ class MMLUProAdapter(DataAdapter):
          super().__init__(**kwargs)
 
          self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-         self.categories = [
-             'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-             'philosophy', 'economics', 'other', 'psychology', 'history'
-         ]
+
+     def load(self, **kwargs):
+         # default load all data
+         kwargs['subset_list'] = ['default']
+         return super().load(**kwargs)
 
      def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
          """
@@ -37,26 +43,32 @@ class MMLUProAdapter(DataAdapter):
          Return a dict with category as key and list of prompts as value.
          """
 
-         data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
+         data_dict = data_dict['default']  # Only one subset for MMLU-Pro
          fewshot_prompts = self.get_fewshot_examples(data_dict)
 
          # Use the category as key to group the prompts
          res_dict = defaultdict(list)
          # generate prompts for each test sample
          for entry in data_dict[self.eval_split]:
-             prefix = fewshot_prompts[entry['category']]
+             subset_name = entry['category']
+             if subset_name not in self.subset_list:
+                 continue
+             prefix = fewshot_prompts[subset_name]
              query = prefix + 'Q: ' + entry['question'] + '\n' + \
                  self.__form_options(entry['options']) + '\n'
 
-             prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
+             full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+             prompt_d = {'data': [full_prompt], 'system_prompt': self.system_prompt, AnswerKeys.RAW_INPUT: entry}
 
-             res_dict[entry['category']].append(prompt_d)
+             res_dict[subset_name].append(prompt_d)
          return res_dict
 
      def get_fewshot_examples(self, data_dict: dict):
-         # load 5-shot prompts for each category
-         prompts = {c: '' for c in self.categories}
-         for d in data_dict[self.train_split]:
+         # load few-shot prompts for each category
+         prompts = {c: '' for c in self.subset_list}
+         for index, d in enumerate(data_dict[self.train_split]):
+             if index >= self.few_shot_num:
+                 break
              prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
                  self.__form_options(d['options']) + '\n' + \
                  d['cot_content'] + '\n\n'