evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89)
  1. evalscope/arguments.py +3 -1
  2. evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
  6. evalscope/benchmarks/benchmark.py +12 -10
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
  10. evalscope/benchmarks/data_adapter.py +82 -19
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
  23. evalscope/benchmarks/musr/__init__.py +0 -0
  24. evalscope/benchmarks/musr/musr_adapter.py +71 -0
  25. evalscope/benchmarks/process_bench/__init__.py +0 -0
  26. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  27. evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
  28. evalscope/benchmarks/race/race_adapter.py +12 -16
  29. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  30. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  31. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  32. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  33. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  34. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  35. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  36. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  37. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
  38. evalscope/benchmarks/utils.py +43 -0
  39. evalscope/cli/start_app.py +4 -1
  40. evalscope/cli/start_eval.py +4 -3
  41. evalscope/cli/start_perf.py +4 -2
  42. evalscope/collections/evaluator.py +16 -1
  43. evalscope/config.py +13 -3
  44. evalscope/constants.py +7 -0
  45. evalscope/evaluator/evaluator.py +3 -1
  46. evalscope/metrics/__init__.py +2 -1
  47. evalscope/metrics/metrics.py +23 -2
  48. evalscope/metrics/named_metrics.py +1 -0
  49. evalscope/models/__init__.py +2 -1
  50. evalscope/models/base_adapter.py +32 -6
  51. evalscope/models/chat_adapter.py +4 -1
  52. evalscope/models/choice_adapter.py +4 -0
  53. evalscope/models/custom_adapter.py +2 -0
  54. evalscope/models/local_model.py +3 -2
  55. evalscope/models/register.py +28 -0
  56. evalscope/models/server_adapter.py +107 -29
  57. evalscope/perf/__init__.py +0 -1
  58. evalscope/perf/arguments.py +18 -8
  59. evalscope/perf/http_client.py +8 -6
  60. evalscope/perf/plugin/api/openai_api.py +11 -1
  61. evalscope/perf/utils/analysis_result.py +1 -1
  62. evalscope/perf/utils/benchmark_util.py +6 -2
  63. evalscope/report/app.py +15 -8
  64. evalscope/report/combinator.py +2 -2
  65. evalscope/run.py +6 -5
  66. evalscope/third_party/thinkbench/__init__.py +3 -0
  67. evalscope/third_party/thinkbench/eval.py +429 -0
  68. evalscope/third_party/thinkbench/infer.py +130 -0
  69. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  70. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  71. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  73. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  74. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  75. evalscope/utils/chat_service.py +1 -0
  76. evalscope/utils/filters.py +59 -0
  77. evalscope/utils/logger.py +3 -3
  78. evalscope/utils/model_utils.py +17 -1
  79. evalscope/utils/utils.py +45 -45
  80. evalscope/version.py +2 -2
  81. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
  82. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
  83. tests/cli/test_collection.py +1 -1
  84. tests/cli/test_run.py +151 -32
  85. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  86. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  87. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  88. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  89. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/data_adapter.py
@@ -2,8 +2,10 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Any, List, Optional, Union
 
+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -17,6 +19,7 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
@@ -24,6 +27,8 @@ class DataAdapter(ABC):
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
                  system_prompt: Optional[str] = None,
+                 query_template: Optional[str] = None,
+                 pretty_name: Optional[str] = None,
                  **kwargs):
        """
        Data Adapter for the benchmark. You need to implement the following methods:
@@ -45,6 +50,7 @@ class DataAdapter(ABC):
        """
        self.name = name
        self.dataset_id = dataset_id
+       self.model_adapter = model_adapter
        self.subset_list = subset_list
        self.metric_list = metric_list
        self.few_shot_num = few_shot_num
@@ -52,14 +58,24 @@ class DataAdapter(ABC):
        self.eval_split = eval_split
        self.prompt_template = prompt_template
        self.system_prompt = system_prompt
+       self.query_template = query_template
+       self.pretty_name = pretty_name
        self.config_kwargs = kwargs
        self.category_map = kwargs.get('category_map', {})
+       self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)
 
    def load(self,
             dataset_name_or_path: str = None,
             subset_list: list = None,
             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
             **kwargs) -> dict:
        """
        Load the dataset. Remote and local datasets are supported.
@@ -74,22 +90,43 @@ class DataAdapter(ABC):
 
        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
-            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-                subsets: {subset_list}')
-            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=False, **kwargs)
        else:
-            from modelscope.msdatasets import MsDataset
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=True, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
+        return data_dict
+
+    def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
+        from modelscope.msdatasets import MsDataset
 
-            # Load dataset from remote
-            logger.info(
-                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
-            data_dict = {}
-            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-            if len(split_list) == 0:
-                logger.error(f'Got empty split list: {split_list}')
+        datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
+        split_as_subset: bool = kwargs.pop('split_as_subset', False)
+        # Load dataset from remote
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
+        data_dict = {}
+        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+        if len(split_list) == 0:
+            logger.error(f'Got empty split list: {split_list}')
+
+        if split_as_subset:
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = MsDataset.load(
+                        dataset_name=dataset_name_or_path,
+                        split=sub_name,  # load subset from split
+                        cache_dir=work_dir,
+                        hub=datasets_hub,
+                        **kwargs)
+                    data_dict[sub_name].update({split: dataset})
+        else:
            for sub_name in subset_list:
                data_dict[sub_name] = {}
                # e.g. train: few-shot, test: target dataset to evaluate
@@ -101,17 +138,34 @@ class DataAdapter(ABC):
                        cache_dir=work_dir,
                        hub=datasets_hub,
                        **kwargs)
-
                    data_dict[sub_name].update({split: dataset})
 
        return data_dict
 
-    def load_from_disk(self, *args, **kwargs) -> dict:
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        """
        Load the dataset from local disk.
        If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+        Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
+        """
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
+
+    def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
        """
-        return {}
+        Reformat the dataset subset with subset_key and format.
+        """
+        res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
+
+        for sub_name, sub_data_dict in data_dict.items():
+            for split in [self.train_split, self.eval_split]:
+                if split is None:
+                    continue
+                for sample_d in sub_data_dict[split]:
+                    new_subset_name = format.format(sample_d[subset_key])
+                    if new_subset_name not in self.subset_list:
+                        continue
+                    res_dict[new_subset_name][split].append(sample_d)
+        return res_dict
 
    def gen_prompts(self, data_dict: dict) -> dict:
        """
@@ -138,7 +192,7 @@ class DataAdapter(ABC):
 
        for sub_name, sub_data_dict in data_dict.items():
            few_shot_data = []
-            if self.few_shot_num and self.few_shot_num > 0:
+            if self.train_split and self.few_shot_num and self.few_shot_num > 0:
                few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
                few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
                                                          self.few_shot_num,
@@ -161,7 +215,7 @@ class DataAdapter(ABC):
        else:
            return data_list[:k]
 
-    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
        """
        Compute evaluation result by specific metrics.
 
@@ -232,6 +286,12 @@ class DataAdapter(ABC):
        kwargs['metric_list'] = self.metric_list
        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        """
        Generate model prompt from raw input, unify the prompt format for different datasets.
@@ -295,3 +355,6 @@ class DataAdapter(ABC):
            The match result. Usually a score (float) for chat/multiple-choice-questions.
        """
        raise NotImplementedError
+
+    def llm_match(self, *args, **kwargs):
+        pass
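
For context, a minimal standalone sketch of the __init_subclass__ hook added above: every subclass's parse_pred_result is wrapped automatically at class-creation time. The preprocess_decorator body below is a stand-in for illustration only; the real one lives in evalscope/benchmarks/utils.py and is not shown in this diff.

import functools


def preprocess_decorator(func):
    # stand-in: assume the real decorator normalizes the raw prediction first
    @functools.wraps(func)
    def wrapper(self, result, *args, **kwargs):
        if isinstance(result, str):
            result = result.strip()
        return func(self, result, *args, **kwargs)
    return wrapper


class BaseAdapter:
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # mirror DataAdapter: wrap parse_pred_result on every subclass
        if hasattr(cls, 'parse_pred_result'):
            cls.parse_pred_result = preprocess_decorator(cls.parse_pred_result)


class MyAdapter(BaseAdapter):
    def parse_pred_result(self, result):
        return result.upper()


print(MyAdapter().parse_pred_result('  a  '))  # -> 'A'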

evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -15,7 +15,6 @@ logger = get_logger()
 @Benchmark.register(
     name='data_collection',
     dataset_id='', # dataset_id need to be set
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,

evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -16,22 +15,24 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
     prompt_template='请回答问题,并选出其中的正确答案\n{query}',
-)
+    query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(query=context)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,27 +104,19 @@ class GeneralMCQAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _format_example(cls, input_d: dict, include_answer=True):
-        example = '问题:' + input_d['question']
-        for choice in cls.choices:
-            if choice in input_d:
-                example += f'\n{choice}. {input_d[f"{choice}"]}'
+    def _format_example(self, input_d: dict, include_answer=True):
+        choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
 
         if include_answer:
-            example += '\n答案: ' + input_d['answer'] + '\n\n'
+            return self.query_template.format(
+                question=input_d['question'], choices=choices_str, answer=input_d['answer'])
         else:
-            example += '\n答案: '
-        return example
+            return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
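
For context, a standalone illustration of how the new query_template and _format_example fit together; the sample item below is invented.

query_template = '问题:{question}\n{choices}\n答案: {answer}\n\n'

item = {'question': '1+1等于几?', 'A': '1', 'B': '2', 'C': '3', 'answer': 'B'}
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

choices_str = '\n'.join(f'{c}. {item[c]}' for c in choices if c in item)

# few-shot example keeps the answer
print(query_template.format(question=item['question'], choices=choices_str, answer=item['answer']))
# the target question drops it and strips the trailing whitespace
print(query_template.format(question=item['question'], choices=choices_str, answer='').rstrip())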

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -16,12 +16,12 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
+    prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num
@@ -30,18 +30,31 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
 
-        data_file_list = glob.glob(os.path.join(self.dataset_id, '*.jsonl'))
+        data_file_dict = defaultdict(str)
         data_list = []
 
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in data_file_list:
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {'default': {'test': data_list}}
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
@@ -62,11 +75,9 @@ class GeneralQAAdapter(DataAdapter):
         logger.warning('The history is not included in the prompt for GeneralQA. \
             To be supported in the future.')
 
-        prompt = input_d.get('question', '') or input_d.get('query', '')
-
-        # if len(history) > 0:
-        #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt], 'system_prompt': self.system_prompt}
+        query = input_d.get('question', '') or input_d.get('query', '')
+        prompt = self.prompt_template.format(query=query)
+        return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -101,13 +112,15 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         res = dict()
-        rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
-        bleu_dict = bleu_ngram_one_sample(pred, gold)
-        res.update(rouge_dict)
-        res.update(bleu_dict)
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
         return res
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 

evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -3,19 +3,20 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
-    train_split='train',
+    train_split=None,
     eval_split='train', # only have train split
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
-            self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
-                                       'r').read() + '\nQuestion: '
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'
 
@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
         prompt = self.prompt_template.format(query=query)
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return GPQAAdapter.get_multiple_choice_answer(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """

evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -15,12 +14,12 @@ logger = get_logger()
 
 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
-    train_split='train',
+    train_split=None,
     eval_split='test',
     prompt_template="Question: {query}\nLet's think step by step\nAnswer:",
 )
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):
 
         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -4,9 +4,8 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,8 +17,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=ContinuationLogitsModelAdapter,
+    model_adapter=OutputType.CONTINUOUS,
+    output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -30,8 +31,6 @@ logger = get_logger()
 )
 class HellaSwagAdapter(DataAdapter):
 
-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +39,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0
 
         super().__init__(**kwargs)
+        self.choices = ['0', '1', '2', '3']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -89,7 +89,7 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(ctx_continuation_pair_list)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -107,7 +107,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.CONTINUOUS:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -115,12 +115,8 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
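
For context, a toy illustration of the continuation-logits branch kept in parse_pred_result above: each ending's summed log-probability is divided by the ending's character length before taking the argmax, so longer endings are not penalized. The numbers and endings below are made up.

import numpy as np

# summed log-probabilities returned for each candidate ending (invented values)
result = np.array([-12.4, -9.8, -15.1, -11.0])
endings = ['walks away', 'keeps riding the bike', 'x', 'sits down on the bench']
completion_len = np.array([float(len(e)) for e in endings])

best_choice_idx = np.argmax(result / completion_len)
print(str(best_choice_idx))  # '1' for these numbers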

evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -13,8 +13,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
@@ -66,7 +66,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:

evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(input_d['prompt'])
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
@@ -47,7 +46,7 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:

evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """

evalscope/benchmarks/math_500/math_500_adapter.py
@@ -1,6 +1,5 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -10,9 +9,9 @@ logger = get_logger()
 
 @Benchmark.register(
     name='math_500',
+    pretty_name='MATH-500',
     dataset_id='AI-ModelScope/MATH-500',
-    model_adapter=ChatGenerationModelAdapter,
-    subset_list=['default'],
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
@@ -24,6 +23,12 @@ class Math500Adapter(DataAdapter):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
@@ -31,7 +36,7 @@ class Math500Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
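
For context, a standalone sketch of what the new load override does via reformat_subset (defined in data_adapter.py above): the single 'default' subset is regrouped into 'Level {n}' subsets keyed on each sample's level field. The sample records below are invented.

from collections import defaultdict

subset_list = ['Level 1', 'Level 2']
data_dict = {'default': {'test': [{'problem': 'p1', 'level': 1},
                                  {'problem': 'p2', 'level': 2},
                                  {'problem': 'p3', 'level': 1}]}}

res_dict = {key: defaultdict(list) for key in subset_list}
for sub_data in data_dict.values():
    for sample_d in sub_data['test']:
        new_subset_name = 'Level {}'.format(sample_d['level'])
        if new_subset_name in subset_list:
            res_dict[new_subset_name]['test'].append(sample_d)

print({name: len(split['test']) for name, split in res_dict.items()})  # {'Level 1': 2, 'Level 2': 1}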