evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. evalscope/arguments.py +1 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +9 -9
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  10. evalscope/benchmarks/data_adapter.py +31 -21
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  23. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  24. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  25. evalscope/benchmarks/race/race_adapter.py +12 -16
  26. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  27. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  28. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  29. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  30. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  31. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  32. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  35. evalscope/benchmarks/utils.py +43 -0
  36. evalscope/collections/evaluator.py +11 -2
  37. evalscope/config.py +10 -2
  38. evalscope/constants.py +7 -0
  39. evalscope/metrics/named_metrics.py +1 -0
  40. evalscope/models/__init__.py +2 -1
  41. evalscope/models/base_adapter.py +25 -5
  42. evalscope/models/chat_adapter.py +3 -0
  43. evalscope/models/choice_adapter.py +4 -0
  44. evalscope/models/custom_adapter.py +2 -0
  45. evalscope/models/register.py +28 -0
  46. evalscope/models/server_adapter.py +35 -8
  47. evalscope/perf/arguments.py +13 -7
  48. evalscope/perf/http_client.py +6 -4
  49. evalscope/perf/utils/analysis_result.py +1 -1
  50. evalscope/report/app.py +3 -0
  51. evalscope/report/combinator.py +2 -2
  52. evalscope/run.py +5 -4
  53. evalscope/third_party/thinkbench/eval.py +220 -55
  54. evalscope/third_party/thinkbench/infer.py +37 -7
  55. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  56. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  57. evalscope/utils/chat_service.py +1 -0
  58. evalscope/utils/filters.py +59 -0
  59. evalscope/utils/logger.py +3 -3
  60. evalscope/version.py +2 -2
  61. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
  62. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
  63. tests/cli/test_collection.py +1 -1
  64. tests/cli/test_run.py +135 -28
  65. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  66. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  67. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -16,9 +16,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -31,18 +30,31 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
 
-        data_file_list = glob.glob(os.path.join(self.dataset_id, '*.jsonl'))
+        data_file_dict = defaultdict(str)
         data_list = []
 
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in data_file_list:
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {'default': {'test': data_list}}
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
@@ -65,7 +77,7 @@ class GeneralQAAdapter(DataAdapter):
 
         query = input_d.get('question', '') or input_d.get('query', '')
         prompt = self.prompt_template.format(query=query)
-        return {'data': [prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -100,10 +112,12 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         res = dict()
-        rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
-        bleu_dict = bleu_ngram_one_sample(pred, gold)
-        res.update(rouge_dict)
-        res.update(bleu_dict)
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
        return res
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
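For reference, the path handling added to `load()` above can be summarized in a standalone sketch (illustrative only, not an evalscope API): a directory is mapped to one `<subset>.jsonl` file per requested subset, while a single file becomes a subset named after its basename. The helper name `resolve_data_files` is hypothetical.

```python
import os
from collections import defaultdict


def resolve_data_files(dataset_name_or_path: str, subset_list: list) -> dict:
    """Standalone sketch of the subset resolution added in GeneralQAAdapter.load()."""
    data_file_dict = defaultdict(str)
    if os.path.isdir(dataset_name_or_path):
        # a directory is expected to hold one '<subset>.jsonl' file per subset
        for subset_name in subset_list:
            data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
    elif os.path.isfile(dataset_name_or_path):
        # a single file becomes a subset named after its basename
        cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
        data_file_dict[cur_subset_name] = dataset_name_or_path
    else:
        raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
    return dict(data_file_dict)


# resolve_data_files('data/general_qa', ['default', 'zh'])
# -> {'default': 'data/general_qa/default.jsonl', 'zh': 'data/general_qa/zh.jsonl'}
```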
evalscope/benchmarks/gpqa/gpqa_adapter.py

@@ -3,15 +3,16 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
-            self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
-                                       'r').read() + '\nQuestion: '
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'
 
@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
         prompt = self.prompt_template.format(query=query)
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return GPQAAdapter.get_multiple_choice_answer(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -15,8 +14,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):
 
         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -4,9 +4,8 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,8 +17,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=ContinuationLogitsModelAdapter,
+    model_adapter=OutputType.CONTINUOUS,
+    output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -30,8 +31,6 @@ logger = get_logger()
 )
 class HellaSwagAdapter(DataAdapter):
 
-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +39,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0
 
         super().__init__(**kwargs)
+        self.choices = ['0', '1', '2', '3']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -89,7 +89,7 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(ctx_continuation_pair_list)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -107,7 +107,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.CONTINUOUS:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -115,12 +115,8 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
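The continuation-logits branch kept above scores each candidate ending by its summed log-likelihood divided by the ending's length, then picks the argmax. A small self-contained illustration with made-up numbers:

```python
import numpy as np

# made-up per-ending summed log-likelihoods for one HellaSwag sample
result = np.array([-12.4, -9.8, -15.1, -11.0])
# made-up character lengths of the candidate endings
completion_len = np.array([48, 52, 61, 44])

# longer endings accumulate more negative log-probability, so normalize by length
best_choice_idx = np.argmax(result / completion_len)
print(str(best_choice_idx))  # '1' for these numbers
```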
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -13,8 +13,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
@@ -66,7 +66,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(input_d['prompt'])
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
evalscope/benchmarks/iquiz/iquiz_adapter.py

@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/math_500/math_500_adapter.py

@@ -1,9 +1,5 @@
-from collections import defaultdict
-
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -13,8 +9,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='math_500',
+    pretty_name='MATH-500',
     dataset_id='AI-ModelScope/MATH-500',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -40,7 +36,7 @@ class Math500Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -136,8 +135,10 @@ SUBJECT_MAPPING = {
 
 @Benchmark.register(
     name='mmlu',
+    pretty_name='MMLU',
     dataset_id='modelscope/mmlu',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -147,8 +148,6 @@ SUBJECT_MAPPING = {
 )
 class MMLUAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 5)
@@ -159,6 +158,7 @@ class MMLUAdapter(DataAdapter):
         super().__init__(**kwargs)
 
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -227,7 +227,7 @@ class MMLUAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -245,26 +245,21 @@ class MMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
 
         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
 
         example: str = input_d['input']
-        for j in range(len(cls.choices)):
-            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
 
         example += '\nAnswer:'
         if include_answer:
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py

@@ -2,9 +2,8 @@ from collections import defaultdict
 from typing import Any, Dict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 SUBSET_LIST = [
@@ -15,8 +14,10 @@ SUBSET_LIST = [
 
 @Benchmark.register(
     name='mmlu_pro',
+    pretty_name='MMLU-Pro',
     dataset_id='modelscope/MMLU-Pro',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -47,7 +48,7 @@ class MMLUProAdapter(DataAdapter):
             self.__form_options(input_d['options']) + '\n'
 
         full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def format_fewshot_examples(self, few_shot_list):
         # load few-shot prompts for each category
@@ -88,7 +89,10 @@ class MMLUProAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        return ResponseParser.parse_first_option(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/musr/musr_adapter.py

@@ -2,9 +2,8 @@ import ast
 from typing import Any
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
@@ -12,7 +11,8 @@ from evalscope.utils.utils import ResponseParser
     name='musr',
     pretty_name='MuSR',
     dataset_id='AI-ModelScope/MuSR',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -41,7 +41,7 @@ class MuSRAdapter(DataAdapter):
         full_prompt = self.prompt_template.format(
             narrative=input_d['narrative'], question=input_d['question'], choices=choices)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def format_choice(self, options: list):
         option_str = ''
@@ -59,7 +59,10 @@ class MuSRAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return ResponseParser.parse_first_option(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/process_bench/process_bench_adapter.py

@@ -5,7 +5,6 @@ from typing import Any, List
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
 from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
-from evalscope.models import ChatGenerationModelAdapter
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
@@ -14,7 +13,6 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
     name='process_bench',
     pretty_name='ProcessBench',
     dataset_id='Qwen/ProcessBench',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
     few_shot_num=0,
@@ -26,7 +24,7 @@ class ProcessBenchAdapter(DataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt'), encoding='utf-8').read()
 
         # register metrics
         metric_registry.register(Metric(name='error_acc', object=mean))
@@ -50,7 +48,7 @@ class ProcessBenchAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -84,7 +82,12 @@ class ProcessBenchAdapter(DataAdapter):
                 correct_data.append(res)
             else:
                 error_data.append(res)
-        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        data = {}
+        if len(correct_data) != 0:
+            data.update({'correct_acc': correct_data})
+        if len(error_data) != 0:
+            data.update({'error_acc': error_data})
+        data.update({'simple_f1_score': (correct_data, error_data)})
         return super().compute_metric(data)
 
     @staticmethod
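The `compute_metric` change above only reports a bucket when it actually contains samples, which matters when a run happens to produce only "correct" or only "error" ProcessBench cases; averaging an empty bucket would be meaningless. A trimmed, self-contained illustration of the gating with hypothetical per-sample scores:

```python
# hypothetical per-sample review scores
correct_data = [1.0, 0.0, 1.0]   # samples that fell into the "correct" bucket
error_data = []                  # no samples in the "error" bucket for this hypothetical run

data = {}
if len(correct_data) != 0:
    data['correct_acc'] = correct_data
if len(error_data) != 0:
    data['error_acc'] = error_data
data['simple_f1_score'] = (correct_data, error_data)

print(sorted(data))  # ['correct_acc', 'simple_f1_score'] -- 'error_acc' is skipped
```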
evalscope/benchmarks/race/race_adapter.py

@@ -3,9 +3,8 @@
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -17,8 +16,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='race',
+    pretty_name='RACE',
     dataset_id='modelscope/race',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['high', 'middle'],
     metric_list=['AverageAccuracy'],
     few_shot_num=3,
@@ -27,8 +28,6 @@ logger = get_logger()
 )
 class RACEAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         few_shot_num = kwargs.get('few_shot_num', 3)
         if few_shot_num > 3:
@@ -37,6 +36,8 @@ class RACEAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -82,7 +83,7 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -100,26 +101,21 @@ class RACEAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
-            raise ValueError(f'Unknown eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
 
         input_choices: list = input_d['options']
 
         example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
-        for j in range(len(cls.choices)):
-            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
 
         example += '\nAnswer:'
         if include_answer:
File without changes
evalscope/benchmarks/simple_qa/simple_qa_adapter.py

@@ -0,0 +1,20 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='simple_qa',
+    pretty_name='SimpleQA',
+    dataset_id='AI-ModelScope/SimpleQA',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test')
+class SimpleQAAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
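With the adapter registered under `name='simple_qa'`, the new benchmark should be selectable like any other dataset. A minimal sketch, assuming the `TaskConfig`/`run_task` interface from evalscope's documentation; the model identifier and the `limit` value are placeholders:

```python
from evalscope import TaskConfig, run_task

# minimal smoke-test sketch; 'qwen2.5-7b-instruct' is a placeholder model name
task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    datasets=['simple_qa'],  # the name registered by SimpleQAAdapter above
    limit=5,                 # assumption: restrict to a handful of samples
)
run_task(task_cfg)
```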
File without changes