evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (85)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +15 -18
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +12 -11
  7. evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  8. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  9. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  10. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  11. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  12. evalscope/benchmarks/data_adapter.py +59 -21
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  22. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  38. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  39. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  40. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  41. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  42. evalscope/benchmarks/super_gpqa/utils.py +85 -0
  43. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  45. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  46. evalscope/benchmarks/utils.py +43 -0
  47. evalscope/collections/evaluator.py +14 -5
  48. evalscope/config.py +15 -2
  49. evalscope/constants.py +14 -0
  50. evalscope/evaluator/evaluator.py +51 -13
  51. evalscope/metrics/llm_judge.py +104 -0
  52. evalscope/metrics/named_metrics.py +1 -0
  53. evalscope/models/__init__.py +2 -1
  54. evalscope/models/base_adapter.py +25 -5
  55. evalscope/models/chat_adapter.py +3 -0
  56. evalscope/models/choice_adapter.py +4 -0
  57. evalscope/models/custom_adapter.py +2 -0
  58. evalscope/models/register.py +28 -0
  59. evalscope/models/server_adapter.py +35 -8
  60. evalscope/perf/arguments.py +13 -7
  61. evalscope/perf/benchmark.py +5 -0
  62. evalscope/perf/http_client.py +15 -5
  63. evalscope/perf/main.py +1 -0
  64. evalscope/perf/utils/analysis_result.py +1 -1
  65. evalscope/report/app.py +3 -0
  66. evalscope/report/combinator.py +2 -2
  67. evalscope/run.py +6 -5
  68. evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope/third_party/thinkbench/eval.py +220 -55
  70. evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  72. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  73. evalscope/utils/chat_service.py +1 -0
  74. evalscope/utils/filters.py +59 -0
  75. evalscope/utils/logger.py +3 -3
  76. evalscope/version.py +2 -2
  77. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
  78. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
  79. tests/cli/test_all.py +144 -0
  80. tests/cli/test_collection.py +28 -2
  81. tests/cli/test_run.py +201 -32
  82. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  83. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  84. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  85. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -1,7 +1,7 @@
  import argparse
  import json
 
- from evalscope.constants import EvalBackend, EvalStage, EvalType
+ from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType
 
 
  class ParseStrArgsAction(argparse.Action):
@@ -73,6 +73,11 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
  parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
  parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
+
+ # LLMJudge arguments
+ parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
+ parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
+ parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
  # yapf: enable
 
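The new judge flags behave like any other argparse options. A minimal usage sketch, assuming only the flag names, types, and defaults shown above; JudgeStrategy.AUTO is replaced by a literal placeholder and the sample values are purely illustrative:

import argparse
import json

# Rebuild just the judge-related flags from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--judge-strategy', type=str, default='auto')  # stands in for JudgeStrategy.AUTO
parser.add_argument('--judge-model-args', type=json.loads, default='{}')
parser.add_argument('--judge-worker-num', type=int, default=8)

args = parser.parse_args([
    '--judge-strategy', 'llm',                      # hypothetical strategy value
    '--judge-model-args', '{"model": "qwen-max"}',  # hypothetical judge model args
    '--judge-worker-num', '4',
])
print(args.judge_model_args)  # type=json.loads turns the JSON string into a dict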
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import OutputType
  from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()
 
  @Benchmark.register(
  name='aime24',
+ pretty_name='AIME-2024',
  dataset_id='HuggingFaceH4/aime_2024',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
  metric_list=['AveragePass@1'],
  few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME24Adapter(DataAdapter):
  problem = input_d['problem']
  full_prompt = self.prompt_template.format(query=problem)
 
- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
evalscope/benchmarks/aime/aime25_adapter.py CHANGED
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import OutputType
  from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()
 
  @Benchmark.register(
  name='aime25',
+ pretty_name='AIME-2025',
  dataset_id='TIGER-Lab/AIME25',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
  metric_list=['AveragePass@1'],
  few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME25Adapter(DataAdapter):
  problem = input_d['question']
  full_prompt = self.prompt_template.format(query=problem)
 
- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
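The adapters above now return self.gen_prompt_data(full_prompt) instead of a hand-built dict. That helper is defined in evalscope/benchmarks/data_adapter.py (changed in this diff but not shown in this section); a rough stand-in based only on the dict the removed lines used to return, so the real helper likely adds more fields:

# Hypothetical stand-in for DataAdapter.gen_prompt_data; the actual helper
# lives in data_adapter.py and is not reproduced in this diff.
def gen_prompt_data(self, full_prompt: str) -> dict:
    return {'data': [full_prompt], 'system_prompt': self.system_prompt}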
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -4,9 +4,8 @@ import json
  import os
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
@@ -17,19 +16,20 @@ logger = get_logger()
 
  @Benchmark.register(
  name='arc',
+ pretty_name='ARC',
  dataset_id='modelscope/ai2_arc',
- model_adapter=MultiChoiceModelAdapter,
+ model_adapter=OutputType.MULTIPLE_CHOICE,
+ output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=['ARC-Easy', 'ARC-Challenge'],
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
- prompt_template='',
+ prompt_template=
+ 'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.', # noqa
  )
  class ARCAdapter(DataAdapter):
 
- choices = ['A', 'B', 'C', 'D']
-
  def __init__(self, **kwargs):
  few_shot_num = kwargs.get('few_shot_num', None)
  if few_shot_num is None:
@@ -42,6 +42,8 @@ class ARCAdapter(DataAdapter):
 
  super().__init__(**kwargs)
 
+ self.choices = ['A', 'B', 'C', 'D']
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  """
  Load the dataset from local disk.
@@ -60,7 +62,7 @@ class ARCAdapter(DataAdapter):
  for split_name in ['Train', 'Test']:
  split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
  if os.path.exists(split_path):
- with open(split_path, 'r', errors='ignore') as in_f:
+ with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
  rows = []
  for line in in_f:
  item = json.loads(line.strip())
@@ -107,12 +109,11 @@ class ARCAdapter(DataAdapter):
  {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
  """
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
- context: str = '\n'.join(few_shot_prompts)
+ context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)
 
- # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
- full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
+ full_prompt = self.prompt_template.format(query=context)
 
- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -130,14 +131,10 @@ class ARCAdapter(DataAdapter):
  Returns:
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
- if eval_type == EvalType.CHECKPOINT:
+ if self.model_adapter == OutputType.MULTIPLE_CHOICE:
  return result
- elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
- elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  else:
- raise ValueError(f'Invalid eval_type: {eval_type}')
+ return ResponseParser.parse_first_option(text=result)
 
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)
@@ -152,8 +149,8 @@ class ARCAdapter(DataAdapter):
  choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
  example += '\n' + choices_prompts
 
- example += '\nAnswer:'
  if include_answer:
+ example += '\nAnswer:'
  example += ' {}\n\n'.format(input_d['answerKey'])
 
  return example
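In generation mode the ARC adapter now pulls the first option letter out of the model's free-form reply instead of branching on eval_type. An illustrative call, assuming only the parser name and keyword argument shown in the diff above (the sample reply is invented):

from evalscope.utils import ResponseParser

# A reply in the shape the new ARC prompt template asks for.
reply = 'Comparing the options step by step... The best answer is C'
print(ResponseParser.parse_first_option(text=reply))  # expected to yield 'C'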
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -8,8 +8,6 @@ import re
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
  from evalscope.metrics import exact_match
- from evalscope.models.chat_adapter import ChatGenerationModelAdapter
- from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
@@ -60,8 +58,8 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 
  @Benchmark.register(
  name='bbh',
+ pretty_name='BBH',
  dataset_id='modelscope/bbh',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
  few_shot_num=3,
@@ -94,7 +92,7 @@ class BBHAdapter(DataAdapter):
  else:
  file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
  if os.path.exists(file_path):
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding='utf-8') as f:
  examples = json.load(f)['examples']
  if subset_name in data_dict:
  data_dict[subset_name].update({split_name: examples})
@@ -125,7 +123,7 @@ class BBHAdapter(DataAdapter):
  cot_prompts = ''
  full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
 
- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def gen_prompts(self, data_dict: dict) -> dict:
  """
@@ -153,7 +151,9 @@ class BBHAdapter(DataAdapter):
  for sub_name, sub_data_dict in data_dict.items():
  few_shot_data = []
  if self.few_shot_num > 0:
- with open(os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r') as f:
+ with open(
+ os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
+ encoding='utf-8') as f:
  cot_prompt_str = f.read()
  few_shot_data = [cot_prompt_str]
 
evalscope/benchmarks/benchmark.py CHANGED
@@ -1,12 +1,13 @@
  import copy
- from dataclasses import dataclass, field
+ from collections import OrderedDict
+ from dataclasses import dataclass, field, fields
  from typing import TYPE_CHECKING, Dict, List, Optional
 
+ from evalscope.constants import OutputType
+
  if TYPE_CHECKING:
  from evalscope.benchmarks import DataAdapter
 
- from evalscope.models import BaseModelAdapter
-
  BENCHMARK_MAPPINGS = {}
 
 
@@ -15,8 +16,9 @@ class BenchmarkMeta:
  name: str
  dataset_id: str
  data_adapter: 'DataAdapter'
- model_adapter: BaseModelAdapter
- subset_list: List[str] = field(default_factory=list)
+ model_adapter: Optional[str] = OutputType.GENERATION
+ output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
+ subset_list: List[str] = field(default_factory=lambda: ['default'])
  metric_list: List[str] = field(default_factory=list)
  few_shot_num: int = 0
  few_shot_random: bool = False
@@ -26,6 +28,8 @@ class BenchmarkMeta:
  system_prompt: Optional[str] = None
  query_template: Optional[str] = None
  pretty_name: Optional[str] = None
+ filters: Optional[OrderedDict] = None
+ extra_params: Optional[Dict] = field(default_factory=dict)
 
  def _update(self, args: dict):
  if args.get('local_path'):
@@ -37,12 +41,9 @@ class BenchmarkMeta:
  return self.__dict__
 
  def to_string_dict(self) -> dict:
- cur_dict = copy.deepcopy(self.__dict__)
+ cur_dict = copy.deepcopy(self.to_dict())
  # cur_dict['data_adapter'] = self.data_adapter.__name__
- # cur_dict['model_adapter'] = self.model_adapter.__name__
- # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
  del cur_dict['data_adapter']
- del cur_dict['model_adapter']
  return cur_dict
 
  def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -66,13 +67,13 @@ class Benchmark:
  return benchmark
 
  @classmethod
- def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+ def register(cls, name: str, dataset_id: str, **kwargs):
 
  def register_wrapper(data_adapter):
  if name in BENCHMARK_MAPPINGS:
  raise Exception(f'Benchmark {name} already registered')
  BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
- name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+ name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
  return data_adapter
 
  return register_wrapper
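With model_adapter now an optional OutputType string on BenchmarkMeta, Benchmark.register no longer takes a model adapter class. A minimal registration sketch in the new style, grounded in the adapter registrations shown in this diff; the benchmark name, dataset id, and adapter body are placeholders, and DataAdapter may require additional methods not shown here:

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import OutputType

@Benchmark.register(
    name='my_benchmark',                   # placeholder
    pretty_name='My-Benchmark',
    dataset_id='my-org/my_dataset',        # placeholder
    model_adapter=OutputType.GENERATION,   # matches the BenchmarkMeta default
    output_types=[OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    eval_split='test',
)
class MyBenchmarkAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        return self.gen_prompt_data(input_d['question'])

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']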
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -3,9 +3,8 @@ import csv
  import os
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.metrics.metrics import exact_match
- from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
@@ -126,19 +125,20 @@ SUBJECT_MAPPING = {
 
  @Benchmark.register(
  name='ceval',
+ pretty_name='C-Eval',
  dataset_id='modelscope/ceval-exam',
- model_adapter=MultiChoiceModelAdapter,
+ model_adapter=OutputType.MULTIPLE_CHOICE,
+ output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
- prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
+ prompt_template=
+ '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
  )
  class CEVALAdapter(DataAdapter):
 
- choices = ['A', 'B', 'C', 'D']
-
  def __init__(self, **kwargs):
 
  few_shot_num = kwargs.get('few_shot_num', 0)
@@ -148,6 +148,7 @@ class CEVALAdapter(DataAdapter):
  super().__init__(**kwargs)
 
  self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+ self.choices = ['A', 'B', 'C', 'D']
 
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -207,7 +208,7 @@ class CEVALAdapter(DataAdapter):
  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
  full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)
 
- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -225,22 +226,17 @@ class CEVALAdapter(DataAdapter):
  Returns:
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
- if eval_type == EvalType.CHECKPOINT:
+ if self.model_adapter == OutputType.MULTIPLE_CHOICE:
  return result
- elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
- elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
- raise ValueError(f'Invalid eval_type: {eval_type}')
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)
 
- @classmethod
- def _format_example(cls, input_d: dict, include_answer=True):
+ def _format_example(self, input_d: dict, include_answer=True):
  example = '问题:' + input_d['question']
- for choice in cls.choices:
+ for choice in self.choices:
  example += f'\n{choice}. {input_d[f"{choice}"]}'
 
evalscope/benchmarks/chinese_simple_qa/__init__.py
File without changes
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py ADDED
@@ -0,0 +1,168 @@
+ import re
+ from collections import defaultdict
+ from typing import Any, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ GRADER_TEMPLATE = """
+ 请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为:【正确】、【错误】或【未尝试】。
+
+ 首先,我们将列出每个评定类别的示例,然后请您对新问题的预测答案进行评定。
+ 以下是【正确】的答复示例:
+ ```
+ 问题:贝拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:Malia Obama and Sasha Obama
+ 模型预测2:玛丽亚和萨沙
+ 模型预测3:大多数人会说是玛丽亚和萨莎,但我不确定,需要再确认
+ 模型预测4:巴拉克·奥巴马有两个女儿,她们分别是玛丽亚·安和娜塔莎·玛丽安,但通常称作玛丽亚·奥巴马和萨莎·奥巴马。玛丽亚出生于1998年7月4日,萨莎出生于2001年6月10日。
+ ```
+ 这些答复均为【正确】,因为:
+ - 完整地包含了标准答案中的重要信息。
+ - 不包含任何与标准答案矛盾的信息。
+ - 只关注语义内容,中英文,大小写、标点、语法和顺序不重要。
+ - 答复中出现模糊语句或猜测是可以接受的,前提是包含了标准答案且不含有不正确信息或矛盾。
+
+ 以下是【错误】的答复示例:
+ ```
+ 问题:巴拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:玛丽亚
+ 模型预测2:玛丽亚、萨莎和苏珊
+ 模型预测3:巴拉克·奥巴马没有孩子
+ 模型预测4:我认为是玛丽亚和萨莎。或者是玛丽亚和杰基。或者是乔伊和玛丽亚。
+ 模型预测5:虽然我不知道他们的确切名字,但能说出巴拉克·奥巴马有三个孩子。
+ 模型预测6:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
+ ```
+ 这些答复均为【错误】,因为:
+ - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:“可能是”,“虽然我不确定,但我认为”),也视为错误。
+
+ 以下是【未尝试】的答复示例:
+ ```
+ 问题:巴拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:我不知道。
+ 模型预测2:我需要更多关于您所指奥巴马的上下文。
+ 模型预测3:不查阅网络我无法回答这个问题,不过我知道巴拉克·奥巴马有两个孩子。
+ 模型预测4:巴拉克·奥巴马有两个孩子。我知道其中一个叫玛丽亚,但我不确定另一个的名字。
+ ```
+ 这些答复均为【未尝试】,因为:
+ - 没有包含标准答案中的重要信息。
+ - 回复中没有与标准答案矛盾的陈述。
+
+ 另外注意以下几点:
+ - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题“金山铁路黄浦江特大桥的全长是多少米?”,标准答案为“3518.17”:
+ - 预测答案“3518”、“3518.1”、“3518.17”均为【正确】。
+ - 预测答案“3520”和“3600”均为【错误】。
+ - 预测答案“大约3500米”和“超过3000米”被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
+ - 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
+ - 例如,考虑问题“菱镁矿的主要化学成分是什么?”标准答案为“碳酸镁(MgCO3)”。“碳酸镁”或“MgCO3”均视为【正确】答案。
+ - 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
+ - 例如,问题“巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?”标准答案为“意大利撒丁岛”,预测答案“撒丁岛”被视为【正确】。
+ - 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
+ - 例如,如果标准答案是“Robinson”,那么回答鲁滨逊或者鲁滨孙均正确。
+
+ 下面是一个新的问题示例。请只回复A、B、C之一,不要道歉或纠正自己的错误,只需要评估该回答。
+ ```
+ 问题: {question}
+ 正确答案: {target}
+ 预测答案: {predicted_answer}
+ ```
+
+ 将此新问题的预测答案评定为以下之一:
+ A:【正确】
+ B:【错误】
+ C:【未尝试】
+
+ 只返回字母"A"、"B"或"C",无须添加其他文本。
+ """.strip() # noqa E501
+
+ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应用科学', '生活、艺术与文化', '社会', '自然与自然科学']
+
+
+ @Benchmark.register(
+ name='chinese_simpleqa',
+ pretty_name='Chinese SimpleQA',
+ subset_list=SUBSET_LIST,
+ dataset_id='AI-ModelScope/Chinese-SimpleQA',
+ metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train')
+ class ChineseSimpleQAAdapter(DataAdapter):
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # register metrics
+ metric_registry.register(Metric(name='is_correct', object=mean))
+ metric_registry.register(Metric(name='is_incorrect', object=mean))
+ metric_registry.register(Metric(name='is_not_attempted', object=mean))
+
+ # whether to use LLM as a judge
+ self.llm_as_a_judge = True
+
+ def load(self, **kwargs):
+ kwargs['subset_list'] = ['default']
+ data_dict = super().load(**kwargs)
+ return self.reformat_subset(data_dict, subset_key='primary_category', format='{}')
+
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+ question = input_d['question']
+ return self.gen_prompt_data(question)
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ return input_d['answer']
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+ return result.strip()
+
+ def match(self, gold: str, pred: str) -> float:
+ # simple match
+ logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
+ is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
+ is_incorrect = not is_correct
+ is_not_attempted = 0
+ return {
+ 'is_correct': is_correct,
+ 'is_incorrect': is_incorrect,
+ 'is_not_attempted': is_not_attempted,
+ }
+
+ def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+ raw_input = kwargs.get('raw_input', None)
+ question = raw_input['question']
+ # get grading response
+ prompt = GRADER_TEMPLATE.format(question=question, target=gold, predicted_answer=pred)
+ system_prompt = '你是一个智能助手,请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。'
+ grading_response = judge(prompt, system_prompt)
+ # parse grading response
+ match = re.search(r'(A|B|C)', grading_response)
+ res = match.group(0) if match else 'C'
+ return {
+ 'is_correct': 1 if res == 'A' else 0,
+ 'is_incorrect': 1 if res == 'B' else 0,
+ 'is_not_attempted': 1 if res == 'C' else 0,
+ }
+
+ def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+ """
+ compute weighted mean of the bleu score of all samples
+
+ Args:
+ review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
+ """
+ # zip dict answers
+ res_dict = defaultdict(list)
+ for res in review_res_list:
+ for key, value in res.items():
+ res_dict[key].append(value)
+
+ return super().compute_metric(res_dict, **kwargs)
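The adapter's llm_match relies on the judge returning a single letter that maps onto the three registered metrics. A small self-contained sketch of just that parsing step, lifted from the logic above (the sample grading response is invented and the judge call itself is omitted):

import re

def parse_grade(grading_response: str) -> dict:
    # Mirror llm_match: take the first A/B/C in the judge's reply,
    # defaulting to C ("not attempted") when none is found.
    match = re.search(r'(A|B|C)', grading_response)
    res = match.group(0) if match else 'C'
    return {
        'is_correct': 1 if res == 'A' else 0,
        'is_incorrect': 1 if res == 'B' else 0,
        'is_not_attempted': 1 if res == 'C' else 0,
    }

print(parse_grade('B:【错误】'))  # {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0}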
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -4,9 +4,8 @@ import csv
  import os
 
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
@@ -103,23 +102,25 @@ SUBJECT_MAPPING = {
 
  @Benchmark.register(
  name='cmmlu',
+ pretty_name='C-MMLU',
  dataset_id='modelscope/cmmlu',
- model_adapter=MultiChoiceModelAdapter,
+ model_adapter=OutputType.MULTIPLE_CHOICE,
+ output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
- prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
+ prompt_template=
+ '以下是关于{subset_name}的单项选择题,请给出正确答案的选项。你的回答的最后一行应该是这样的格式:“答案:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):
 
- choices = ['A', 'B', 'C', 'D']
-
  def __init__(self, **kwargs):
  super().__init__(**kwargs)
 
  self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+ self.choices = ['A', 'B', 'C', 'D']
 
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -172,7 +173,7 @@ class CMMLUAdapter(DataAdapter):
 
  full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -190,26 +191,21 @@ class CMMLUAdapter(DataAdapter):
  Returns:
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
- if eval_type == EvalType.CHECKPOINT:
+ if self.model_adapter == OutputType.MULTIPLE_CHOICE:
  return result
- elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
- elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
- raise ValueError(f'Invalid eval_type: {eval_type}')
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)
 
- @classmethod
- def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+ def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
 
  input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
 
  example: str = input_d['Question']
- for j in range(len(cls.choices)):
- example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+ for j in range(len(self.choices)):
+ example += '\n{}. {}'.format(self.choices[j], input_choices[j])
 
  example += '\nAnswer:'
  if include_answer:
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -18,8 +18,8 @@ logger = get_logger()
 
  @Benchmark.register(
  name='competition_math',
+ pretty_name='MATH',
  dataset_id='modelscope/competition_math',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
  metric_list=['AveragePass@1'],
  few_shot_num=4,
@@ -58,7 +58,7 @@ class CompetitionMathAdapter(DataAdapter):
  split_data = []
  for file_path in split_files:
  if os.path.exists(file_path):
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding='utf-8') as f:
  split_data.append(json.load(f))
  data_dict[subset_name][split_name] = split_data
 
@@ -81,7 +81,7 @@ class CompetitionMathAdapter(DataAdapter):
  use_fewshot = self.few_shot_num > 0
  query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
  full_prompt = self.prompt_template.format(query=query)
- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)
 
  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.