evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as potentially problematic.

Files changed (68)
  1. evalscope/arguments.py +1 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +9 -9
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  10. evalscope/benchmarks/data_adapter.py +31 -21
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  23. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  24. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  25. evalscope/benchmarks/race/race_adapter.py +12 -16
  26. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  27. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  28. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  29. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  30. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  31. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  32. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  35. evalscope/benchmarks/utils.py +43 -0
  36. evalscope/collections/evaluator.py +11 -2
  37. evalscope/config.py +10 -2
  38. evalscope/constants.py +7 -0
  39. evalscope/metrics/named_metrics.py +1 -0
  40. evalscope/models/__init__.py +2 -1
  41. evalscope/models/base_adapter.py +25 -5
  42. evalscope/models/chat_adapter.py +3 -0
  43. evalscope/models/choice_adapter.py +4 -0
  44. evalscope/models/custom_adapter.py +2 -0
  45. evalscope/models/register.py +28 -0
  46. evalscope/models/server_adapter.py +35 -8
  47. evalscope/perf/arguments.py +13 -7
  48. evalscope/perf/http_client.py +6 -4
  49. evalscope/perf/utils/analysis_result.py +1 -1
  50. evalscope/report/app.py +3 -0
  51. evalscope/report/combinator.py +2 -2
  52. evalscope/run.py +5 -4
  53. evalscope/third_party/thinkbench/eval.py +220 -55
  54. evalscope/third_party/thinkbench/infer.py +37 -7
  55. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  56. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  57. evalscope/utils/chat_service.py +1 -0
  58. evalscope/utils/filters.py +59 -0
  59. evalscope/utils/logger.py +3 -3
  60. evalscope/version.py +2 -2
  61. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
  62. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
  63. tests/cli/test_collection.py +1 -1
  64. tests/cli/test_run.py +135 -28
  65. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  66. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  67. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalStage, EvalType
+from evalscope.constants import EvalBackend, EvalStage, EvalType, OutputType
 
 
 class ParseStrArgsAction(argparse.Action):
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import OutputType
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='aime24',
+    pretty_name='AIME-2024',
     dataset_id='HuggingFaceH4/aime_2024',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME24Adapter(DataAdapter):
         problem = input_d['problem']
         full_prompt = self.prompt_template.format(query=problem)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/aime/aime25_adapter.py CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import OutputType
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='aime25',
+    pretty_name='AIME-2025',
     dataset_id='TIGER-Lab/AIME25',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME25Adapter(DataAdapter):
         problem = input_d['question']
         full_prompt = self.prompt_template.format(query=problem)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -4,9 +4,8 @@ import json
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -17,19 +16,20 @@ logger = get_logger()
 
 @Benchmark.register(
     name='arc',
+    pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
    eval_split='test',
-    prompt_template='',
+    prompt_template=
+    'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n{query}',
 )
 class ARCAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         few_shot_num = kwargs.get('few_shot_num', None)
         if few_shot_num is None:
@@ -42,6 +42,8 @@ class ARCAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
         Load the dataset from local disk.
@@ -60,7 +62,7 @@ class ARCAdapter(DataAdapter):
             for split_name in ['Train', 'Test']:
                 split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
                 if os.path.exists(split_path):
-                    with open(split_path, 'r', errors='ignore') as in_f:
+                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
                         rows = []
                         for line in in_f:
                             item = json.loads(line.strip())
@@ -107,12 +109,11 @@ class ARCAdapter(DataAdapter):
             {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
         """
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context: str = '\n'.join(few_shot_prompts)
+        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
-        full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
+        full_prompt = self.prompt_template.format(query=context)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -130,14 +131,10 @@ class ARCAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_capital(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -8,8 +8,6 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics import exact_match
-from evalscope.models.chat_adapter import ChatGenerationModelAdapter
-from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -60,8 +58,8 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 
 @Benchmark.register(
     name='bbh',
+    pretty_name='BBH',
     dataset_id='modelscope/bbh',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=3,
@@ -94,7 +92,7 @@ class BBHAdapter(DataAdapter):
             else:
                 file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
                 if os.path.exists(file_path):
-                    with open(file_path, 'r') as f:
+                    with open(file_path, 'r', encoding='utf-8') as f:
                         examples = json.load(f)['examples']
                         if subset_name in data_dict:
                             data_dict[subset_name].update({split_name: examples})
@@ -125,7 +123,7 @@ class BBHAdapter(DataAdapter):
             cot_prompts = ''
         full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -153,7 +151,9 @@ class BBHAdapter(DataAdapter):
         for sub_name, sub_data_dict in data_dict.items():
             few_shot_data = []
             if self.few_shot_num > 0:
-                with open(os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r') as f:
+                with open(
+                        os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
+                        encoding='utf-8') as f:
                     cot_prompt_str = f.read()
                 few_shot_data = [cot_prompt_str]
 
evalscope/benchmarks/benchmark.py CHANGED
@@ -1,12 +1,13 @@
 import copy
+from collections import OrderedDict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional
 
+from evalscope.constants import OutputType
+
 if TYPE_CHECKING:
     from evalscope.benchmarks import DataAdapter
 
-from evalscope.models import BaseModelAdapter
-
 BENCHMARK_MAPPINGS = {}
 
 
@@ -15,8 +16,9 @@ class BenchmarkMeta:
     name: str
     dataset_id: str
     data_adapter: 'DataAdapter'
-    model_adapter: BaseModelAdapter
-    subset_list: List[str] = field(default_factory=list)
+    model_adapter: Optional[str] = OutputType.GENERATION
+    output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
     metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
@@ -26,6 +28,7 @@ class BenchmarkMeta:
     system_prompt: Optional[str] = None
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
+    filters: Optional[OrderedDict] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
@@ -39,10 +42,7 @@ class BenchmarkMeta:
     def to_string_dict(self) -> dict:
         cur_dict = copy.deepcopy(self.__dict__)
         # cur_dict['data_adapter'] = self.data_adapter.__name__
-        # cur_dict['model_adapter'] = self.model_adapter.__name__
-        # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
-        del cur_dict['model_adapter']
         return cur_dict
 
     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -66,13 +66,13 @@ class Benchmark:
         return benchmark
 
     @classmethod
-    def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+    def register(cls, name: str, dataset_id: str, **kwargs):
 
        def register_wrapper(data_adapter):
            if name in BENCHMARK_MAPPINGS:
                raise Exception(f'Benchmark {name} already registered')
            BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
-                name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+                name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
            return data_adapter
 
        return register_wrapper
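Taken together, these benchmark.py hunks change the registration contract: Benchmark.register no longer takes a model-adapter class, model_adapter is now an OutputType string defaulting to OutputType.GENERATION, output_types lists the output modes a benchmark supports, and subset_list defaults to ['default']. A minimal registration sketch under the new signature follows; the benchmark name, dataset_id, and input field names are placeholders for illustration only, not part of this release.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import OutputType
from evalscope.metrics import exact_match
from evalscope.utils import ResponseParser


@Benchmark.register(
    name='demo_mcq',                      # placeholder benchmark name
    pretty_name='Demo-MCQ',
    dataset_id='my-org/demo-mcq',         # placeholder dataset id
    model_adapter=OutputType.MULTIPLE_CHOICE,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['default'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='Answer with A, B, C or D only:\n{query}',
)
class DemoMCQAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.choices = ['A', 'B', 'C', 'D']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # gen_prompt_data() wraps the prompt together with self.choices and self.system_prompt.
        return self.gen_prompt_data(self.prompt_template.format(query=input_d['question']))

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = None) -> str:
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)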
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -126,8 +125,10 @@ SUBJECT_MAPPING = {
 
 @Benchmark.register(
     name='ceval',
+    pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -137,8 +138,6 @@ SUBJECT_MAPPING = {
 )
 class CEVALAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -148,6 +147,7 @@ class CEVALAdapter(DataAdapter):
         super().__init__(**kwargs)
 
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -207,7 +207,7 @@ class CEVALAdapter(DataAdapter):
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -225,22 +225,17 @@ class CEVALAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _format_example(cls, input_d: dict, include_answer=True):
+    def _format_example(self, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
-        for choice in cls.choices:
+        for choice in self.choices:
             example += f'\n{choice}. {input_d[f"{choice}"]}'
 
         if include_answer:
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -4,9 +4,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -103,8 +102,10 @@ SUBJECT_MAPPING = {
 
 @Benchmark.register(
     name='cmmlu',
+    pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
     few_shot_num=5,
@@ -114,12 +115,11 @@ SUBJECT_MAPPING = {
 )
 class CMMLUAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+        self.choices = ['A', 'B', 'C', 'D']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -172,7 +172,7 @@ class CMMLUAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -190,26 +190,21 @@ class CMMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
 
         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
 
         example: str = input_d['Question']
-        for j in range(len(cls.choices)):
-            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+        for j in range(len(self.choices)):
+            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
 
         example += '\nAnswer:'
         if include_answer:
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -18,8 +18,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='competition_math',
+    pretty_name='MATH',
     dataset_id='modelscope/competition_math',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
     few_shot_num=4,
@@ -58,7 +58,7 @@ class CompetitionMathAdapter(DataAdapter):
             split_data = []
             for file_path in split_files:
                 if os.path.exists(file_path):
-                    with open(file_path, 'r') as f:
+                    with open(file_path, 'r', encoding='utf-8') as f:
                        split_data.append(json.load(f))
             data_dict[subset_name][split_name] = split_data
 
@@ -81,7 +81,7 @@ class CompetitionMathAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
         full_prompt = self.prompt_template.format(query=query)
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/data_adapter.py CHANGED
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, List, Optional, Union
 
+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -18,6 +19,7 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
@@ -48,6 +50,7 @@ class DataAdapter(ABC):
         """
         self.name = name
         self.dataset_id = dataset_id
+        self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -59,6 +62,15 @@ class DataAdapter(ABC):
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
+        self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)
 
     def load(self,
              dataset_name_or_path: str = None,
@@ -78,11 +90,15 @@
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=False, **kwargs)
         else:
-            data_dict = self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
-        if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-            raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=True, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
         return data_dict
 
     def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
@@ -91,8 +107,7 @@ class DataAdapter(ABC):
         datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
         split_as_subset: bool = kwargs.pop('split_as_subset', False)
         # Load dataset from remote
-        logger.info(
-            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
         data_dict = {}
         split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
@@ -133,21 +148,7 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-        from modelscope.msdatasets import MsDataset
-
-        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-            subsets: {subset_list}')
-        data_dict = {}
-        subset_list = subset_list or self.subset_list
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        for sub_name in subset_list:
-            data_dict[sub_name] = {}
-            # e.g. train: few-shot, test: target dataset to evaluate
-            for split in split_list:
-                dataset = MsDataset.load(
-                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
-                data_dict[sub_name].update({split: dataset})
-        return data_dict
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
@@ -285,6 +286,12 @@ class DataAdapter(ABC):
             kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
@@ -348,3 +355,6 @@ class DataAdapter(ABC):
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
+
+    def llm_match(self, *args, **kwargs):
+        pass
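gen_prompt_data is the helper the adapter hunks above switch to: it wraps a prompt (or list of prompts) into a PromptData record together with the adapter's choices and system prompt, then returns to_dict(). The __init_subclass__ hook likewise wraps every subclass's parse_pred_result in preprocess_decorator, so shared pre-processing runs before each adapter's own parsing. PromptData and preprocess_decorator come from the new evalscope/benchmarks/utils.py, which is not shown in this diff, so the stand-in below is an assumption that its to_dict() keeps the keys the old inline return dicts used ('data', 'multi_choices', 'system_prompt').

from dataclasses import asdict, dataclass
from typing import List, Optional


# Hedged stand-in for evalscope.benchmarks.utils.PromptData (the real module is
# new in 0.12.1 and not part of the hunks shown here); field names follow the
# keys the old inline return dicts used.
@dataclass
class PromptData:
    data: List[str]
    multi_choices: Optional[List[str]] = None
    system_prompt: Optional[str] = None

    def to_dict(self) -> dict:
        return asdict(self)


def gen_prompt_data(prompt, choices=None, system_prompt=None) -> dict:
    # Mirrors DataAdapter.gen_prompt_data from the hunk above.
    if not isinstance(prompt, list):
        prompt = [prompt]
    return PromptData(data=prompt, multi_choices=choices, system_prompt=system_prompt).to_dict()


print(gen_prompt_data('What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6', choices=['A', 'B', 'C', 'D']))
# -> {'data': ['What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6'], 'multi_choices': ['A', 'B', 'C', 'D'], 'system_prompt': None}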
evalscope/benchmarks/data_collection/data_collection_adapter.py CHANGED
@@ -15,7 +15,6 @@ logger = get_logger()
 @Benchmark.register(
     name='data_collection',
     dataset_id='',  # dataset_id need to be set
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -16,8 +15,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -27,11 +28,11 @@ logger = get_logger()
     query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(query=context)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,14 +104,10 @@ class GeneralMCQAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
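The general_mcq hunks complete the pattern already seen in the ARC, C-Eval, and CMMLU adapters: parse_pred_result now branches on the configured output type rather than on eval_type, so MULTIPLE_CHOICE (logits-based) output is passed through unchanged while GENERATION output is reduced to an option letter by the response parser. A condensed sketch of the shared logic follows; the sample strings and the expected outputs noted in comments are illustrative assumptions.

from evalscope.constants import OutputType
from evalscope.utils import ResponseParser

CHOICES = ['A', 'B', 'C', 'D']


def parse_pred(result: str, model_adapter: str) -> str:
    # MULTIPLE_CHOICE decoding already yields a bare option letter.
    if model_adapter == OutputType.MULTIPLE_CHOICE:
        return result
    # GENERATION output is free-form text; extract the chosen option.
    return ResponseParser.parse_first_option_with_choices(text=result, options=CHOICES)


print(parse_pred('B', OutputType.MULTIPLE_CHOICE))            # 'B'
print(parse_pred('The answer is B.', OutputType.GENERATION))  # expected to extract 'B'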