evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of evalscope has been flagged as a potentially problematic release.

Files changed (79)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +7 -5
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -109
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/run.py +37 -66
  58. evalscope/run_arena.py +1 -1
  59. evalscope/utils/__init__.py +1 -1
  60. evalscope/utils/chat_service.py +4 -3
  61. evalscope/utils/io_utils.py +8 -0
  62. evalscope/utils/logger.py +4 -0
  63. evalscope/utils/model_utils.py +10 -0
  64. evalscope/utils/utils.py +3 -25
  65. evalscope/version.py +2 -2
  66. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
  67. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
  68. tests/cli/test_collection.py +53 -0
  69. tests/cli/test_run.py +43 -1
  70. tests/rag/test_mteb.py +3 -2
  71. evalscope/models/api/__init__.py +0 -3
  72. evalscope/models/dummy_chat_model.py +0 -49
  73. evalscope/models/model_adapter.py +0 -525
  74. evalscope/models/openai_model.py +0 -103
  75. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  76. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  77. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  78. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  79. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -1,38 +1,35 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import json
- import os
  import re
- from tqdm import tqdm
  from typing import List

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import weighted_mean
- from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics import Pass1
+ from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

  logger = get_logger()

- DATASET_ID = 'modelscope/humaneval'
- SUBSET_LIST = ['openai_humaneval']
-
  # Example:
  # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


+ @Benchmark.register(
+     name='humaneval',
+     dataset_id='modelscope/humaneval',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['openai_humaneval'],
+     metric_list=[Pass1],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template='Complete the following python code:\n',
+ )
  class HumanevalAdapter(DataAdapter):
      """
      A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
      """

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'test',
-                  prompt_template: str = 'Complete the following python code:\n',
-                  **kwargs):
+     def __init__(self, **kwargs):
          try:
              from human_eval.data import stream_jsonl, write_jsonl
              from human_eval.evaluation import check_correctness
@@ -41,29 +38,15 @@ class HumanevalAdapter(DataAdapter):
                  'https://github.com/openai/human-eval/tree/master#installation , '
                  'Note that you need to enable the execution code in the human_eval/execution.py first.')

-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
          self.k = [1]
          self.num_workers = 4
          self.timeout = 4.0
-         self.outputs = kwargs.get('outputs', None)

          self.read_problems_func = stream_jsonl
          self.write_jsonl_func = write_jsonl
          self.eval_func = check_correctness

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             prompt_template=prompt_template,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -87,77 +70,6 @@ class HumanevalAdapter(DataAdapter):

          return {'data': [full_prompt]}

-     def get_answers(self, infer_cfg: dict) -> List[dict]:
-         ans_list: list = []
-         system_prompt: str = ''
-         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-             prompt: str = system_prompt + data_d['prompt']
-             inputs: dict = {'data': [prompt]}
-
-             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-             pred_ans: str = pred_res['choices'][0]['message']['content']
-             pred_ans = self._postprocess(pred_ans)
-
-             ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-         return ans_list
-
-     def eval(self, infer_cfg: dict, **kwargs):
-
-         # predict
-         ans_list: list = self.get_answers(infer_cfg)
-         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
-
-         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-         logger.info('** Dump predictions successfully.')
-
-         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-         results = self.eval_func(
-             sample_file=ans_out_file,
-             k=self.k,
-             n_workers=self.num_workers,
-             timeout=self.timeout,
-             problem_file=self.problem_file)
-
-         # output: report
-         report_map: dict = self.gen_report(results=results)
-         report_dir: str = self.outputs_structure.reports_dir
-         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-         with open(report_file, 'w') as f:
-             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_file} \n')
-         logger.info('** Dump report \n')
-
-         try:
-             # Make table
-             report_table: str = gen_table([report_dir])
-             logger.info(f'** Report table: \n {report_table} \n')
-         except Exception:
-             logger.error('Failed to generate report table.')
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'HumanEval',
-             metric='pass@1',
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _postprocess(cls, text: str) -> str:
          if '```' in text:
@@ -182,19 +94,6 @@ class HumanevalAdapter(DataAdapter):
              text = '\n'.join([' ' + line for line in text.split('\n')])
          return text

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
          return self._postprocess(result)
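The change above is the pattern repeated across the adapter diffs in this release: per-benchmark constants (DATASET_ID, SUBSET_LIST) and long __init__ signatures are replaced by a Benchmark.register decorator that declares the dataset id, model adapter, subsets, metrics, and split defaults in one place. A minimal sketch of how a custom adapter might be declared under this pattern follows; the benchmark name, dataset id, and class name are hypothetical, and the exact way Benchmark.register injects these defaults is assumed from the hunks rather than verified against evalscope/benchmarks/benchmark.py.

    # Sketch only: names marked "hypothetical" are not part of evalscope.
    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.metrics import WeightedAverageAccuracy
    from evalscope.models import MultiChoiceModelAdapter


    @Benchmark.register(
        name='my_mcq',                          # hypothetical benchmark name
        dataset_id='my-org/my-mcq-dataset',     # hypothetical dataset id
        model_adapter=MultiChoiceModelAdapter,  # model adapter is declared here, not in __init__.py
        subset_list=['default'],
        metric_list=[WeightedAverageAccuracy],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='',
    )
    class MyMCQAdapter(DataAdapter):

        def __init__(self, **kwargs):
            # Defaults come from the decorator; __init__ only forwards
            # (possibly user-overridden) keyword arguments to DataAdapter.
            super().__init__(**kwargs)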
 
evalscope/benchmarks/mmlu/__init__.py
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
- from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter
- from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -2,8 +2,10 @@
  import csv
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger

@@ -134,40 +136,29 @@ SUBJECT_MAPPING = {
  }


+ @Benchmark.register(
+     name='mmlu',
+     dataset_id='modelscope/mmlu',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=5,
+     train_split='train',
+     eval_split='test',
+     prompt_template='',
+ )
  class MMLUAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             # Use 5-shot by default
-             logger.info(f'Set 5-shot examples by system for MMLU.')
-             few_shot_num = 5
+     def __init__(self, **kwargs):

+         few_shot_num = kwargs.get('few_shot_num', 5)
          if few_shot_num > 5:
              logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
-             few_shot_num = 5
+             kwargs['few_shot_num'] = 5

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -225,7 +216,7 @@ class MMLUAdapter(DataAdapter):
              'target': 'A'}

          Returns:
-             {'data': [(context, continuation), ...]}
+             {'data': [full_prompt], 'multi_choices': self.choices}

          """
          prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
@@ -244,7 +235,7 @@ class MMLUAdapter(DataAdapter):
          # Get the gold choice
          return input_d.get('target', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -256,31 +247,18 @@ class MMLUAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-         elif eval_type == 'custom':
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+         elif eval_type == EvalType.SERVICE:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
+         elif eval_type == EvalType.CUSTOM:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
      def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
          """
          Generate report for the evaluation.
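Besides the registration change, this hunk replaces the string literals 'checkpoint' / 'service' / 'custom' with the new EvalType constants and routes service/custom output through ResponseParser. Below is a minimal standalone sketch of that dispatch using only names that appear in the hunk; the free function is hypothetical (in evalscope this logic lives in MMLUAdapter.parse_pred_result), and the behavior of ResponseParser.parse_first_option_with_choices (pulling the first matching choice letter out of free-form text) is inferred from its name and usage here.

    # Sketch only: parse_choice() is a hypothetical helper, not an evalscope API.
    from evalscope.constants import EvalType
    from evalscope.utils import ResponseParser

    CHOICES = ['A', 'B', 'C', 'D']


    def parse_choice(result: str, eval_type: str = EvalType.CHECKPOINT) -> str:
        """Mirrors the dispatch in MMLUAdapter.parse_pred_result above."""
        if eval_type == EvalType.CHECKPOINT:
            # Checkpoint evaluation returns the model output unchanged.
            return result
        elif eval_type in (EvalType.SERVICE, EvalType.CUSTOM):
            # API / custom output is free-form text; extract the first choice letter (assumed).
            return ResponseParser.parse_first_option_with_choices(result, CHOICES)
        raise ValueError(f'Invalid eval_type: {eval_type}')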
evalscope/benchmarks/mmlu_pro/__init__.py: file without content changes.
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py (new file)
@@ -0,0 +1,110 @@
+ from collections import defaultdict
+ from typing import Any, Dict
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import AnswerKeys, EvalType
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.utils import ResponseParser
+
+
+ @Benchmark.register(
+     name='mmlu_pro',
+     dataset_id='modelscope/mmlu-pro',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=5,
+     train_split='validation',
+     eval_split='test',
+     prompt_template=
+     'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.',  # noqa: E501
+ )
+ class MMLUProAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+         self.categories = [
+             'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+             'philosophy', 'economics', 'other', 'psychology', 'history'
+         ]
+
+     def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
+         """
+         Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
+         Return a dict with category as key and list of prompts as value.
+         """
+
+         data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
+         fewshot_prompts = self.get_fewshot_examples(data_dict)
+
+         # Use the category as key to group the prompts
+         res_dict = defaultdict(list)
+         # generate prompts for each test sample
+         for entry in data_dict[self.eval_split]:
+             prefix = fewshot_prompts[entry['category']]
+             query = prefix + 'Q: ' + entry['question'] + '\n' + \
+                 self.__form_options(entry['options']) + '\n'
+
+             prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
+
+             res_dict[entry['category']].append(prompt_d)
+         return res_dict
+
+     def get_fewshot_examples(self, data_dict: dict):
+         # load 5-shot prompts for each category
+         prompts = {c: '' for c in self.categories}
+         for d in data_dict[self.train_split]:
+             prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
+                 self.__form_options(d['options']) + '\n' + \
+                 d['cot_content'] + '\n\n'
+         return prompts
+
+     def __form_options(self, options: list):
+         option_str = 'Options are:\n'
+         for opt, choice in zip(options, self.choices):
+             option_str += f'({choice}): {opt}' + '\n'
+         return option_str
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+
+         Args:
+             input_d: input raw data. Depending on the dataset.
+
+         Returns:
+             The parsed input. e.g. gold answer ... Depending on the dataset.
+         """
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+
+         Args:
+             result: Predicted answer from the model. Usually a string for chat.
+             raw_input_d: The raw input. Depending on the dataset.
+             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+         Returns:
+             The parsed answer. Depending on the dataset. Usually a string for chat.
+         """
+         return ResponseParser.parse_first_option(result)
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+
+         Args:
+             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                         e.g. 'A', extracted from get_gold_answer method.
+             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                         e.g. 'B', extracted from parse_pred_result method.
+
+         Returns:
+             The match result. Usually a score (float) for chat/multiple-choice-questions.
+         """
+         return exact_match(gold=gold, pred=pred)
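For reference, here is a small standalone reproduction of the prompt layout that MMLUProAdapter builds in gen_prompts / __form_options above. The sample question is invented; the formatting code mirrors the adapter's private helper, so the expected "Q: ... / Options are: / (A): ..." shape is visible without loading the dataset.

    # Reproduction of MMLUProAdapter.__form_options with an invented sample entry.
    CHOICES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']


    def form_options(options: list) -> str:
        option_str = 'Options are:\n'
        for opt, choice in zip(options, CHOICES):
            option_str += f'({choice}): {opt}' + '\n'
        return option_str


    entry = {  # invented sample in the question/options/category schema used above
        'question': "Which gas makes up most of Earth's atmosphere?",
        'options': ['Oxygen', 'Nitrogen', 'Carbon dioxide', 'Argon'],
        'category': 'other',
    }
    query = 'Q: ' + entry['question'] + '\n' + form_options(entry['options']) + '\n'
    print(query)
    # Q: Which gas makes up most of Earth's atmosphere?
    # Options are:
    # (A): Oxygen
    # (B): Nitrogen
    # (C): Carbon dioxide
    # (D): Argon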
evalscope/benchmarks/race/__init__.py
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
- from evalscope.benchmarks.race.race_adapter import RACEAdapter
- from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/race/race_adapter.py
@@ -1,11 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

@@ -13,46 +14,30 @@ from evalscope.utils.logger import get_logger

  logger = get_logger()

- DATASET_ID = 'modelscope/race'
-
- SUBSET_LIST = ['high', 'middle']
-
  SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}


+ @Benchmark.register(
+     name='race',
+     dataset_id='modelscope/race',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['high', 'middle'],
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=3,
+     train_split='train',
+     eval_split='test',
+ )
  class RACEAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for RACE.')
-             few_shot_num = 3
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', 3)
          if few_shot_num > 3:
              logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -105,7 +90,7 @@ class RACEAdapter(DataAdapter):
          # Get the gold choice
          return input_d.get('answer', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -117,31 +102,18 @@ class RACEAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
-             return result
-         elif eval_type == 'service':  # TODO: to be implemented
-             return result
-         elif eval_type == 'custom':  # TODO: to be implemented
+         if eval_type == EvalType.CHECKPOINT:
              return result
+         elif eval_type == EvalType.SERVICE:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+         elif eval_type == EvalType.CUSTOM:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
          else:
              raise ValueError(f'Unknown eval_type: {eval_type}')

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
      def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
          """
          Generate report for the evaluation.
evalscope/benchmarks/trivia_qa/__init__.py
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa