evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -33,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # yapf: disable
  # Model-related arguments
  parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+ parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
  parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

  # Template-related arguments
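For reference, a minimal sketch of how the two flags above behave together, using only the add_argument calls visible in this hunk (the model names are made up, and the custom ParseStrArgsAction for --model-args is omitted):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
    parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')

    # '--model-id' is exposed as args.model_id; per the help text it supplies the model name used in the report.
    args = parser.parse_args(['--model', 'qwen/Qwen2-7B-Instruct', '--model-id', 'qwen2-7b'])
    print(args.model, args.model_id)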
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -109,12 +109,10 @@ class ARCAdapter(DataAdapter):
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
  context: str = '\n'.join(few_shot_prompts)

- context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
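Across these adapters the prompt_template is no longer prepended to the few-shot context; it is passed through as a system_prompt instead. A hedged illustration of the dict ARCAdapter.gen_prompt now returns (the question text is made up):

    prompt = {
        'data': ['Question: Which gas do plants absorb from the air?\nA. oxygen\nB. carbon dioxide\n...\nAnswer:'],
        'multi_choices': ['A', 'B', 'C', 'D'],   # self.choices
        'system_prompt': None,                   # self.prompt_template; None unless the benchmark defines one
    }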
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,7 +63,7 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
@@ -122,7 +122,7 @@ class BBHAdapter(DataAdapter):
  cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
  full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."

- return {'data': [full_prompt]}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
evalscope/benchmarks/benchmark.py CHANGED
@@ -22,7 +22,7 @@ class BenchmarkMeta:
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
- prompt_template: str = ''
+ prompt_template: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.metrics import exact_match, weighted_mean
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
@@ -130,7 +130,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
@@ -145,9 +145,10 @@ class CEVALAdapter(DataAdapter):
  if few_shot_num > 5:
  logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
  kwargs['few_shot_num'] = 5
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -206,7 +207,7 @@ class CEVALAdapter(DataAdapter):
  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
  full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -236,84 +237,6 @@ class CEVALAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: The user-defined report name.
-
- Returns:
- {
- "name":"C-Eval",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(score=subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- category_list = sorted(category_list, key=lambda x: x['name'])
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'ceval',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _format_example(cls, input_d: dict, include_answer=True):
  example = '问题:' + input_d['question']
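The per-benchmark gen_report overrides above are dropped in favour of a category_map that the shared ReportGenerator consumes. A small, hedged illustration of what the category_map one-liner produces (the SUBJECT_MAPPING entries here are shortened stand-ins, not the real mapping in ceval_adapter.py; only the convention that the category name is the last element of each value comes from this diff):

    # Illustrative entries only: value format is (English name, Chinese name, category).
    SUBJECT_MAPPING = {
        'computer_network': ['Computer Network', '计算机网络', 'STEM'],
        'operating_system': ['Operating System', '操作系统', 'STEM'],
        'logic': ['Logic', '逻辑学', 'Humanities'],
    }

    category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
    # {'computer_network': 'STEM', 'operating_system': 'STEM', 'logic': 'Humanities'}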
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
@@ -106,7 +106,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
@@ -116,9 +116,10 @@ class CMMLUAdapter(DataAdapter):
  choices = ['A', 'B', 'C', 'D']

  def __init__(self, **kwargs):
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -173,7 +174,7 @@ class CMMLUAdapter(DataAdapter):

  full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -203,81 +204,6 @@ class CMMLUAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: the user-defined report name. Default: None
-
- Returns:
- {
- "name":"CMMLU",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'cmmlu',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -5,7 +5,7 @@ import json
  import os

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
@@ -20,11 +20,11 @@ logger = get_logger()
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
- prompt_template='',
+ prompt_template='Put the final answer in \\boxed{}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -77,7 +77,7 @@ class CompetitionMathAdapter(DataAdapter):
  use_fewshot = self.few_shot_num > 0
  full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)

- return {'data': [full_prompt], 'system_prompt': 'Put the final answer in \\boxed{}.'}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
evalscope/benchmarks/data_adapter.py CHANGED
@@ -2,10 +2,11 @@
  import os.path
  import random
  from abc import ABC, abstractmethod
- from typing import Any, Optional
+ from typing import Any, List, Optional

  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
- from evalscope.utils import normalize_score
+ from evalscope.metrics import Metric
+ from evalscope.report import Report, ReportGenerator
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -14,12 +15,13 @@ logger = get_logger()
  class DataAdapter(ABC):

  def __init__(self,
+ name: str,
  subset_list: list,
- metric_list: list,
+ metric_list: List[Metric],
  few_shot_num: Optional[int] = 0,
  train_split: Optional[str] = None,
  eval_split: Optional[str] = None,
- prompt_template: str = '',
+ prompt_template: Optional[str] = None,
  **kwargs):
  """
  Data Adapter for the benchmark. You need to implement the following methods:
@@ -28,6 +30,7 @@ class DataAdapter(ABC):
  - parse_pred_result
  - match
  Args:
+ name: str, the name of the benchmark.
  subset_list: list of subset names for the dataset.
  metric_list: list, the metric list to evaluate the model on specific benchmark.
  few_shot_num: int, number of few-shot examples. Default: 0
@@ -37,6 +40,7 @@ class DataAdapter(ABC):
  e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
  the form of A or B or C or D, do not output explanation:`
  """
+ self.name = name
  self.subset_list = subset_list
  self.metric_list = metric_list
  self.few_shot_num = few_shot_num
@@ -44,6 +48,7 @@ class DataAdapter(ABC):
  self.eval_split = eval_split
  self.prompt_template = prompt_template
  self.config_kwargs = kwargs
+ self.category_map = kwargs.get('category_map', {})

  def load(self,
  dataset_name_or_path: str,
@@ -142,59 +147,6 @@ class DataAdapter(ABC):

  return res_dict

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation results for all subsets.
-
- Args:
- subset_score_map: The subset-score map.
- e.g. {subset_name: (score, num)}
-
- report_name: str, the user-defined report name. Default: None
-
- Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
- Here is a format example for ARC-Challenge:
- {
- "name":"ARC-Challenge",
- "metric":"WeightedAverageAccuracy",
- "score": 0.3389,
- "category":[
- {
- "name":"DEFAULT",
- "score": 0.3389,
- "subset":[
- {
- "name":"ARC-Challenge",
- "score": 0.3389,
- "num": 100
- },
- ]
- }
- ],
- "total_num":100
- }
- """ # noqa: E501
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- weighted_avg_acc = normalize_score(score=weighted_avg_acc)
- cate_avg_list = [{
- 'name': subset_name,
- 'score': normalize_score(score=score),
- 'num': num
- } for subset_name, (score, num) in subset_score_map.items()]
-
- category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
- res_map = dict(
- name=report_name or 'DEFAULT',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
-
- return res_map
-
  def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

  if k > len(data_list):
@@ -204,28 +156,75 @@
  else:
  return data_list[:k]

- def compute_metric(self, review_res_list: list) -> Any:
+ def compute_metric(self, review_res_list: list) -> List[dict]:
  """
  Compute evaluation result by specific metrics.

  Args:
  review_res_list: list, the review result list, each item of which is match result for gold and pred.

- Attributes:
- DataAdapter.metric_func_map: metric_name -> metric_func mapping,
- e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
  Returns:
- Metric results.
+ Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
  """
  if len(self.metric_list) == 0:
  raise ValueError('No metric list found for the benchmark.')
- elif len(self.metric_list) == 1:
- # review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
- items = [(score, 1.0) for score in review_res_list]
- return self.metric_list[0]['object'](items)
- else:
- raise ValueError('Please implement the compute_metric method for multiple metrics.')
+
+ res_list = []
+ for metric in self.metric_list:
+ metric_name = metric.name
+ metric_func = metric.object
+ res_list.append({
+ 'metric_name': metric_name,
+ 'score': metric_func(review_res_list),
+ 'num': len(review_res_list)
+ })
+ return res_list
+
+ def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+ """
+ Generate report for the evaluation results for all subsets.
+
+ Args:
+ subset_score_map: The subset-score map.
+ e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+ report_name: str, the user-defined report name. Default: None
+
+ Returns: The evaluation report.
+
+ Here is a format example for gsm8k:
+ {
+ "name": "qwen2.5_gsm8k",
+ "metrics": [
+ {
+ "name": "AverageAccuracy",
+ "categories": [
+ {
+ "name": "default",
+ "subsets": [
+ {
+ "name": "main",
+ "score": 0.0,
+ "num": 2
+ }
+ ],
+ "num": 2,
+ "score": 0.0,
+ "macro_score": 0.0
+ }
+ ],
+ "num": 2,
+ "score": 0.0,
+ "macro_score": 0.0
+ }
+ ],
+ "dataset_name": "gsm8k",
+ "model_name": "qwen2.5"
+ }
+ """ # noqa: E501
+ kwargs['category_map'] = self.category_map
+ kwargs['metric_list'] = self.metric_list
+ return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)

  def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
  """
@@ -276,7 +275,7 @@ class DataAdapter(ABC):
  raise NotImplementedError

  @abstractmethod
- def match(self, gold: Any, pred: Any) -> float:
+ def match(self, gold: Any, pred: Any) -> Any:
  """
  Match the gold answer and the predicted answer.