evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/data_adapter.py

@@ -2,10 +2,11 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, List, Optional

-from evalscope.benchmarks import Benchmark
-from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.metrics import Metric
+from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -14,15 +15,22 @@ logger = get_logger()
 class DataAdapter(ABC):

     def __init__(self,
+                 name: str,
                  subset_list: list,
-                 metric_list: list,
+                 metric_list: List[Metric],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
-                 prompt_template: str = '',
+                 prompt_template: Optional[str] = None,
                  **kwargs):
         """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
         Args:
+            name: str, the name of the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0
@@ -32,6 +40,7 @@ class DataAdapter(ABC):
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                 the form of A or B or C or D, do not output explanation:`
         """
+        self.name = name
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -39,6 +48,7 @@ class DataAdapter(ABC):
         self.eval_split = eval_split
         self.prompt_template = prompt_template
         self.config_kwargs = kwargs
+        self.category_map = kwargs.get('category_map', {})

     def load(self,
              dataset_name_or_path: str,
@@ -55,33 +65,36 @@

         """
         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        subset_list = subset_list or self.subset_list

         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            logger.info(
-                f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}')
+            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+                subsets: {subset_list}')
             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
             if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
+            from modelscope.msdatasets import MsDataset
+
             # Load dataset from remote
-            logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}')
+            logger.info(
+                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
             data_dict = {}
             split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
             if len(split_list) == 0:
                 logger.error(f'Got empty split list: {split_list}')

-            subset_list = subset_list if subset_list is not None else self.subset_list
             for sub_name in subset_list:
                 data_dict[sub_name] = {}
                 # e.g. train: few-shot, test: target dataset to evaluate
                 for split in split_list:
-                    dataset = Benchmark.load(
+                    dataset = MsDataset.load(
                         dataset_name=dataset_name_or_path,
-                        subset=sub_name,
+                        subset_name=sub_name,
                         split=split,
+                        cache_dir=work_dir,
                         hub=datasets_hub,
-                        work_dir=work_dir,
                         **kwargs)

                     data_dict[sub_name].update({split: dataset})
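Note: the remote branch above now calls ModelScope's MsDataset directly instead of the old Benchmark.load wrapper. A minimal standalone sketch of the equivalent call, using the gsm8k registration that appears later in this diff; the cache directory value is only illustrative:

    # Rough equivalent of the new remote-loading path (values illustrative).
    from modelscope.msdatasets import MsDataset

    dataset = MsDataset.load(
        dataset_name='modelscope/gsm8k',  # dataset_id registered for the gsm8k adapter below
        subset_name='main',               # one entry of the adapter's subset_list
        split='test',                     # eval_split; train_split would be loaded for few-shot examples
        cache_dir='./cache',              # work_dir passed through by DataAdapter.load (illustrative path)
        hub='modelscope')                 # datasets_hub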
@@ -132,30 +145,105 @@
             prompt_d[AnswerKeys.RAW_INPUT] = sample_d
             res_dict[sub_name].append(prompt_d)

-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
         return res_dict

-    @abstractmethod
-    def gen_prompt(self, *args, **kwargs) -> Any:
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> List[dict]:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Returns:
+            Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+
+        res_list = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            metric_func = metric.object
+            res_list.append({
+                'metric_name': metric_name,
+                'score': metric_func(review_res_list),
+                'num': len(review_res_list)
+            })
+        return res_list
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report.
+
+        Here is a format example for gsm8k:
+        {
+            "name": "qwen2.5_gsm8k",
+            "metrics": [
+                {
+                    "name": "AverageAccuracy",
+                    "categories": [
+                        {
+                            "name": "default",
+                            "subsets": [
+                                {
+                                    "name": "main",
+                                    "score": 0.0,
+                                    "num": 2
+                                }
+                            ],
+                            "num": 2,
+                            "score": 0.0,
+                            "macro_score": 0.0
+                        }
+                    ],
+                    "num": 2,
+                    "score": 0.0,
+                    "macro_score": 0.0
+                }
+            ],
+            "dataset_name": "gsm8k",
+            "model_name": "qwen2.5"
+        }
+        """ # noqa: E501
+        kwargs['category_map'] = self.category_map
+        kwargs['metric_list'] = self.metric_list
+        return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
         The input format is compatible with OpenAI Chat Completions APIs.
-        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api

         Args:
             input_d (Any): The raw input. Depending on the dataset.
+            subset_name (str): The subset name.
+            few_shot_list (list): The few-shot examples.

         Returns:
+            For class ChatGenerationModelAdapter, the output format is:
+                {'data': [full_prompt], 'system_prompt': (str, optional)}, -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt]}, -- full_prompt: str, the constructed prompt for each sample from dataset.
-
+                {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
             For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
-        """
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
+        """ # noqa: E501
         raise NotImplementedError

     @abstractmethod
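Note: compute_metric now lives in the base class and expects metric objects exposing .name and .object (a callable over the per-sample scores), imported from evalscope.metrics above. A self-contained sketch with a stand-in Metric, since the real definition (evalscope/metrics/named_metrics.py in the file list) is not shown in this diff:

    from collections import namedtuple

    # Stand-in for evalscope.metrics.Metric: only the two attributes used above.
    Metric = namedtuple('Metric', ['name', 'object'])

    def mean(scores):
        return sum(scores) / len(scores) if scores else 0.0

    AverageAccuracy = Metric(name='AverageAccuracy', object=mean)

    review_res_list = [1, 0, 1, 1]  # per-sample match() results
    result = [{'metric_name': m.name, 'score': m.object(review_res_list), 'num': len(review_res_list)}
              for m in [AverageAccuracy]]
    # result == [{'metric_name': 'AverageAccuracy', 'score': 0.75, 'num': 4}]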
@@ -172,7 +260,7 @@ class DataAdapter(ABC):
         raise NotImplementedError

     @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any:
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.

@@ -193,71 +281,11 @@ class DataAdapter(ABC):

         Args:
             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A'
+                e.g. 'A', extracted from get_gold_answer method.
             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B'
+                e.g. 'B', extracted from parse_pred_result method.

         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
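Note: the net effect for adapter authors is that the per-adapter gen_report/compute_metric boilerplate removed above is gone, and reporting is delegated to evalscope.report.ReportGenerator. A sketch of the subset_score_map shape it now consumes, taken from the docstring added above (the call itself is commented out since it needs a concrete adapter instance):

    # New shape: each subset maps to a list of per-metric result dicts,
    # replacing the old {subset_name: (score, num)} tuples.
    subset_score_map = {
        'main': [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}],
    }
    # report = adapter.gen_report(subset_score_map, report_name='qwen2.5_gsm8k')
    # -> an evalscope.report.Report nesting metrics -> categories -> subsets,
    #    as in the gsm8k example embedded in the docstring above.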
evalscope/benchmarks/general_qa/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,39 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
-import json
 import os.path
 from collections import defaultdict
-from typing import Any, Optional
+from typing import List

-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
-from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()

-DATASET_ID = 'general_qa'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='general_qa',
+    dataset_id='general_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageBLEU],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
 class GeneralQAAdapter(DataAdapter):
     # TODO: set few_shot_num

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+    def __init__(self, **kwargs):

-        super().__init__(
-            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+        super().__init__(**kwargs)

     def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:

@@ -71,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):

         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt]}
+        return {'data': [prompt], 'system_prompt': self.prompt_template}

     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -95,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
         """
         return result

-    def match(self, gold: str, pred: str) -> float:
+    def match(self, gold: str, pred: str) -> dict:
         """
         Args:
             gold: str
             pred: str

         Returns:
-            bleu_score: float
+            bleu_score: dict

         """
         res = dict()
@@ -110,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
         bleu_dict = bleu_ngram_one_sample(pred, gold)
         res.update(rouge_dict)
         res.update(bleu_dict)
-        # return bleu(item)
         return res

-    def compute_metric(self, review_res_list: list) -> float:
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples

@@ -121,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
             review_res_list: [score1, score2, ...]

         Returns:
-            avg_res: float
+            avg_res: List[dict]

         """
         items = defaultdict(list)
         for scores in review_res_list:
             for k, v in scores.items():
-                items[k].append((v, 1.0))
+                items[k].append(v)
         # items = [(score, 1.0) for score in review_res_list]
-        res = {k: weighted_mean(v) for k, v in items.items()}
-        # return weighted_mean(items)
-        return res
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Args:
-            subset_score_map: {subset_name: (score_dict, num), ...}
-            report_name: str, the user-defined report name.
-
-        Returns:
-        {
-            "name":"GeneralQA",
-            "metric":"WeightedAverageBLEU",
-            "score":0.399,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.399,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.399
-                        },
-                    ]
-                }
-            ],
-            "total_num":10
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': score_dict
-        } for subset_name, (score_dict, _) in subset_score_map.items()]
-        total_avg_list = defaultdict(float)
-        for score_dict, num in subset_score_map.values():
-            for metric, score in score_dict.items():
-                total_avg_list[metric] += score * num / total_num
-
-        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'general_qa',
-            metric=self.metric_list[0]['name'],
-            score=total_avg_list,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
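Note: the old __init__-based wiring (DATASET_ID, SUBSET_LIST, the DataAdapterClass/ModelAdapterClass aliases) is replaced by the @Benchmark.register decorator. A hedged sketch of registering a new benchmark under this pattern; the benchmark name, dataset id, and sample-data field names are hypothetical, and only keyword arguments that appear in this diff are used:

    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.constants import EvalType
    from evalscope.metrics import AverageBLEU
    from evalscope.models import ChatGenerationModelAdapter


    @Benchmark.register(
        name='my_qa',                 # hypothetical benchmark name
        dataset_id='my_org/my_qa',    # hypothetical dataset id
        model_adapter=ChatGenerationModelAdapter,
        subset_list=['default'],
        metric_list=[AverageBLEU],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
    )
    class MyQAAdapter(DataAdapter):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
            # 'question' is an assumed field name for the sample data
            return {'data': [input_d['question']], 'system_prompt': self.prompt_template}

        def get_gold_answer(self, input_d):
            return input_d['answer']  # assumed field name

        def parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
            return result

        def match(self, gold, pred):
            return float(gold.strip() == pred.strip())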
evalscope/benchmarks/gsm8k/__init__.py

@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -1,70 +1,51 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
+# flake8: noqa
 import math
 import os
 import re

-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

-# flake8: noqa
-
 logger = get_logger()

-DATASET_ID = 'modelscope/gsm8k'
-SUBSET_LIST = ['main']
-ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)')
-INVALID_ANS = '[invalid]'
-

+@Benchmark.register(
+    name='gsm8k',
+    dataset_id='modelscope/gsm8k',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['main'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=4,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class GSM8KAdapter(DataAdapter):

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
+    def __init__(self, **kwargs):
         """
         Data adapter for GSM8K dataset.

         Args:
             subset_list (list): Subset list for the dataset. Default: ['main']
-            metric_list (list): Metric list for the dataset. Default: [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+            metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
             few_shot_num (int): Number of few-shot examples. Default: 4
             train_split (str): Train split name. Default: 'train'
             eval_split (str): The target eval split name. Default: 'test'
             **kwargs: ...
         """
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 4-shot examples by system for GSM8K.')
-            few_shot_num = 4
-
+        few_shot_num = kwargs.get('few_shot_num', 4)
         if few_shot_num != 4 and few_shot_num != 0:
             logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                          f'Use 4-shot by default.')
-            few_shot_num = 4
+            kwargs['few_shot_num'] = 4

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -94,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0

         full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
-        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt

-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}

     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
@@ -142,66 +122,6 @@ class GSM8KAdapter(DataAdapter):

         return number_equal(gold_ans=gold, pred_ans=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name. Default: None
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"GSM8K",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.5632,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.5632,
-                    "subset":[
-                        {
-                            "name":"main",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'gsm8k',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
         if use_fewshot:
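Note: one behavioural change here is that the prompt template is no longer prepended to the question text; gen_prompt now returns it under a separate 'system_prompt' key (presumably for the chat adapter to place in the system role). Illustrative before/after of the returned dict; the prompt content is elided:

    prompt_template = ''  # gsm8k registers an empty template above
    full_prompt = 'Question: ...\nAnswer:'  # built by _generate_prompt (content elided)

    old_style = {'data': [f'{prompt_template}\n{full_prompt}' if prompt_template else full_prompt]}
    new_style = {'data': [full_prompt], 'system_prompt': prompt_template}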
evalscope/benchmarks/hellaswag/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa