evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (79)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +7 -5
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -109
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/run.py +37 -66
  58. evalscope/run_arena.py +1 -1
  59. evalscope/utils/__init__.py +1 -1
  60. evalscope/utils/chat_service.py +4 -3
  61. evalscope/utils/io_utils.py +8 -0
  62. evalscope/utils/logger.py +4 -0
  63. evalscope/utils/model_utils.py +10 -0
  64. evalscope/utils/utils.py +3 -25
  65. evalscope/version.py +2 -2
  66. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
  67. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
  68. tests/cli/test_collection.py +53 -0
  69. tests/cli/test_run.py +43 -1
  70. tests/rag/test_mteb.py +3 -2
  71. evalscope/models/api/__init__.py +0 -3
  72. evalscope/models/dummy_chat_model.py +0 -49
  73. evalscope/models/model_adapter.py +0 -525
  74. evalscope/models/openai_model.py +0 -103
  75. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  76. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  77. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  78. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  79. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/data_adapter.py
@@ -4,8 +4,8 @@ import random
 from abc import ABC, abstractmethod
 from typing import Any, Optional
 
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.utils import normalize_score
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -22,6 +22,11 @@ class DataAdapter(ABC):
                 prompt_template: str = '',
                 **kwargs):
        """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
        Args:
            subset_list: list of subset names for the dataset.
            metric_list: list, the metric list to evaluate the model on specific benchmark.
@@ -55,33 +60,36 @@
 
        """
        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        subset_list = subset_list or self.subset_list
 
        # Try to load dataset from local disk
        if os.path.exists(dataset_name_or_path):
-            logger.info(
-                f'Loading dataset from local disk: > dataset_name: {dataset_name_or_path} > work_dir: {work_dir}')
+            logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
+                subsets: {subset_list}')
            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
        else:
+            from modelscope.msdatasets import MsDataset
+
            # Load dataset from remote
-            logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}')
+            logger.info(
+                f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
            data_dict = {}
            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
            if len(split_list) == 0:
                logger.error(f'Got empty split list: {split_list}')
 
-            subset_list = subset_list if subset_list is not None else self.subset_list
            for sub_name in subset_list:
                data_dict[sub_name] = {}
                # e.g. train: few-shot, test: target dataset to evaluate
                for split in split_list:
-                    dataset = Benchmark.load(
+                    dataset = MsDataset.load(
                        dataset_name=dataset_name_or_path,
-                        subset=sub_name,
+                        subset_name=sub_name,
                        split=split,
+                        cache_dir=work_dir,
                        hub=datasets_hub,
-                        work_dir=work_dir,
                        **kwargs)
 
                    data_dict[sub_name].update({split: dataset})
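Dataset loading no longer goes through the removed Benchmark.load wrapper; DataAdapter.load now calls ModelScope's MsDataset directly, with cache_dir replacing the old work_dir keyword. A minimal standalone sketch of the same call, assuming modelscope is installed; the dataset id, subset, split and cache directory below are illustrative values, not taken from this diff:

from modelscope.msdatasets import MsDataset

# Mirrors the arguments DataAdapter.load now forwards to MsDataset.load;
# all concrete values here are placeholders for illustration.
dataset = MsDataset.load(
    dataset_name='modelscope/gsm8k',   # illustrative dataset id
    subset_name='main',                # illustrative subset
    split='test',
    cache_dir='./outputs/data')        # replaces the old work_dir= argument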
@@ -132,30 +140,111 @@ class DataAdapter(ABC):
                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
                res_dict[sub_name].append(prompt_d)
 
-        rnd = random.Random()
-        rnd.seed(42)
-        for k, v in res_dict.items():
-            rnd.shuffle(v)
-
        return res_dict
 
-    @abstractmethod
-    def gen_prompt(self, *args, **kwargs) -> Any:
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: (score, num)}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
+
+        Here is a format example for ARC-Challenge:
+        {
+            "name":"ARC-Challenge",
+            "metric":"WeightedAverageAccuracy",
+            "score": 0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score": 0.3389,
+                    "subset":[
+                        {
+                            "name":"ARC-Challenge",
+                            "score": 0.3389,
+                            "num": 100
+                        },
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """  # noqa: E501
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score),
+            'num': num
+        } for subset_name, (score, num) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'DEFAULT',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> Any:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Attributes:
+            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
+                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
+
+        Returns:
+            Metric results.
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+        elif len(self.metric_list) == 1:
+            # review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+            items = [(score, 1.0) for score in review_res_list]
+            return self.metric_list[0]['object'](items)
+        else:
+            raise ValueError('Please implement the compute_metric method for multiple metrics.')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        """
        Generate model prompt from raw input, unify the prompt format for different datasets.
        The input format is compatible with OpenAI Chat Completions APIs.
-        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
 
        Args:
            input_d (Any): The raw input. Depending on the dataset.
+            subset_name (str): The subset name.
+            few_shot_list (list): The few-shot examples.
 
        Returns:
+            For class ChatGenerationModelAdapter, the output format is:
+                {'data': [full_prompt], 'system_prompt': (str, optional)},  -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class MultiChoiceModelAdapter, the output format is:
-                {'data': [full_prompt]},  -- full_prompt: str, the constructed prompt for each sample from dataset.
-
+                {'data': [full_prompt], 'multi_choices': self.choices}  -- full_prompt: str, the constructed prompt for each sample from dataset.
            For class ContinuationEvalModelAdapter, the output format is:
-                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
-        """
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}  -- ctx_continuation_pair_list: list, the context-continuation pair list.
+        """  # noqa: E501
        raise NotImplementedError
 
    @abstractmethod
@@ -172,7 +261,7 @@ class DataAdapter(ABC):
        raise NotImplementedError
 
    @abstractmethod
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any:
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
        """
        Parse the predicted result and extract proper answer.
 
@@ -187,77 +276,17 @@ class DataAdapter(ABC):
        raise NotImplementedError
 
    @abstractmethod
-    def match(self, gold: Any, pred: Any) -> Any:
+    def match(self, gold: Any, pred: Any) -> float:
        """
        Match the gold answer and the predicted answer.
 
        Args:
            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A'
+                e.g. 'A', extracted from get_gold_answer method.
            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B'
+                e.g. 'B', extracted from parse_pred_result method.
 
        Returns:
            The match result. Usually a score (float) for chat/multiple-choice-questions.
        """
        raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-        Here is a format example for ARC-Challenge:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score": 0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score": 0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score": 0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
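compute_metric, gen_report and get_fewshot_examples are now concrete default implementations on DataAdapter instead of per-benchmark copies. As a quick sanity check of the aggregation they perform, here is a self-contained sketch of the weighted-average logic; the subset names, scores and the rounding inside normalize_score are assumptions made only for illustration:

# Standalone reproduction of the base-class score aggregation (all values made up).
subset_score_map = {'subset-a': (0.30, 100), 'subset-b': (0.60, 50)}  # {subset_name: (score, num)}


def normalize_score(score: float) -> float:
    # Assumption: evalscope's normalize_score keeps a few decimal places.
    return round(score, 4)


total_num = sum(num for _, num in subset_score_map.values())                         # 150
weighted = sum(score * num for score, num in subset_score_map.values()) / total_num  # 60 / 150 = 0.4

report = {
    'name': 'DEFAULT',
    'metric': 'WeightedAverageAccuracy',
    'score': normalize_score(weighted),
    'category': [{
        'name': 'DEFAULT',
        'score': normalize_score(weighted),
        'subset': [{'name': name, 'score': normalize_score(s), 'num': n}
                   for name, (s, n) in subset_score_map.items()],
    }],
    'total_num': total_num,
}
assert report['score'] == 0.4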

evalscope/benchmarks/general_qa/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter
-from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -5,35 +5,32 @@ import os.path
 from collections import defaultdict
 from typing import Any, Optional
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
-from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh,
+                               weighted_mean)
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'general_qa'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='general_qa',
+    dataset_id='general_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageBLEU],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
 class GeneralQAAdapter(DataAdapter):
    # TODO: set few_shot_num
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
 
-        super().__init__(
-            subset_list=subset_list, metric_list=metric_list, train_split=train_split, eval_split=eval_split, **kwargs)
+        super().__init__(**kwargs)
 
    def load(self, dataset_name_or_path: str, subset_list: list = None, **kwargs) -> dict:
 

evalscope/benchmarks/gsm8k/__init__.py
@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -1,35 +1,33 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
+# flake8: noqa
 import math
 import os
 import re
 
-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import WeightedAverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
-# flake8: noqa
-
 logger = get_logger()
 
-DATASET_ID = 'modelscope/gsm8k'
-SUBSET_LIST = ['main']
-ANS_RE = re.compile(r'#### (\-?[0-9\.\,]+)')
-INVALID_ANS = '[invalid]'
-
 
+@Benchmark.register(
+    name='gsm8k',
+    dataset_id='modelscope/gsm8k',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['main'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=4,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class GSM8KAdapter(DataAdapter):
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
+    def __init__(self, **kwargs):
        """
        Data adapter for GSM8K dataset.
 
@@ -41,30 +39,13 @@ class GSM8KAdapter(DataAdapter):
            eval_split (str): The target eval split name. Default: 'test'
            **kwargs: ...
        """
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 4-shot examples by system for GSM8K.')
-            few_shot_num = 4
-
+        few_shot_num = kwargs.get('few_shot_num', 4)
        if few_shot_num != 4 and few_shot_num != 0:
            logger.error(f'GSM8K uses 4-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                         f'Use 4-shot by default.')
-            few_shot_num = 4
+            kwargs['few_shot_num'] = 4
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}
@@ -142,66 +123,6 @@ class GSM8KAdapter(DataAdapter):
 
        return number_equal(gold_ans=gold, pred_ans=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name. Default: None
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"GSM8K",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.5632,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.5632,
-                    "subset":[
-                        {
-                            "name":"main",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'gsm8k',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
    @classmethod
    def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
        if use_fewshot:
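The pattern above for general_qa and gsm8k (and for hellaswag below) replaces the old per-benchmark __init__.py exports: a benchmark now describes itself entirely through @Benchmark.register plus the four DataAdapter methods. A hypothetical new benchmark registered the same way might look as follows; the benchmark name, dataset id and input field names are placeholders, and the get_gold_answer signature is assumed from the base class rather than shown in this excerpt:

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                # placeholder benchmark name
    dataset_id='my_org/my_qa',   # placeholder dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # ChatGenerationModelAdapter expects {'data': [full_prompt], ...}
        return {'data': [input_d['question']]}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = None) -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)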

evalscope/benchmarks/hellaswag/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
-from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -3,9 +3,10 @@ import numpy as np
 import os
 import re
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -13,44 +14,30 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/hellaswag'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='hellaswag',
+    dataset_id='modelscope/hellaswag',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='validation',
+    prompt_template='',
+)
 class HellaSwagAdapter(DataAdapter):
 
    choices = ['0', '1', '2', '3']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for HellaSwag.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
        if few_shot_num != 0:
            logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}
@@ -106,7 +93,7 @@ class HellaSwagAdapter(DataAdapter):
        # Get the gold choice
        return input_d['label']
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the model output to get the answer. Could be the best choice index.
 
@@ -118,7 +105,7 @@ class HellaSwagAdapter(DataAdapter):
        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
-        if eval_type == 'checkpoint':
+        if eval_type == EvalType.CHECKPOINT:
            # answer: in the form of [-2.3, -4.5, ...], len of self.choices
            result = np.array(result)
            endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -126,9 +113,9 @@ class HellaSwagAdapter(DataAdapter):
            best_choice_idx = np.argmax(result / completion_len)
 
            return str(best_choice_idx)
-        elif eval_type == 'service':
+        elif eval_type == EvalType.SERVICE:
            return result  # TODO: to be supported !
-        elif eval_type == 'custom':
+        elif eval_type == EvalType.CUSTOM:
            return result  # TODO: to be supported !
        else:
            raise ValueError(f'Invalid eval_type: {eval_type}')
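The literal strings 'checkpoint', 'service' and 'custom' are replaced by constants on EvalType (imported from evalscope.constants in the hunk above). constants.py itself is not shown in this excerpt, so the following is only an inferred sketch of those members, based on the literals they replace:

# Inferred sketch only; the actual definition lives in evalscope/constants.py (+7 -28 in this release).
class EvalType:
    CHECKPOINT = 'checkpoint'  # evaluate a locally loaded model checkpoint
    SERVICE = 'service'        # evaluate a model behind an API service
    CUSTOM = 'custom'          # evaluate a user-supplied custom model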
@@ -136,66 +123,6 @@ class HellaSwagAdapter(DataAdapter):
    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=str(gold), pred=str(pred))
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"HellaSwag",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'hellaswag',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
    @classmethod
    def _preprocess(cls, text):
        text = text.strip()
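For reference, the checkpoint-mode branch kept above picks the ending with the highest length-normalized log-likelihood. A toy reproduction with made-up numbers (note how normalization can flip the choice relative to the raw scores):

import numpy as np

# Made-up summed log-likelihoods for the four candidate endings.
result = np.array([-12.0, -9.0, -15.0, -11.0])
# Made-up character lengths of the preprocessed endings, used for normalization.
completion_len = np.array([40.0, 30.0, 50.0, 44.0])

best_choice_idx = np.argmax(result / completion_len)
print(str(best_choice_idx))  # '3': ending 3 wins after normalization, although ending 1 has the best raw score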

evalscope/benchmarks/humaneval/__init__.py
@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa