evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py CHANGED
@@ -1,3 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ from evalscope.config import TaskConfig
+ from evalscope.run import run_task
  from .version import __release_datetime__, __version__
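With these two exports the Python API is reachable from the package root. A minimal usage sketch, assuming evalscope 0.10.0 is installed; the TaskConfig field names below (model, datasets, limit) mirror the CLI flags and are an assumption, not something this hunk shows:

# Hypothetical quick-start; adjust field names to the installed version.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',  # hypothetical model id
    datasets=['arc'],                  # benchmark names registered via Benchmark.register
    limit=5,                           # cap samples per subset while smoke-testing
)
run_task(task)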
evalscope/arguments.py CHANGED
@@ -1,6 +1,8 @@
  import argparse
  import json

+ from evalscope.constants import EvalBackend, EvalStage, EvalType
+

  class ParseStrArgsAction(argparse.Action):

@@ -31,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
      # yapf: disable
      # Model-related arguments
      parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+     parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
      parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

      # Template-related arguments
@@ -47,10 +50,13 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
-     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

      # Cache and working directory arguments
@@ -62,6 +68,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
      parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
      # yapf: enable

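The evaluation flags are now validated through argparse choices, and two new flags describe a remote API endpoint. A small self-contained sketch of the same pattern; the literal choice values are assumptions standing in for the EvalType/EvalStage constants:

import argparse

# Stand-in values; the real CLI takes these from evalscope.constants.
EVAL_TYPES = ['checkpoint', 'custom', 'service']
STAGES = ['all', 'infer', 'review']

parser = argparse.ArgumentParser()
parser.add_argument('--eval-type', type=str, choices=EVAL_TYPES)
parser.add_argument('--stage', type=str, default='all', choices=STAGES)
parser.add_argument('--api-key', type=str, default='EMPTY')
parser.add_argument('--api-url', type=str, default=None)

# Invalid values now fail at parse time instead of deep inside a run.
args = parser.parse_args(['--eval-type', 'service', '--api-url', 'http://127.0.0.1:8000/v1'])
print(args.eval_type, args.stage, args.api_url)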
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py CHANGED
@@ -3,7 +3,6 @@ Code adapated from https://github.com/mlfoundations/open_clip/blob/main/src/trai
  Thanks to the authors of OpenCLIP
  """

- import logging
  import torch
  import torch.nn.functional as F
  from contextlib import suppress
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional

  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter


  class LLM:
evalscope/benchmarks/__init__.py CHANGED
@@ -1,4 +1,23 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import importlib
+ import os

- from evalscope.benchmarks.benchmark import Benchmark
+ from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
  from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Using glob to find all files matching the pattern
+ pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+ files = glob.glob(pattern, recursive=False)
+
+ for file_path in files:
+     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+         # Convert file path to a module path
+         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+         full_path = f'evalscope.benchmarks.{module_path}'
+         importlib.import_module(full_path)
+         # print(f'Importing {full_path}')
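Importing evalscope.benchmarks is now enough to populate the benchmark registry: the glob loop imports every */*_adapter.py module, which runs each adapter's @Benchmark.register decorator (defined in benchmark.py further down). A hedged sketch of inspecting the result, assuming the 0.10.0 package is installed:

import evalscope.benchmarks  # noqa: F401  (imported for its registration side effect)
from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS

# Expected to include names such as 'arc', 'bbh', 'ceval', ... once the imports have run.
print(sorted(BENCHMARK_MAPPINGS))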
evalscope/benchmarks/arc/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -3,40 +3,35 @@
  import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/ai2_arc'
-
- # task_list = ['ARC-Easy', 'ARC-Challenge']
- SUBSET_LIST = ['ARC-Challenge']
-

+ @Benchmark.register(
+     name='arc',
+     dataset_id='modelscope/ai2_arc',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['ARC-Easy', 'ARC-Challenge'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=0,
+     train_split='train',
+     eval_split='test',
+     prompt_template='',
+ )
  class ARCAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  prompt_template: str = '',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', None)
          if few_shot_num is None:
              # Use 0-shot by default
              logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
          if few_shot_num != 0:
              logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             prompt_template=prompt_template,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          """
@@ -121,18 +109,16 @@
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
          context: str = '\n'.join(few_shot_prompts)

-         context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
          # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
          full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
          return input_d.get('answerKey', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +130,12 @@
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
+         elif eval_type == EvalType.SERVICE:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices)  # TODO: to be checked !
-         elif eval_type == 'custom':
+         elif eval_type == EvalType.CUSTOM:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices)  # TODO: to be checked !
          else:
@@ -158,70 +144,6 @@
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"ARC",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.4128,
-                     "subset":[
-                         {
-                             "name":"ARC-Easy",
-                             "score":0.5632
-                         },
-                         {
-                             "name":"ARC-Challenge",
-                             "score":0.3157
-                         }
-                     ]
-                 }
-             ],
-             "total_num":7800
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'arc',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

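The ARC adapter illustrates the new shape of a benchmark: dataset id, subsets, metrics and splits move from __init__ arguments into the @Benchmark.register decorator, and the prompt dict now carries the template as system_prompt. A minimal sketch of registering a custom multiple-choice benchmark the same way; the benchmark name, dataset id and the gen_prompt signature are assumptions for illustration:

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq',                       # hypothetical benchmark name
    dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyMCQAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Same return shape as ARCAdapter: prompt text, candidate choices, system prompt.
        return {'data': [input_d['question']], 'multi_choices': self.choices,
                'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)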
evalscope/benchmarks/bbh/__init__.py CHANGED
@@ -1,5 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -5,18 +5,17 @@ import os
  import random
  import re

- from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/bbh'
-
  # BBH multiple choice subset list
  MULTIPLE_CHOICE = 'multiple_choice'
  MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


+ @Benchmark.register(
+     name='bbh',
+     dataset_id='modelscope/bbh',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[AverageAccuracy],
+     few_shot_num=3,
+     train_split=None,
+     eval_split='test',
+     prompt_template='',
+ )
  class BBHAdapter(DataAdapter):
      """
      Adapter for BBH free-form and multiple-choices sub-tasks.
      """

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
+     def __init__(self, **kwargs):

-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for BBH.')
-             few_shot_num = 3
+         few_shot_num = kwargs.get('few_shot_num', 3)

          if few_shot_num != 3 and few_shot_num != 0:
              logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                           f'Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -132,7 +122,7 @@
          cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
          full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': self.prompt_template}

      def gen_prompts(self, data_dict: dict) -> dict:
          """
@@ -217,66 +207,6 @@
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"BBH",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.3389,
-                     "subset":[
-                         {
-                             "name":"BBH",
-                             "score":0.3389
-                         },
-                     ]
-                 }
-             ],
-             "total_num":100
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'bbh',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """
evalscope/benchmarks/benchmark.py CHANGED
@@ -1,65 +1,76 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ import copy
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Dict, List, Optional

- import os.path
- from modelscope.msdatasets import MsDataset
- from typing import Optional
+ if TYPE_CHECKING:
+     from evalscope.benchmarks import DataAdapter

- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType
+ from evalscope.models import BaseModelAdapter

+ BENCHMARK_MAPPINGS = {}

- class Benchmark(object):
-     """
-     Wrapper for loading datasets from ModelScope or HuggingFace.
-     """
+
+ @dataclass
+ class BenchmarkMeta:
+     name: str
+     dataset_id: str
+     data_adapter: 'DataAdapter'
+     model_adapter: BaseModelAdapter
+     subset_list: List[str] = field(default_factory=list)
+     metric_list: List[dict] = field(default_factory=list)
+     few_shot_num: int = 0
+     few_shot_random: bool = False
+     train_split: Optional[str] = None
+     eval_split: Optional[str] = None
+     prompt_template: Optional[str] = None
+
+     def _update(self, args: dict):
+         if args.get('local_path'):
+             self.dataset_id = args['local_path']
+             del args['local_path']
+         self.__dict__.update(args)
+
+     def to_dict(self) -> dict:
+         return self.__dict__
+
+     def to_string_dict(self) -> dict:
+         cur_dict = copy.deepcopy(self.__dict__)
+         # cur_dict['data_adapter'] = self.data_adapter.__name__
+         # cur_dict['model_adapter'] = self.model_adapter.__name__
+         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
+         del cur_dict['data_adapter']
+         del cur_dict['model_adapter']
+         del cur_dict['metric_list']
+         return cur_dict
+
+     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
+         if config:
+             self._update(config)
+
+         data_adapter = self.data_adapter(**self.to_dict())
+         return data_adapter
+
+
+ class Benchmark:

      def __init__(self):
-         ...
-
-     @staticmethod
-     def load(dataset_name: str,
-              subset: str = None,
-              split: str = None,
-              token: str = None,
-              hub: str = 'ModelScope',
-              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-              **kwargs):
-         """
-         Load a dataset from ModelScope or HuggingFace.
-
-         Args:
-             dataset_name (str): The dataset id or path.
-                 If it is dataset id, should be in the format of `organization/name` for ModelScope and HuggingFace hub.
-                 If it is dataset path, should be the path on local disk.
-             subset (str):
-             split:
-             token: sdk token for ModelScope, optional, default None
-             hub: `ModelScope` or `HuggingFace`
-             work_dir: the work directory for caching, optional
-
-         Returns:
-             A dict.
-         """
-
-         dataset = MsDataset.load(
-             dataset_name=dataset_name,
-             subset_name=subset,
-             split=split,
-             token=token,
-             cache_dir=work_dir,
-             hub=hub,
-             **kwargs)
-
-         dataset.dataset_name = dataset_name.split('/')[-1]
-         dataset.subset_name = subset
-         # dataset.split = split
-         return dataset
-
-
- if __name__ == '__main__':
-
-     ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None)
-
-     n = 1
-     for i in ds:
-         print('>', n, ': ', i)
-         n += 1
+         pass
+
+     @classmethod
+     def get(cls, name: str) -> 'BenchmarkMeta':
+         if name not in BENCHMARK_MAPPINGS:
+             raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+         benchmark = BENCHMARK_MAPPINGS[name]
+         return benchmark
+
+     @classmethod
+     def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+
+         def register_wrapper(data_adapter):
+             if name in BENCHMARK_MAPPINGS:
+                 raise Exception(f'Benchmark {name} already registered')
+             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
+                 name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+             return data_adapter
+
+         return register_wrapper
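On the consumer side, Benchmark.get looks up a registered name and BenchmarkMeta.get_data_adapter instantiates its DataAdapter, optionally overriding registered fields first. A short sketch built on the code above; 'arc' refers to the ARC registration earlier in this diff:

from evalscope.benchmarks import Benchmark

meta = Benchmark.get('arc')          # BenchmarkMeta registered by the ARC adapter
adapter = meta.get_data_adapter(config={
    'few_shot_num': 0,               # overrides are merged into the meta before instantiation
    'subset_list': ['ARC-Challenge'],
})
print(meta.dataset_id)               # modelscope/ai2_arc
print(type(adapter).__name__)        # ARCAdapter

Note that _update writes the overrides into the shared BenchmarkMeta via self.__dict__.update, so they persist for later lookups of the same name.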
evalscope/benchmarks/ceval/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa