evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +10 -6
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -108
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/perf/arguments.py +1 -0
  58. evalscope/perf/benchmark.py +1 -1
  59. evalscope/perf/main.py +3 -1
  60. evalscope/perf/plugin/api/openai_api.py +51 -47
  61. evalscope/perf/utils/local_server.py +1 -0
  62. evalscope/run.py +37 -66
  63. evalscope/run_arena.py +1 -1
  64. evalscope/utils/__init__.py +1 -1
  65. evalscope/utils/chat_service.py +4 -3
  66. evalscope/utils/io_utils.py +8 -0
  67. evalscope/utils/logger.py +4 -0
  68. evalscope/utils/model_utils.py +10 -0
  69. evalscope/utils/utils.py +3 -25
  70. evalscope/version.py +2 -2
  71. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
  72. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
  73. tests/cli/test_collection.py +53 -0
  74. tests/cli/test_run.py +43 -1
  75. tests/perf/test_perf.py +3 -3
  76. tests/rag/test_mteb.py +3 -2
  77. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  78. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  79. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  80. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  81. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  82. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  83. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  84. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  85. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  86. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  87. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  88. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  89. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  90. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  91. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  92. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  93. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  94. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  95. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  96. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  97. evalscope/models/api/__init__.py +0 -3
  98. evalscope/models/dummy_chat_model.py +0 -49
  99. evalscope/models/model_adapter.py +0 -525
  100. evalscope/models/openai_model.py +0 -103
  101. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  102. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  103. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  104. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py CHANGED
@@ -1,3 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
+ from evalscope.config import TaskConfig
+ from evalscope.run import run_task
  from .version import __release_datetime__, __version__
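The two new re-exports make the programmatic entry point importable straight from the package root. A minimal sketch of what that enables, under the assumption that TaskConfig accepts roughly the same fields the CLI exposes (the model/datasets/limit names below are assumptions, not shown in this hunk):

    # Hedged sketch: exact TaskConfig fields and the run_task signature are not confirmed by this diff.
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='qwen/Qwen2-0.5B-Instruct',  # hypothetical model id
        datasets=['arc'],                  # a benchmark name registered via Benchmark.register (see below)
        limit=10,                          # mirrors the --limit CLI argument
    )
    run_task(task_cfg)
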
evalscope/arguments.py CHANGED
@@ -1,6 +1,8 @@
  import argparse
  import json
 
+ from evalscope.constants import EvalBackend, EvalStage, EvalType
+
 
  class ParseStrArgsAction(argparse.Action):
 
@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501
 
      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
-     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
 
      # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
      parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
      # yapf: enable
 
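With choices= built from evalscope.constants, argparse now rejects an unsupported value at parse time instead of letting an unknown backend or stage fail later in the run. A self-contained sketch of that behavior using stand-in constants (the real string values live in EvalBackend and are not shown here):

    import argparse

    NATIVE, OPEN_COMPASS = 'Native', 'OpenCompass'  # stand-ins for EvalBackend members

    parser = argparse.ArgumentParser()
    parser.add_argument('--eval-backend', choices=[NATIVE, OPEN_COMPASS])

    print(parser.parse_args(['--eval-backend', NATIVE]))  # Namespace(eval_backend='Native')
    # parser.parse_args(['--eval-backend', 'typo'])       # exits with an "invalid choice" error
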
 
@@ -3,7 +3,6 @@ Code adapated from https://github.com/mlfoundations/open_clip/blob/main/src/trai
3
3
  Thanks to the authors of OpenCLIP
4
4
  """
5
5
 
6
- import logging
7
6
  import torch
8
7
  import torch.nn.functional as F
9
8
  from contextlib import suppress
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional
 
  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter
 
 
  class LLM:
evalscope/benchmarks/__init__.py CHANGED
@@ -1,4 +1,23 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import importlib
+ import os
 
- from evalscope.benchmarks.benchmark import Benchmark
+ from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
  from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Using glob to find all files matching the pattern
+ pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+ files = glob.glob(pattern, recursive=False)
+
+ for file_path in files:
+     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+         # Convert file path to a module path
+         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+         full_path = f'evalscope.benchmarks.{module_path}'
+         importlib.import_module(full_path)
+         # print(f'Importing {full_path}')
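This discovery loop replaces the per-benchmark imports deleted from the individual __init__.py files below: any *_adapter.py module sitting one directory under evalscope/benchmarks/ is imported when the package is imported, so its @Benchmark.register decorator runs without being referenced anywhere else. A worked trace for the ARC adapter, following the code above:

    # file_path     -> .../evalscope/benchmarks/arc/arc_adapter.py   (matched by the glob pattern)
    # relative_path -> 'arc/arc_adapter.py'                          (relative to evalscope/benchmarks)
    # module_path   -> 'arc.arc_adapter'                             (strip '.py', os.sep -> '.')
    # full_path     -> 'evalscope.benchmarks.arc.arc_adapter'
    # importlib.import_module(full_path) executes the module, which fires its @Benchmark.register decorator.
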
evalscope/benchmarks/arc/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -3,40 +3,35 @@
  import json
  import os
 
- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
 
  logger = get_logger()
 
- DATASET_ID = 'modelscope/ai2_arc'
-
- # task_list = ['ARC-Easy', 'ARC-Challenge']
- SUBSET_LIST = ['ARC-Challenge']
-
 
+ @Benchmark.register(
+     name='arc',
+     dataset_id='modelscope/ai2_arc',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['ARC-Easy', 'ARC-Challenge'],
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=0,
+     train_split='train',
+     eval_split='test',
+     prompt_template='',
+ )
  class ARCAdapter(DataAdapter):
 
      choices = ['A', 'B', 'C', 'D']
 
-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  prompt_template: str = '',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', None)
          if few_shot_num is None:
              # Use 0-shot by default
             logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
          if few_shot_num != 0:
              logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
 
-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             prompt_template=prompt_template,
-             **kwargs)
+         super().__init__(**kwargs)
 
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          """
@@ -132,7 +120,7 @@ class ARCAdapter(DataAdapter):
          # Get the gold choice
          return input_d.get('answerKey', '')
 
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.
 
@@ -144,12 +132,12 @@ class ARCAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
+         elif eval_type == EvalType.SERVICE:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices) # TODO: to be checked !
-         elif eval_type == 'custom':
+         elif eval_type == EvalType.CUSTOM:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices) # TODO: to be checked !
          else:
@@ -158,70 +146,6 @@ class ARCAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)
 
-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"ARC",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.4128,
-                     "subset":[
-                         {
-                             "name":"ARC-Easy",
-                             "score":0.5632
-                         },
-                         {
-                             "name":"ARC-Challenge",
-                             "score":0.3157
-                         }
-                     ]
-                 }
-             ],
-             "total_num":7800
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'arc',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
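The ARC rewrite is the template applied to every other adapter in this release: dataset id, subsets, metrics and splits move into a single @Benchmark.register call, and the compute_metric/gen_report plumbing is inherited from DataAdapter instead of being copied into each subclass. A hedged sketch of a custom benchmark under the new scheme; the benchmark name and dataset id are hypothetical, and the overridden methods other than parse_pred_result and match are assumptions about the DataAdapter interface (further overrides, such as a prompt-generation method, may also be required):

    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.constants import EvalType
    from evalscope.metrics import WeightedAverageAccuracy, exact_match
    from evalscope.models import MultiChoiceModelAdapter


    @Benchmark.register(
        name='my_qa',                          # hypothetical benchmark name
        dataset_id='my-org/my-qa-dataset',     # hypothetical ModelScope dataset id
        model_adapter=MultiChoiceModelAdapter,
        subset_list=['default'],
        metric_list=[WeightedAverageAccuracy],
        few_shot_num=0,
        train_split='train',
        eval_split='test',
        prompt_template='',
    )
    class MyQAAdapter(DataAdapter):

        choices = ['A', 'B', 'C', 'D']

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def get_gold_answer(self, input_d: dict) -> str:
            # Field name is dataset-specific; 'answerKey' mirrors the ARC adapter above.
            return input_d.get('answerKey', '')

        def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
            return result

        def match(self, gold: str, pred: str) -> float:
            return exact_match(gold=gold, pred=pred)

Note that the auto-discovery in evalscope/benchmarks/__init__.py only scans evalscope/benchmarks/*/*_adapter.py, so an adapter defined outside the package would have to be imported explicitly before the registry is queried.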
 
evalscope/benchmarks/bbh/__init__.py CHANGED
@@ -1,5 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -5,18 +5,17 @@ import os
  import random
  import re
 
- from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
 
  # flake8: noqa
 
  logger = get_logger()
 
- DATASET_ID = 'modelscope/bbh'
-
  # BBH multiple choice subset list
  MULTIPLE_CHOICE = 'multiple_choice'
  MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 
 
+ @Benchmark.register(
+     name='bbh',
+     dataset_id='modelscope/bbh',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=3,
+     train_split=None,
+     eval_split='test',
+     prompt_template='',
+ )
  class BBHAdapter(DataAdapter):
      """
      Adapter for BBH free-form and multiple-choices sub-tasks.
      """
 
-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
+     def __init__(self, **kwargs):
 
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for BBH.')
-             few_shot_num = 3
+         few_shot_num = kwargs.get('few_shot_num', 3)
 
          if few_shot_num != 3 and few_shot_num != 0:
              logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                           f'Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3
 
-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)
 
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -217,66 +207,6 @@ class BBHAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)
 
-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"BBH",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.3389,
-                     "subset":[
-                         {
-                             "name":"BBH",
-                             "score":0.3389
-                         },
-                     ]
-                 }
-             ],
-             "total_num":100
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'bbh',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """
evalscope/benchmarks/benchmark.py CHANGED
@@ -1,65 +1,76 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ import copy
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Dict, List, Optional
 
- import os.path
- from modelscope.msdatasets import MsDataset
- from typing import Optional
+ if TYPE_CHECKING:
+     from evalscope.benchmarks import DataAdapter
 
- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType
+ from evalscope.models import BaseModelAdapter
 
+ BENCHMARK_MAPPINGS = {}
 
- class Benchmark(object):
-     """
-     Wrapper for loading datasets from ModelScope or HuggingFace.
-     """
+
+ @dataclass
+ class BenchmarkMeta:
+     name: str
+     dataset_id: str
+     data_adapter: 'DataAdapter'
+     model_adapter: BaseModelAdapter
+     subset_list: List[str] = field(default_factory=list)
+     metric_list: List[dict] = field(default_factory=list)
+     few_shot_num: int = 0
+     few_shot_random: bool = False
+     train_split: Optional[str] = None
+     eval_split: Optional[str] = None
+     prompt_template: str = ''
+
+     def _update(self, args: dict):
+         if args.get('local_path'):
+             self.dataset_id = args['local_path']
+             del args['local_path']
+         self.__dict__.update(args)
+
+     def to_dict(self) -> dict:
+         return self.__dict__
+
+     def to_string_dict(self) -> dict:
+         cur_dict = copy.deepcopy(self.__dict__)
+         # cur_dict['data_adapter'] = self.data_adapter.__name__
+         # cur_dict['model_adapter'] = self.model_adapter.__name__
+         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
+         del cur_dict['data_adapter']
+         del cur_dict['model_adapter']
+         del cur_dict['metric_list']
+         return cur_dict
+
+     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
+         if config:
+             self._update(config)
+
+         data_adapter = self.data_adapter(**self.to_dict())
+         return data_adapter
+
+
+ class Benchmark:
 
      def __init__(self):
-         ...
-
-     @staticmethod
-     def load(dataset_name: str,
-              subset: str = None,
-              split: str = None,
-              token: str = None,
-              hub: str = 'ModelScope',
-              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-              **kwargs):
-         """
-         Load a dataset from ModelScope or HuggingFace.
-
-         Args:
-             dataset_name (str): The dataset id or path.
-                 If it is dataset id, should be in the format of `organization/name` for ModelScope and HuggingFace hub.
-                 If it is dataset path, should be the path on local disk.
-             subset (str):
-             split:
-             token: sdk token for ModelScope, optional, default None
-             hub: `ModelScope` or `HuggingFace`
-             work_dir: the work directory for caching, optional
-
-         Returns:
-             A dict.
-         """
-
-         dataset = MsDataset.load(
-             dataset_name=dataset_name,
-             subset_name=subset,
-             split=split,
-             token=token,
-             cache_dir=work_dir,
-             hub=hub,
-             **kwargs)
-
-         dataset.dataset_name = dataset_name.split('/')[-1]
-         dataset.subset_name = subset
-         # dataset.split = split
-         return dataset
-
-
- if __name__ == '__main__':
-
-     ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None)
-
-     n = 1
-     for i in ds:
-         print('>', n, ': ', i)
-         n += 1
+         pass
+
+     @classmethod
+     def get(cls, name: str) -> 'BenchmarkMeta':
+         if name not in BENCHMARK_MAPPINGS:
+             raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+         benchmark = BENCHMARK_MAPPINGS[name]
+         return benchmark
+
+     @classmethod
+     def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+
+         def register_wrapper(data_adapter):
+             if name in BENCHMARK_MAPPINGS:
+                 raise Exception(f'Benchmark {name} already registered')
+             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
+                 name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+             return data_adapter
+
+         return register_wrapper
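Benchmark is now a pure registry: register() stores one BenchmarkMeta per name and get() looks it up, raising on unknown names. A short usage sketch based only on the code above (the local_path override is the key special-cased in _update; the path itself is a placeholder):

    from evalscope.benchmarks import Benchmark  # importing the package auto-registers all adapters

    meta = Benchmark.get('arc')                 # BenchmarkMeta registered by arc_adapter.py
    print(meta.dataset_id, meta.subset_list)    # modelscope/ai2_arc ['ARC-Easy', 'ARC-Challenge']

    # Build the data adapter, optionally redirecting it to a local copy of the dataset.
    data_adapter = meta.get_data_adapter(config={'local_path': '/path/to/ai2_arc'})
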
evalscope/benchmarks/ceval/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -2,8 +2,11 @@
  import csv
  import os
 
- from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import WeightedAverageAccuracy
  from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
 
@@ -11,8 +14,6 @@ from evalscope.utils.logger import get_logger
 
  logger = get_logger()
 
- DATASET_ID = 'modelscope/ceval-exam'
-
  SUBSET_LIST = [
      'computer_network',
      'operating_system',
@@ -124,40 +125,28 @@ SUBJECT_MAPPING = {
  }
 
 
+ @Benchmark.register(
+     name='ceval',
+     dataset_id='modelscope/ceval-exam',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=0,
+     train_split='dev',
+     eval_split='val',
+ )
  class CEVALAdapter(DataAdapter):
 
      choices = ['A', 'B', 'C', 'D']
 
-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'dev',
-                  eval_split: str = 'val',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             # Use 5-shot by default
-             logger.info(f'Set 0-shot examples by default for C-Eval.')
-             few_shot_num = 0
+     def __init__(self, **kwargs):
 
+         few_shot_num = kwargs.get('few_shot_num', 0)
          if few_shot_num > 5:
              logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-             few_shot_num = 5
+             kwargs['few_shot_num'] = 5
 
-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)
 
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -223,7 +212,7 @@ class CEVALAdapter(DataAdapter):
          # Get the gold choice
          return input_d.get('answer', '')
 
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.
 
@@ -235,11 +224,11 @@ class CEVALAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
+         elif eval_type == EvalType.SERVICE:
              return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
-         elif eval_type == 'custom':
+         elif eval_type == EvalType.CUSTOM:
              return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -247,19 +236,6 @@ class CEVALAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)
 
-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
      def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
          """
          Generate report for the evaluation.