evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py
CHANGED
evalscope/arguments.py
CHANGED
@@ -1,6 +1,8 @@
 import argparse
 import json

+from evalscope.constants import EvalBackend, EvalStage, EvalType
+

 class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501

     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
-    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.'
+    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

     # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
     # yapf: enable


evalscope/backend/rag_eval/utils/llm.py
CHANGED
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional

 from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter


 class LLM:
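For orientation only (this block is not part of the package diff): the new choices= constraints mean argparse itself rejects unsupported values at parse time, and the --api-key/--api-url flags feed the remote-API (server) evaluation path. A minimal standalone sketch of the same pattern; the string values stand in for the EvalType constants and are assumptions, not taken from this diff:

import argparse

# Hypothetical stand-ins for the evalscope.constants.EvalType values (assumed).
EVAL_TYPES = ['checkpoint', 'custom', 'service']

parser = argparse.ArgumentParser()
parser.add_argument('--eval-type', type=str, choices=EVAL_TYPES, help='The type for evaluating.')
parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')

args = parser.parse_args(['--eval-type', 'service', '--api-url', 'http://localhost:8000/v1'])
print(args.eval_type, args.api_url)

# An unsupported value now fails immediately instead of deep inside the run:
# parser.parse_args(['--eval-type', 'bogus'])  -> SystemExit with "invalid choice"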
evalscope/benchmarks/__init__.py
CHANGED
@@ -1,4 +1,23 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import importlib
+import os

-from evalscope.benchmarks.benchmark import Benchmark
+from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
 from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+# Using glob to find all files matching the pattern
+pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+files = glob.glob(pattern, recursive=False)
+
+for file_path in files:
+    if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+        # Convert file path to a module path
+        relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+        module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+        full_path = f'evalscope.benchmarks.{module_path}'
+        importlib.import_module(full_path)
+        # print(f'Importing {full_path}')
evalscope/benchmarks/arc/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
-from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
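For orientation only (not part of the package diff): the rewritten benchmarks __init__ above imports every *_adapter.py module at package-import time so that each adapter's @Benchmark.register(...) decorator runs automatically, which is why the per-benchmark __init__ re-exports (DataAdapterClass/ModelAdapterClass) could be deleted. A standalone sketch of the same discovery idiom over a hypothetical plugins/ directory:

import glob
import importlib
import os

# Hypothetical layout: plugins/<name>/<name>_adapter.py, mirroring evalscope/benchmarks/*/*_adapter.py.
PACKAGE_DIR = os.path.join(os.path.dirname(__file__), 'plugins')

for file_path in glob.glob(os.path.join(PACKAGE_DIR, '*', '*_adapter.py')):
    if os.path.basename(file_path).startswith('_'):
        continue  # skip private modules, as the evalscope loop does
    relative_path = os.path.relpath(file_path, PACKAGE_DIR)
    module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py', path -> dotted module
    importlib.import_module(f'plugins.{module_path}')  # importing triggers any registration decorators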
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -3,40 +3,35 @@
 import json
 import os

-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa

 logger = get_logger()

-DATASET_ID = 'modelscope/ai2_arc'
-
-# task_list = ['ARC-Easy', 'ARC-Challenge']
-SUBSET_LIST = ['ARC-Challenge']
-

+@Benchmark.register(
+    name='arc',
+    dataset_id='modelscope/ai2_arc',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['ARC-Easy', 'ARC-Challenge'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class ARCAdapter(DataAdapter):

     choices = ['A', 'B', 'C', 'D']

-    def __init__(self,
-
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+    def __init__(self, **kwargs):
+        few_shot_num = kwargs.get('few_shot_num', None)
         if few_shot_num is None:
             # Use 0-shot by default
             logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
         if few_shot_num != 0:
             logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
@@ -132,7 +120,7 @@ class ARCAdapter(DataAdapter):
         # Get the gold choice
         return input_d.get('answerKey', '')

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +132,12 @@ class ARCAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(
                 text=result, options=self.choices)  # TODO: to be checked !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(
                 text=result, options=self.choices)  # TODO: to be checked !
         else:
@@ -158,70 +146,6 @@ class ARCAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"ARC",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"ARC-Easy",
-                            "score":0.5632
-                        },
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3157
-                        }
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'arc',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

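For orientation only (not part of the package diff): parse_pred_result now branches on the EvalType constants, and for service/custom outputs it delegates to ResponseParser. A hedged usage sketch, assuming evalscope 0.9.0 is installed and that the parser extracts the first option letter found in free-form text (behaviour inferred from the call sites above, not verified here):

from evalscope.utils import ResponseParser  # import path as used in arc_adapter.py above

choices = ['A', 'B', 'C', 'D']
model_output = 'After comparing the options, the answer is B.'

# Expected to pull the first matching option letter out of the raw completion.
parsed = ResponseParser.parse_first_option_with_choices(text=model_output, options=choices)
print(parsed)  # expected: 'B' (assumption about the parser's behaviour)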
evalscope/benchmarks/bbh/__init__.py
CHANGED
@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/bbh/bbh_adapter.py
CHANGED
@@ -5,18 +5,17 @@ import os
 import random
 import re

-from evalscope.benchmarks
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics
-from evalscope.
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa

 logger = get_logger()

-DATASET_ID = 'modelscope/bbh'
-
 # BBH multiple choice subset list
 MULTIPLE_CHOICE = 'multiple_choice'
 MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
 SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


+@Benchmark.register(
+    name='bbh',
+    dataset_id='modelscope/bbh',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=3,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
 class BBHAdapter(DataAdapter):
     """
     Adapter for BBH free-form and multiple-choices sub-tasks.
     """

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
+    def __init__(self, **kwargs):

-
-        metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 3-shot examples by system for BBH.')
-            few_shot_num = 3
+        few_shot_num = kwargs.get('few_shot_num', 3)

         if few_shot_num != 3 and few_shot_num != 0:
             logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                          f'Use 3-shot by default.')
-            few_shot_num = 3
+            kwargs['few_shot_num'] = 3

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -217,66 +207,6 @@ class BBHAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"BBH",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"BBH",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'bbh',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
evalscope/benchmarks/benchmark.py
CHANGED
@@ -1,65 +1,76 @@
-
+import copy
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Dict, List, Optional

-
-from
-from typing import Optional
+if TYPE_CHECKING:
+    from evalscope.benchmarks import DataAdapter

-from evalscope.
+from evalscope.models import BaseModelAdapter

+BENCHMARK_MAPPINGS = {}

-
-
-
-
+
+@dataclass
+class BenchmarkMeta:
+    name: str
+    dataset_id: str
+    data_adapter: 'DataAdapter'
+    model_adapter: BaseModelAdapter
+    subset_list: List[str] = field(default_factory=list)
+    metric_list: List[dict] = field(default_factory=list)
+    few_shot_num: int = 0
+    few_shot_random: bool = False
+    train_split: Optional[str] = None
+    eval_split: Optional[str] = None
+    prompt_template: str = ''
+
+    def _update(self, args: dict):
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+        self.__dict__.update(args)
+
+    def to_dict(self) -> dict:
+        return self.__dict__
+
+    def to_string_dict(self) -> dict:
+        cur_dict = copy.deepcopy(self.__dict__)
+        # cur_dict['data_adapter'] = self.data_adapter.__name__
+        # cur_dict['model_adapter'] = self.model_adapter.__name__
+        # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
+        del cur_dict['data_adapter']
+        del cur_dict['model_adapter']
+        del cur_dict['metric_list']
+        return cur_dict
+
+    def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
+        if config:
+            self._update(config)
+
+        data_adapter = self.data_adapter(**self.to_dict())
+        return data_adapter
+
+
+class Benchmark:

     def __init__(self):
-
-
-    @
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            hub: `ModelScope` or `HuggingFace`
-            work_dir: the work directory for caching, optional
-
-        Returns:
-            A dict.
-        """
-
-        dataset = MsDataset.load(
-            dataset_name=dataset_name,
-            subset_name=subset,
-            split=split,
-            token=token,
-            cache_dir=work_dir,
-            hub=hub,
-            **kwargs)
-
-        dataset.dataset_name = dataset_name.split('/')[-1]
-        dataset.subset_name = subset
-        # dataset.split = split
-        return dataset
-
-
-if __name__ == '__main__':
-
-    ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None)
-
-    n = 1
-    for i in ds:
-        print('>', n, ': ', i)
-        n += 1
+        pass
+
+    @classmethod
+    def get(cls, name: str) -> 'BenchmarkMeta':
+        if name not in BENCHMARK_MAPPINGS:
+            raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+        benchmark = BENCHMARK_MAPPINGS[name]
+        return benchmark
+
+    @classmethod
+    def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+
+        def register_wrapper(data_adapter):
+            if name in BENCHMARK_MAPPINGS:
+                raise Exception(f'Benchmark {name} already registered')
+            BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
+                name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+            return data_adapter
+
+        return register_wrapper
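For orientation only (not part of the package diff): the new registry replaces the old per-package DataAdapterClass/ModelAdapterClass re-exports. A minimal usage sketch, assuming evalscope 0.9.0 is installed; ToyAdapter, 'toy_bench', and the dataset id are hypothetical placeholders, not names from this release:

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.models import MultiChoiceModelAdapter


# Hypothetical benchmark; the decorator records a BenchmarkMeta entry keyed by name.
@Benchmark.register(
    name='toy_bench',
    dataset_id='my-org/toy-dataset',   # placeholder dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    few_shot_num=0,
    eval_split='test',
)
class ToyAdapter(DataAdapter):
    # A real adapter would also implement load_from_disk / gen_prompt / match, as the adapters above do.
    def __init__(self, **kwargs):
        super().__init__(**kwargs)


meta = Benchmark.get('toy_bench')   # look up the registered BenchmarkMeta
print(meta.name, meta.dataset_id, meta.few_shot_num)
# The evaluator would then call meta.get_data_adapter() to build the configured DataAdapter.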
evalscope/benchmarks/ceval/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
-from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
-from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -2,8 +2,11 @@
 import csv
 import os

-from evalscope.benchmarks
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy
 from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger

@@ -11,8 +14,6 @@ from evalscope.utils.logger import get_logger

 logger = get_logger()

-DATASET_ID = 'modelscope/ceval-exam'
-
 SUBSET_LIST = [
     'computer_network',
     'operating_system',
@@ -124,40 +125,28 @@ SUBJECT_MAPPING = {
 }


+@Benchmark.register(
+    name='ceval',
+    dataset_id='modelscope/ceval-exam',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='dev',
+    eval_split='val',
+)
 class CEVALAdapter(DataAdapter):

     choices = ['A', 'B', 'C', 'D']

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'val',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 5-shot by default
-            logger.info(f'Set 0-shot examples by default for C-Eval.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):

+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-            few_shot_num = 5
+            kwargs['few_shot_num'] = 5

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -223,7 +212,7 @@ class CEVALAdapter(DataAdapter):
         # Get the gold choice
         return input_d.get('answer', '')

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.

@@ -235,11 +224,11 @@ class CEVALAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -247,19 +236,6 @@ class CEVALAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)

-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
         """
         Generate report for the evaluation.
|