evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/math_500/math_500_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='math_500',
+    dataset_id='AI-ModelScope/MATH-500',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class Math500Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
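The adapter above delegates all answer handling to the new `evalscope/metrics/math_parser.py` module. Below is a hedged illustration of how those helpers fit together, assuming `extract_answer` pulls the final `\boxed{...}` answer out of a chain-of-thought response and `math_equal` performs a tolerant mathematical comparison; this behaviour is inferred from how the adapter uses them, not verified against the module itself.

```python
# Assumption: extract_answer returns the final boxed answer and math_equal compares
# prediction and gold mathematically, as Math500Adapter above relies on.
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

response = ('The legs are 3 and 4, so the hypotenuse is \\sqrt{9 + 16} = \\sqrt{25}. '
            'The final answer is \\boxed{5}.')
pred = strip_answer_string(extract_answer(response))
gold = strip_answer_string('5')
print(math_equal(pred, gold))  # expected: True
```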
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -4,17 +4,15 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/mmlu'
-
 SUBSET_LIST = [
     'high_school_european_history',
     'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/mmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
 )
 class MMLUAdapter(DataAdapter):
 
@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
             {'data': [full_prompt], 'multi_choices': self.choices}
 
         """
-        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-
+        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
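The MMLU adapter now builds its final prompt from the registered `prompt_template` instead of a hard-coded header string. A small standalone illustration of that formatting step follows; the question text is made up.

```python
# Standalone illustration of the template substitution performed in gen_prompt above.
prompt_template = 'The following are multiple choice questions (with answers) about {subset_name}. \n{query}'

query = ('What is 2 + 2?\n'
         'A. 3\n'
         'B. 4\n'
         'C. 5\n'
         'D. 6\n'
         'Answer:')

full_prompt = prompt_template.format(subset_name='elementary mathematics', query=query)
print(full_prompt)
```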
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -3,22 +3,27 @@ from typing import Any, Dict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
+SUBSET_LIST = [
+    'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+    'philosophy', 'economics', 'other', 'psychology', 'history'
+]
+
 
 @Benchmark.register(
     name='mmlu_pro',
     dataset_id='modelscope/mmlu-pro',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=
-    metric_list=[AverageAccuracy],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='validation',
     eval_split='test',
     prompt_template=
-    '
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
 )
 class MMLUProAdapter(DataAdapter):
 
@@ -26,10 +31,11 @@ class MMLUProAdapter(DataAdapter):
         super().__init__(**kwargs)
 
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
-
-
-        ]
+
+    def load(self, **kwargs):
+        # default load all data
+        kwargs['subset_list'] = ['default']
+        return super().load(**kwargs)
 
     def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
         """
@@ -37,26 +43,32 @@ class MMLUProAdapter(DataAdapter):
         Return a dict with category as key and list of prompts as value.
         """
 
-        data_dict = data_dict[
+        data_dict = data_dict['default']  # Only one subset for MMLU-Pro
         fewshot_prompts = self.get_fewshot_examples(data_dict)
 
         # Use the category as key to group the prompts
        res_dict = defaultdict(list)
         # generate prompts for each test sample
         for entry in data_dict[self.eval_split]:
-
+            subset_name = entry['category']
+            if subset_name not in self.subset_list:
+                continue
+            prefix = fewshot_prompts[subset_name]
             query = prefix + 'Q: ' + entry['question'] + '\n' + \
                 self.__form_options(entry['options']) + '\n'
 
-
+            full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+            prompt_d = {'data': [full_prompt], 'system_prompt': self.system_prompt, AnswerKeys.RAW_INPUT: entry}
 
-            res_dict[
+            res_dict[subset_name].append(prompt_d)
         return res_dict
 
     def get_fewshot_examples(self, data_dict: dict):
-        # load
-        prompts = {c: '' for c in self.
-        for d in data_dict[self.train_split]:
+        # load few-shot prompts for each category
+        prompts = {c: '' for c in self.subset_list}
+        for index, d in enumerate(data_dict[self.train_split]):
+            if index >= self.few_shot_num:
+                break
             prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
                 self.__form_options(d['options']) + '\n' + \
                 d['cot_content'] + '\n\n'
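The rewritten `get_fewshot_examples` stops collecting demonstrations once `few_shot_num` validation samples have been consumed instead of walking the whole split. A simplified sketch of that cap follows; the sample data and option formatting are illustrative (the real adapter formats options via `__form_options`).

```python
# Simplified sketch of the capped few-shot collection; data and option formatting are illustrative.
few_shot_num = 5
subset_list = ['math', 'physics', 'law']

dev_samples = [
    {'category': 'math', 'question': 'What is 1 + 1?', 'options': ['A) 1', 'B) 2'],
     'cot_content': "A: Let's think step by step. 1 + 1 = 2. The answer is (B)."},
    # ... more validation-split entries
]

prompts = {c: '' for c in subset_list}
for index, d in enumerate(dev_samples):
    if index >= few_shot_num:  # cap the number of demonstrations
        break
    prompts[d['category']] += ('Q: ' + d['question'] + '\n' + '\n'.join(d['options']) + '\n'
                               + d['cot_content'] + '\n\n')
```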
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 
@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-        return [{
-
-
-
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)
evalscope/cli/start_app.py
CHANGED
@@ -3,7 +3,7 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.report.app import create_app
+from evalscope.report.app import add_argument, create_app
 
 
 def subparser_func(args):
@@ -23,7 +23,8 @@ class StartAppCMD(CLICommand):
         """ define args for create pipeline template command.
         """
         parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        create_app()
+        create_app(self.args)
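The `app` subcommand now lets the report app register its own CLI options on the subparser and then receives the parsed namespace. A generic sketch of that argparse hand-off follows; the option names are hypothetical and are not taken from `evalscope/report/app.py`.

```python
# Generic sketch of the add_argument/create_app hand-off; option names are hypothetical.
from argparse import ArgumentParser, Namespace


def add_argument(parser: ArgumentParser) -> None:
    # The app module owns its CLI surface; the subcommand only has to call this.
    parser.add_argument('--server-port', type=int, default=7860)  # hypothetical option
    parser.add_argument('--share', action='store_true')           # hypothetical option


def create_app(args: Namespace) -> None:
    print(f'starting app on port {args.server_port} (share={args.share})')


if __name__ == '__main__':
    parser = ArgumentParser('app')
    add_argument(parser)
    create_app(parser.parse_args())
```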
evalscope/collections/evaluator.py
CHANGED
@@ -2,14 +2,15 @@ import json
 import os
 import pandas as pd
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
 
-from evalscope.benchmarks import Benchmark
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.constants import AnswerKeys, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self,
-
-
-
-
+    def get_answer(self, samples, infer_cfg) -> List[dict]:
+        input_prompts = [sample.prompt for sample in samples]
+        subset_name = samples[0].subset_name
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list, samples
 
     def get_review(self, answer_d) -> dict:
         review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -42,38 +48,50 @@
 
     def get_score(self, review_d) -> float:
         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-
-        score = metric_score[0]['score']
-        return score
+        return metric_score
 
 
 class EvaluatorCollection:
 
-    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
         self.task_cfg = task_cfg
+        self.data_adapter = data_adapter
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
+
         self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map
+        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-
-
-
+        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        raw_dataset = self.data_adapter.load()
+        # limit the dataset
+        if self.task_cfg.limit:
+            raw_dataset = raw_dataset[:self.task_cfg.limit]
+        # index dataset
         datasets = []
         for sample in raw_dataset:
+            sample['prompt'].update({'index': sample['index']})
             datasets.append(DatasetEntry(**sample))
+
         return datasets, dataset_name
 
-
+    @staticmethod
+    def _init_name_map(dataset):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
-
-        for sample in self.dataset:
+        for sample in dataset:
            dataset_name, subset_name = sample.dataset_name, sample.subset_name
            dataset_name_map[dataset_name][subset_name].append(sample.index)
+        return dataset_name_map
+
+    @staticmethod
+    def _init_id_map(dataset):
+        dataset_id_map = {}
+        for sample in dataset:
             dataset_id_map[sample.index] = sample
-        return
+        return dataset_id_map
 
     def _initialize_evaluators(self):
         evaluators = {}
@@ -93,15 +111,16 @@ class EvaluatorCollection:
                for subset_name, ids in data_map.items():
                    for _id in ids:
                        row_data: DatasetEntry = self.dataset_id_map[_id]
-
-
-
-
-
-
-
-
-
+                        for metric in scores[_id]:
+                            data.append(
+                                dict(
+                                    task_type=row_data.task_type,
+                                    categories=tuple(row_data.categories),
+                                    dataset_name=dataset_name,
+                                    subset_name=subset_name,
+                                    tags=row_data.tags,
+                                    metric=metric['metric_name'],
+                                    score=metric['score']))
            return pd.DataFrame(data)
 
        def aggregate_and_sort(df, group_by_cols):
@@ -117,13 +136,13 @@ class EvaluatorCollection:
        df = get_dataframe(scores)
 
        # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type'])
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
 
        # explode tags to multiple rows
        df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
        # process multi-level categories
        df_categories = df.copy()
@@ -132,7 +151,8 @@ class EvaluatorCollection:
        for level in range(max_depth):
            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                  if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories,
+        category_report_df = aggregate_and_sort(df_categories,
+                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
 
        # convert to dict format
        report_dict = {
@@ -155,16 +175,60 @@ class EvaluatorCollection:
        with open(report_file_path, 'w', encoding='utf-8') as f:
            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
+    def _filter_answer(self, pred_file_path):
+        answer_dict = defaultdict(dict)
+        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            indices = set()
+            for answer in answers_list:
+                index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                answer_dict[index] = answer
+                indices.add(index)
+            data = []
+            for sample in self.dataset:
+                if sample.index not in indices:
+                    data.append(sample)
+            data_map = self._init_name_map(data)
+
+            return answer_dict, data, data_map
+        return answer_dict, self.dataset, self.dataset_name_map
+
    def get_answers(self):
        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                      f'{self.dataset_name}.jsonl')
        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-
-
-
-
-
+
+        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+            if self.task_cfg.eval_type == EvalType.SERVICE:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for sample in dataset:
+                        evaluator = self.evaluators[sample.dataset_name]
+                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    for future in as_completed(futures):
+                        answer_list, samples = future.result()
+                        answers[samples[0].index] = answer_list[0]
+                        dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
+            else:
+                for dataset_name, data_map in dataset_name_map.items():
+                    # get evaluator for the dataset
+                    evaluator = self.evaluators[dataset_name]
+                    for subset_name, ids in data_map.items():
+                        for i in range(0, len(ids), eval_batch_size):
+                            # get batch samples
+                            batch_ids = ids[i:i + eval_batch_size]
+                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                            answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                            # update answers
+                            for j, _id in enumerate(batch_ids):
+                                answers[_id] = answer_list[j]
+                            dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                            pbar.update(len(batch_ids))
        return answers
 
    def get_reviews(self, answers):
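In service mode, `get_answers` now fans single-sample requests out over a thread pool whose width is `eval_batch_size`, collecting results as they complete. The following is a self-contained sketch of that pattern, with a stand-in worker in place of `evaluator.get_answer`.

```python
# Self-contained sketch of the ThreadPoolExecutor fan-out used for service-mode evaluation.
from concurrent.futures import ThreadPoolExecutor, as_completed

samples = [{'index': i, 'prompt': f'question {i}'} for i in range(8)]
eval_batch_size = 4  # mirrors task_cfg.eval_batch_size


def get_answer(sample):
    # Stand-in for evaluator.get_answer([sample], generation_config).
    return sample['index'], f"answer to {sample['prompt']}"


answers = {}
with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
    futures = [executor.submit(get_answer, s) for s in samples]
    for future in as_completed(futures):  # results arrive in completion order, not submission order
        index, answer = future.result()
        answers[index] = answer           # keyed by sample index, like the cached answer dict above

print(len(answers))  # 8
```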
evalscope/collections/sampler.py
CHANGED
@@ -44,7 +44,8 @@ class Sampler(ABC):
                     dataset_name=dataset.name,
                     subset_name=subset_name,
                 ))
-
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
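The added cap matters because `random.sample` raises `ValueError` when asked for more items than the population contains; clamping `count` first makes an oversized collection request degrade to "take everything".

```python
# random.sample(population, k) raises ValueError if k > len(population); the cap avoids that.
import random

all_data = list(range(10))
count = 50  # a collection schema may request more samples than the subset holds

count = min(count, len(all_data))  # avoid sampling more than the dataset size
sampled_data = random.sample(all_data, k=count)
print(len(sampled_data))  # 10
```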
evalscope/collections/schema.py
CHANGED
@@ -19,8 +19,7 @@ class DatasetInfo:
         benchmark_meta = Benchmark.get(self.name)
 
         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        data_dict = data_adapter.load()
         prompts = data_adapter.gen_prompts(data_dict)
         return prompts
 
evalscope/config.py
CHANGED
@@ -54,6 +54,7 @@ class TaskConfig:
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
     limit: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
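The new `eval_batch_size` field defaults to 1 and, per the collection evaluator changes above, sets both the local batching size and the service-mode concurrency. A hedged configuration sketch follows; the model identifier is a placeholder and other fields keep their defaults.

```python
# Hedged sketch: eval_batch_size is the new field in this release; the model id is a placeholder.
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model identifier
    datasets=['math_500', 'gpqa'],       # benchmarks added in 0.11.0
    eval_batch_size=8,                   # new in 0.11.0: batch size / concurrency for answer generation
    limit=100,                           # optional cap on the number of evaluated samples
)
```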