evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/cli/start_perf.py
CHANGED
@@ -3,8 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.perf.arguments import add_argument
-from evalscope.perf.main import run_perf_benchmark
 
 
 def subparser_func(args):
@@ -23,9 +21,13 @@ class PerfBenchCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.perf.arguments import add_argument
+
         parser = parsers.add_parser(PerfBenchCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.perf.main import run_perf_benchmark
+
         run_perf_benchmark(self.args)
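The change above defers the evalscope.perf imports until the subcommand is defined or executed, which keeps CLI start-up cheap. Below is a minimal, self-contained sketch of that lazy-import pattern; the class and argument names are illustrative placeholders, not evalscope's real API.

from argparse import ArgumentParser


class PerfCommandSketch:
    """Toy subcommand showing the deferred-import pattern applied in the diff."""

    def __init__(self, args):
        self.args = args

    @staticmethod
    def define_args(subparsers):
        # Only lightweight argparse work happens at CLI start-up.
        parser = subparsers.add_parser('perf')
        parser.add_argument('--model', default='demo')
        parser.set_defaults(func=lambda a: PerfCommandSketch(a).execute())

    def execute(self):
        # The expensive module is imported only when the command actually runs,
        # mirroring the moved `run_perf_benchmark` import above.
        import json  # stand-in for a heavy dependency
        print(json.dumps(vars(self.args), default=str))


if __name__ == '__main__':
    top = ArgumentParser()
    subparsers = top.add_subparsers()
    PerfCommandSketch.define_args(subparsers)
    args = top.parse_args(['perf', '--model', 'demo'])
    args.func(args)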
evalscope/collections/evaluator.py
CHANGED

@@ -2,14 +2,15 @@ import json
 import os
 import pandas as pd
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
 
-from evalscope.benchmarks import Benchmark
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.constants import AnswerKeys, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self,
-
-
-
-
+    def get_answer(self, samples, infer_cfg) -> List[dict]:
+        input_prompts = [sample.prompt for sample in samples]
+        subset_name = samples[0].subset_name
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list, samples
 
     def get_review(self, answer_d) -> dict:
         review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -42,38 +48,50 @@ class SimpleEvaluator(Evaluator):
 
     def get_score(self, review_d) -> float:
         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-
-        score = metric_score[0]['score']
-        return score
+        return metric_score
 
 
 class EvaluatorCollection:
 
-    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
         self.task_cfg = task_cfg
+        self.data_adapter = data_adapter
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
+
         self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map
+        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-
-
-
+        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        raw_dataset = self.data_adapter.load()
+        # limit the dataset
+        if self.task_cfg.limit:
+            raw_dataset = raw_dataset[:self.task_cfg.limit]
+        # index dataset
         datasets = []
         for sample in raw_dataset:
+            sample['prompt'].update({'index': sample['index']})
             datasets.append(DatasetEntry(**sample))
+
         return datasets, dataset_name
 
-
+    @staticmethod
+    def _init_name_map(dataset):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
-
-        for sample in self.dataset:
+        for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name
             dataset_name_map[dataset_name][subset_name].append(sample.index)
+        return dataset_name_map
+
+    @staticmethod
+    def _init_id_map(dataset):
+        dataset_id_map = {}
+        for sample in dataset:
             dataset_id_map[sample.index] = sample
-        return
+        return dataset_id_map
 
     def _initialize_evaluators(self):
         evaluators = {}
@@ -93,15 +111,16 @@ class EvaluatorCollection:
         for subset_name, ids in data_map.items():
             for _id in ids:
                 row_data: DatasetEntry = self.dataset_id_map[_id]
-
-
-
-
-
-
-
-
-
+                for metric in scores[_id]:
+                    data.append(
+                        dict(
+                            task_type=row_data.task_type,
+                            categories=tuple(row_data.categories),
+                            dataset_name=dataset_name,
+                            subset_name=subset_name,
+                            tags=row_data.tags,
+                            metric=metric['metric_name'],
+                            score=metric['score']))
         return pd.DataFrame(data)
 
     def aggregate_and_sort(df, group_by_cols):
@@ -117,13 +136,13 @@ class EvaluatorCollection:
         df = get_dataframe(scores)
 
         # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type'])
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
 
         # explode tags to multiple rows
         df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
         # process multi-level categories
         df_categories = df.copy()
@@ -132,7 +151,8 @@ class EvaluatorCollection:
         for level in range(max_depth):
             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                   if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories,
+        category_report_df = aggregate_and_sort(df_categories,
+                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
 
         # convert to dict format
         report_dict = {
@@ -155,21 +175,71 @@ class EvaluatorCollection:
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
+    def _filter_answer(self, pred_file_path):
+        answer_dict = defaultdict(dict)
+        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            indices = set()
+            for answer in answers_list:
+                index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                answer_dict[index] = answer
+                indices.add(index)
+            data = []
+            for sample in self.dataset:
+                if sample.index not in indices:
+                    data.append(sample)
+            data_map = self._init_name_map(data)
+
+            return answer_dict, data, data_map
+        return answer_dict, self.dataset, self.dataset_name_map
+
     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                       f'{self.dataset_name}.jsonl')
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-
-
-
-
-
+
+        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+            if self.task_cfg.eval_type == EvalType.SERVICE:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for sample in dataset:
+                        evaluator = self.evaluators[sample.dataset_name]
+                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    for future in as_completed(futures):
+                        answer_list, samples = future.result()
+                        answers[samples[0].index] = answer_list[0]
+                        dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
+            else:
+                for dataset_name, data_map in dataset_name_map.items():
+                    # get evaluator for the dataset
+                    evaluator = self.evaluators[dataset_name]
+                    for subset_name, ids in data_map.items():
+                        for i in range(0, len(ids), eval_batch_size):
+                            # get batch samples
+                            batch_ids = ids[i:i + eval_batch_size]
+                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                            answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                            # update answers
+                            for j, _id in enumerate(batch_ids):
+                                answers[_id] = answer_list[j]
+                            dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                            pbar.update(len(batch_ids))
         return answers
 
     def get_reviews(self, answers):
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)
+
+        if self.task_cfg.use_cache and os.path.exists(review_file_path):
+            logger.warning(
+                f'Ignore use_cache={self.task_cfg.use_cache}, updating the review file: {review_file_path} ...')
+            os.remove(review_file_path)
+
         reviews = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting reviews'):
            evaluator = self.evaluators[sample.dataset_name]
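The new get_answers path fans requests out over a thread pool when the target is a served model, writing each finished answer to the JSONL prediction file as it completes and falling back to simple batching for local models. The sketch below shows the same ThreadPoolExecutor / as_completed / tqdm combination in isolation; the fake_predict function and sample layout are placeholders, not evalscope code.

from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def fake_predict(sample: dict) -> dict:
    # Stand-in for evaluator.get_answer(); a real service call would block on network I/O.
    return {'index': sample['index'], 'answer': f"echo:{sample['prompt']}"}


def get_answers(samples, eval_batch_size: int = 4) -> dict:
    answers = {}
    with tqdm(total=len(samples), desc='Getting answers') as pbar:
        with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
            futures = [executor.submit(fake_predict, s) for s in samples]
            for future in as_completed(futures):
                answer = future.result()
                # Keyed by the sample index so cached or partial runs can be merged later.
                answers[answer['index']] = answer
                pbar.update(1)
    return answers


if __name__ == '__main__':
    data = [{'index': i, 'prompt': f'question {i}'} for i in range(8)]
    print(len(get_answers(data)))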
evalscope/collections/sampler.py
CHANGED
@@ -44,7 +44,8 @@ class Sampler(ABC):
                     dataset_name=dataset.name,
                     subset_name=subset_name,
                 ))
-
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
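The added clamp matters because random.sample raises ValueError when k exceeds the population size, so capping count at len(all_data) makes an over-sized sampling request degrade to "take everything". A tiny illustration:

import random

all_data = list(range(10))
count = 25                           # caller asked for more items than exist

count = min(count, len(all_data))    # clamp, as in the patched Sampler
sampled = random.sample(all_data, k=count)
assert len(sampled) == 10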
evalscope/collections/schema.py
CHANGED
@@ -19,8 +19,7 @@ class DatasetInfo:
         benchmark_meta = Benchmark.get(self.name)
 
         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        data_dict = data_adapter.load()
         prompts = data_adapter.gen_prompts(data_dict)
         return prompts
 
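The call-site change reflects that a data adapter now carries its own dataset_id and subset_list, so load() takes no dataset arguments. A hedged toy sketch of that shape follows; the class below is illustrative and is not evalscope's DataAdapter.

class ToyDataAdapter:
    """Adapter that remembers its own dataset identity, so load() needs no arguments."""

    def __init__(self, dataset_id: str, subset_list=None):
        self.dataset_id = dataset_id
        self.subset_list = subset_list or ['default']

    def load(self) -> dict:
        # A real adapter would read local files or a model hub here; we fabricate one record per subset.
        return {subset: [{'question': f'{self.dataset_id}:{subset}'}] for subset in self.subset_list}


adapter = ToyDataAdapter('general_qa', ['example'])
data_dict = adapter.load()   # no dataset_name_or_path / subset_list arguments at the call site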
evalscope/config.py
CHANGED
@@ -17,7 +17,7 @@ logger = get_logger()
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'
+DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
 DEFAULT_GENERATION_CONFIG = {
     'max_length': 2048,
     'max_new_tokens': 512,
@@ -54,6 +54,7 @@ class TaskConfig:
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
     limit: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
@@ -67,6 +68,8 @@ class TaskConfig:
     seed: Optional[int] = 42
     api_url: Optional[str] = None  # Only used for server model
     api_key: Optional[str] = 'EMPTY'  # Only used for server model
+    timeout: Optional[float] = None  # Only used for server model
+    stream: bool = False  # Only used for server model
 
     def __post_init__(self):
         if (not self.model_id) and self.model:
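For orientation, the new knobs can be exercised as in the sketch below. This standalone dataclass only mirrors the fields visible in this diff and is not evalscope's full TaskConfig.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerEvalConfigSketch:
    api_url: Optional[str] = None      # Only used for server model
    api_key: Optional[str] = 'EMPTY'   # Only used for server model
    timeout: Optional[float] = None    # per-request timeout for the server model
    stream: bool = False               # stream tokens from the server model
    eval_batch_size: int = 1           # batch size / worker count during inference
    limit: Optional[int] = None        # cap on samples per subset


cfg = ServerEvalConfigSketch(api_url='http://localhost:8000/v1', timeout=60.0,
                             stream=True, eval_batch_size=8)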
evalscope/evaluator/evaluator.py
CHANGED
@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@
                 **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -63,15 +63,20 @@
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
@@ -87,12 +92,38 @@
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def
-
-
-
-
-
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
@@ -110,7 +141,6 @@
             max_length: int, the max length of the sequence to be generated.
             max_new_tokens: int, the max number of new tokens to be generated.
             repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.
@@ -119,41 +149,35 @@
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-            dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-
-
-
-
-
-
-
-
-
-
-            dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
@@ -200,17 +224,13 @@
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict],
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.
@@ -218,7 +238,6 @@
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.
@@ -231,14 +250,14 @@
 
         if self.use_cache and os.path.exists(review_file_path):
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+            os.remove(review_file_path)
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-
-            logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews
@@ -274,7 +293,8 @@
 
             review_res_list.append(review_res)
 
-        metric_score: List[dict] = self.data_adapter.compute_metric(
+        metric_score: List[dict] = self.data_adapter.compute_metric(
+            review_res_list=review_res_list, reviews_list=reviews_list)
 
         return metric_score
 
@@ -315,7 +335,7 @@
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self,
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -329,7 +349,6 @@
 
         Args:
            infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.
@@ -347,17 +366,14 @@
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res
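The caching behaviour added by filter_answer can be summarised as: reload the prediction JSONL, collect the index recorded under each cached answer's origin prompt, and only re-run prompts whose position is not in that set. The self-contained sketch below mirrors that idea; the literal 'origin_prompt' key stands in for AnswerKeys.ORIGIN_PROMPT, and the file layout and helper names are illustrative rather than evalscope's.

import json
import os
import tempfile


def filter_answered(prompts, pred_file_path):
    # No cache file: everything still needs to be answered.
    if not os.path.exists(pred_file_path):
        return [], prompts
    with open(pred_file_path, encoding='utf-8') as f:
        cached = [json.loads(line) for line in f if line.strip()]
    # Indices already answered in a previous run.
    answered = {a['origin_prompt'].get('index') for a in cached}
    remaining = [p for i, p in enumerate(prompts) if i not in answered]
    return cached, remaining


prompts = [{'index': i, 'question': f'q{i}'} for i in range(5)]
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'preds.jsonl')
    with open(path, 'w', encoding='utf-8') as f:   # pretend two answers are already cached
        for i in (0, 3):
            f.write(json.dumps({'origin_prompt': {'index': i}}) + '\n')
    cached, todo = filter_answered(prompts, path)
    assert [p['index'] for p in todo] == [1, 2, 4]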
evalscope/metrics/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean,
+from evalscope.metrics.metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean,
+                                       simple_f1_score, weighted_mean)
 from evalscope.metrics.named_metrics import *
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh