evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +30 -15
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py
CHANGED

@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list

@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',

@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
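Note on the metric_list change seen here and in the other adapters below: entries move from metric objects (e.g. [AverageAccuracy]) to plain names such as ['AverageAccuracy'], which implies metrics are now resolved by name at scoring time. A minimal sketch of such a name-keyed registry; METRIC_REGISTRY, compute_metrics and this exact_match are illustrative stand-ins, not evalscope's metrics module:

```python
from typing import Callable, Dict, List

# name -> aggregation over a list of per-sample scores
METRIC_REGISTRY: Dict[str, Callable[[List[float]], float]] = {
    'AverageAccuracy': lambda scores: sum(scores) / len(scores) if scores else 0.0,
}


def exact_match(gold: str, pred: str) -> float:
    """1.0 when the prediction matches the gold answer exactly."""
    return float(gold.strip() == pred.strip())


def compute_metrics(metric_names: List[str], scores: List[float]) -> List[dict]:
    # resolve each configured name to its implementation, then aggregate
    return [{'metric_name': name, 'score': METRIC_REGISTRY[name](scores), 'num': len(scores)}
            for name in metric_names]


per_sample = [exact_match('B', 'B'), exact_match('C', 'A')]
print(compute_metrics(['AverageAccuracy'], per_sample))
# [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}]
```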
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED

@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 

@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',

evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED

@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
 
 # flake8: noqa
 

@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',

@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-        return [{
-
-
-
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)
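The TruthfulQA change above stops assembling the metric dict by hand and hands the per-sample mc2 scores to the shared compute_metric of the base adapter. For orientation, MC2 is commonly defined as the normalized probability mass a model assigns to the true reference answers; a rough standalone sketch of that definition, not evalscope's implementation:

```python
import math
from typing import List


def mc2_score(true_loglikes: List[float], false_loglikes: List[float]) -> float:
    """Normalized probability mass on the true reference answers (MC2-style)."""
    probs = [math.exp(ll) for ll in true_loglikes + false_loglikes]
    total = sum(probs)
    return sum(probs[:len(true_loglikes)]) / total


# one sample: the model slightly prefers the true answers
print(round(mc2_score([-1.0, -1.2], [-2.0, -2.5]), 3))
```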
evalscope/collections/evaluator.py
CHANGED

@@ -2,14 +2,15 @@ import json
 import os
 import pandas as pd
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
 
-from evalscope.benchmarks import Benchmark
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.constants import AnswerKeys, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
 from evalscope.report import ReportGenerator

@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self,
-
-
-
-
+    def get_answer(self, samples, infer_cfg) -> List[dict]:
+        input_prompts = [sample.prompt for sample in samples]
+        subset_name = samples[0].subset_name
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list, samples
 
     def get_review(self, answer_d) -> dict:
         review_id, reviewer_spec = self._generate_review_id(answer_d)
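SimpleEvaluator.get_answer now takes a batch of samples, issues a single predict call, and pairs each returned answer with its originating prompt plus a deterministic answer id. The pairing-and-id pattern in isolation, with a hashed id helper and a stubbed predict that are illustrative assumptions rather than evalscope's _generate_answer_id or model adapter:

```python
import hashlib
import json
from typing import List


def make_answer_id(model_cfg: dict, prompt: dict, infer_cfg: dict) -> str:
    """Deterministic id: the same (model, prompt, config) always maps to the same answer."""
    payload = json.dumps([model_cfg, prompt, infer_cfg], sort_keys=True, ensure_ascii=False)
    return 'answer-' + hashlib.md5(payload.encode('utf-8')).hexdigest()


def fake_predict(prompts: List[dict], infer_cfg: dict) -> List[dict]:
    """Stand-in for a model adapter: one output dict per input prompt, in order."""
    return [{'content': f"echo: {p['data']}"} for p in prompts]


def get_answers(prompts: List[dict], model_cfg: dict, infer_cfg: dict) -> List[dict]:
    outputs = fake_predict(prompts, infer_cfg)
    answers = []
    # zip keeps the i-th output attached to the i-th prompt
    for output, prompt in zip(outputs, prompts):
        answers.append({
            'answer_id': make_answer_id(model_cfg, prompt, infer_cfg),
            'origin_prompt': prompt,
            'raw_output': output,
        })
    return answers


if __name__ == '__main__':
    demo = get_answers([{'data': 'Q1'}, {'data': 'Q2'}], {'model': 'demo'}, {'temperature': 0.0})
    print([a['answer_id'][:14] for a in demo])
```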
@@ -42,38 +48,50 @@ class SimpleEvaluator(Evaluator):
 
     def get_score(self, review_d) -> float:
         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-
-        score = metric_score[0]['score']
-        return score
+        return metric_score
 
 
 class EvaluatorCollection:
 
-    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
         self.task_cfg = task_cfg
+        self.data_adapter = data_adapter
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
+
         self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map
+        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-
-
-
+        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        raw_dataset = self.data_adapter.load()
+        # limit the dataset
+        if self.task_cfg.limit:
+            raw_dataset = raw_dataset[:self.task_cfg.limit]
+        # index dataset
         datasets = []
         for sample in raw_dataset:
+            sample['prompt'].update({'index': sample['index']})
             datasets.append(DatasetEntry(**sample))
+
         return datasets, dataset_name
 
-
+    @staticmethod
+    def _init_name_map(dataset):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
-
-        for sample in self.dataset:
+        for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name
             dataset_name_map[dataset_name][subset_name].append(sample.index)
+        return dataset_name_map
+
+    @staticmethod
+    def _init_id_map(dataset):
+        dataset_id_map = {}
+        for sample in dataset:
             dataset_id_map[sample.index] = sample
-        return
+        return dataset_id_map
 
     def _initialize_evaluators(self):
         evaluators = {}
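The two new static helpers build the lookup tables used throughout the collection: dataset → subset → sample indices for grouping, and index → sample for O(1) retrieval during scoring. A self-contained sketch of that indexing pattern, with a hypothetical Sample record standing in for DatasetEntry:

```python
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Sample:  # hypothetical stand-in for DatasetEntry
    index: int
    dataset_name: str
    subset_name: str
    prompt: dict = field(default_factory=dict)


def init_name_map(dataset: List[Sample]) -> Dict[str, Dict[str, List[int]]]:
    # dataset_name -> subset_name -> [sample indices]
    name_map: Dict[str, Dict[str, List[int]]] = defaultdict(lambda: defaultdict(list))
    for sample in dataset:
        name_map[sample.dataset_name][sample.subset_name].append(sample.index)
    return name_map


def init_id_map(dataset: List[Sample]) -> Dict[int, Sample]:
    # flat index -> sample, for O(1) lookup when scoring
    return {sample.index: sample for sample in dataset}


samples = [Sample(0, 'gsm8k', 'main'), Sample(1, 'gsm8k', 'main'), Sample(2, 'arc', 'ARC-Easy')]
print(dict(init_name_map(samples)['gsm8k']))   # {'main': [0, 1]}
print(init_id_map(samples)[2].dataset_name)    # arc
```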
@@ -93,15 +111,16 @@ class EvaluatorCollection:
             for subset_name, ids in data_map.items():
                 for _id in ids:
                     row_data: DatasetEntry = self.dataset_id_map[_id]
-
-
-
-
-
-
-
-
-
+                    for metric in scores[_id]:
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                metric=metric['metric_name'],
+                                score=metric['score']))
         return pd.DataFrame(data)
 
     def aggregate_and_sort(df, group_by_cols):

@@ -117,13 +136,13 @@ class EvaluatorCollection:
         df = get_dataframe(scores)
 
         # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type'])
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
 
         # explode tags to multiple rows
         df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
         # process multi-level categories
         df_categories = df.copy()
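Adding 'metric' to every group_by_cols list keeps scores from different metrics out of the same average. Reduced to a standalone pandas sketch; the column names mirror the diff, while the rows and this aggregate_and_sort body are made up for illustration:

```python
import pandas as pd

# long-format scores: one row per (sample, metric)
df = pd.DataFrame([
    dict(task_type='math', dataset_name='gsm8k', subset_name='main', metric='AverageAccuracy', score=1.0),
    dict(task_type='math', dataset_name='gsm8k', subset_name='main', metric='AverageAccuracy', score=0.0),
    dict(task_type='math', dataset_name='gsm8k', subset_name='main', metric='AveragePass@1', score=1.0),
])


def aggregate_and_sort(df: pd.DataFrame, group_by_cols: list) -> pd.DataFrame:
    # mean score and sample count per group, highest average first
    return (df.groupby(group_by_cols)['score']
              .agg(average_score='mean', count='size')
              .reset_index()
              .sort_values('average_score', ascending=False))


# grouping by metric keeps AverageAccuracy and AveragePass@1 separate
print(aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name']))
```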
@@ -132,7 +151,8 @@ class EvaluatorCollection:
         for level in range(max_depth):
             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                   if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories,
+        category_report_df = aggregate_and_sort(df_categories,
+                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
 
         # convert to dict format
         report_dict = {

@@ -155,16 +175,60 @@ class EvaluatorCollection:
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
+    def _filter_answer(self, pred_file_path):
+        answer_dict = defaultdict(dict)
+        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            indices = set()
+            for answer in answers_list:
+                index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                answer_dict[index] = answer
+                indices.add(index)
+            data = []
+            for sample in self.dataset:
+                if sample.index not in indices:
+                    data.append(sample)
+            data_map = self._init_name_map(data)
+
+            return answer_dict, data, data_map
+        return answer_dict, self.dataset, self.dataset_name_map
+
     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                       f'{self.dataset_name}.jsonl')
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-
-
-
-
-
+
+        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+            if self.task_cfg.eval_type == EvalType.SERVICE:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for sample in dataset:
+                        evaluator = self.evaluators[sample.dataset_name]
+                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    for future in as_completed(futures):
+                        answer_list, samples = future.result()
+                        answers[samples[0].index] = answer_list[0]
+                        dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
+            else:
+                for dataset_name, data_map in dataset_name_map.items():
+                    # get evaluator for the dataset
+                    evaluator = self.evaluators[dataset_name]
+                    for subset_name, ids in data_map.items():
+                        for i in range(0, len(ids), eval_batch_size):
+                            # get batch samples
+                            batch_ids = ids[i:i + eval_batch_size]
+                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                            answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                            # update answers
+                            for j, _id in enumerate(batch_ids):
+                                answers[_id] = answer_list[j]
+                            dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                            pbar.update(len(batch_ids))
         return answers
 
     def get_reviews(self, answers):
evalscope/collections/sampler.py
CHANGED

@@ -44,7 +44,8 @@ class Sampler(ABC):
                     dataset_name=dataset.name,
                     subset_name=subset_name,
                 ))
-
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
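The new clamp matters because random.sample raises ValueError whenever k exceeds the population size; capping count at len(all_data) degrades gracefully to "take everything". A quick standalone illustration:

```python
import random

pool = list(range(5))

try:
    random.sample(pool, k=10)           # k > len(pool)
except ValueError as e:
    print('unclamped:', e)              # "Sample larger than population or is negative"

count = min(10, len(pool))              # the fix from the diff
print('clamped:', sorted(random.sample(pool, k=count)))   # [0, 1, 2, 3, 4]
```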
evalscope/collections/schema.py
CHANGED

@@ -19,8 +19,7 @@ class DatasetInfo:
         benchmark_meta = Benchmark.get(self.name)
 
         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        data_dict = data_adapter.load()
         prompts = data_adapter.gen_prompts(data_dict)
         return prompts
 

evalscope/config.py
CHANGED

@@ -54,6 +54,7 @@ class TaskConfig:
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
     limit: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
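eval_batch_size is an ordinary dataclass field defaulting to 1, so existing configs keep their previous one-request-at-a-time behaviour unless they opt in. A minimal sketch of the same pattern with a cut-down config class, not the real TaskConfig:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MiniTaskConfig:  # illustrative subset of the fields touched by this release
    model: str
    limit: Optional[int] = None
    eval_batch_size: int = 1   # new field; 1 preserves the previous behaviour


# opting in to concurrent/batched evaluation
cfg = MiniTaskConfig(model='my-model', eval_batch_size=8)
print(cfg.eval_batch_size)   # 8
```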
evalscope/evaluator/evaluator.py
CHANGED

@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list

@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,

@@ -44,7 +44,7 @@
                 **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 

@@ -63,15 +63,20 @@ class Evaluator(object):
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
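load_dataset now truncates each subset to task_cfg.limit and stamps every prompt with its position; that index is what later lets cached answers be matched back to prompts. The core transformation as a hedged standalone sketch:

```python
from collections import defaultdict
from typing import Dict, List, Optional


def limit_and_index(prompts: Dict[str, List[dict]], limit: Optional[int] = None) -> Dict[str, List[dict]]:
    """Keep at most `limit` prompts per subset and tag each with its index (mutates the prompts)."""
    limited: Dict[str, List[dict]] = defaultdict(list)
    for subset_name, prompts_list in prompts.items():
        cap = limit or len(prompts_list)          # limit=None means "take all"
        for index, prompt in enumerate(prompts_list[:cap]):
            prompt['index'] = index               # stable position used by the answer cache
            limited[subset_name].append(prompt)
    return limited


demo = {'main': [{'data': 'q0'}, {'data': 'q1'}, {'data': 'q2'}]}
print(dict(limit_and_index(demo, limit=2)))
# {'main': [{'data': 'q0', 'index': 0}, {'data': 'q1', 'index': 1}]}
```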
@@ -87,12 +92,38 @@ class Evaluator(object):
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def
-
-
-
-
-
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
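filter_answer is what makes interrupted runs resumable: previously dumped answers are read back from the predictions .jsonl, their prompt indices collected, and only unanswered prompts go to the model, falling back to positional matching when the cache carries no indices. A self-contained sketch of the idea using plain JSON lines; the file layout and key names are illustrative:

```python
import json
import os
from typing import List, Tuple


def load_jsonl(path: str) -> List[dict]:
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def filter_answered(prompts: List[dict], cache_path: str) -> Tuple[List[dict], List[dict]]:
    """Return (cached_answers, prompts_still_to_run)."""
    if not os.path.exists(cache_path):
        return [], prompts
    cached = load_jsonl(cache_path)
    done = {a['origin_prompt'].get('index') for a in cached}
    if done == {None}:                      # old cache without indices: assume positional order
        done = set(range(len(cached)))
    remaining = [p for i, p in enumerate(prompts) if i not in done]
    return cached, remaining


if __name__ == '__main__':
    path = 'demo_predictions.jsonl'
    with open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps({'origin_prompt': {'index': 0}, 'answer': '42'}) + '\n')
    prompts = [{'index': 0, 'data': 'q0'}, {'index': 1, 'data': 'q1'}]
    cached, todo = filter_answered(prompts, path)
    print(len(cached), [p['index'] for p in todo])   # 1 [1]
    os.remove(path)
```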
@@ -110,7 +141,6 @@
             max_length: int, the max length of the sequence to be generated.
             max_new_tokens: int, the max number of new tokens to be generated.
             repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.

@@ -119,41 +149,35 @@ class Evaluator(object):
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-            dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-
-
-
-
-
-
-
-
-
-
-            dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
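In the local (non-service) branch above, prompts are simply cut into eval_batch_size-sized slices and each slice goes through one predict call, with the progress bar advancing by slice length so it still counts prompts. The slicing in isolation:

```python
from typing import Iterator, List


def chunked(items: List[dict], batch_size: int) -> Iterator[List[dict]]:
    """Yield consecutive slices of at most batch_size items."""
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]


prompts = [{'data': f'q{i}'} for i in range(7)]
for batch in chunked(prompts, batch_size=3):
    print(len(batch), [p['data'] for p in batch])
# 3 ['q0', 'q1', 'q2']
# 3 ['q3', 'q4', 'q5']
# 1 ['q6']
```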
@@ -200,17 +224,13 @@ class Evaluator(object):
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict],
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.

@@ -218,7 +238,6 @@ class Evaluator(object):
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.

@@ -237,8 +256,7 @@ class Evaluator(object):
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-
-            logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews

@@ -315,7 +333,7 @@ class Evaluator(object):
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self,
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.

@@ -329,7 +347,6 @@ class Evaluator(object):
 
         Args:
             infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.

@@ -347,17 +364,14 @@ class Evaluator(object):
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res