evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
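Besides the evaluator rework shown below, this release adds new benchmark adapters (aime24, math_500, gpqa, general_mcq, data_collection) and replaces math_accuracy.py with a larger math_parser.py. The following is a rough, hedged sketch of driving those additions, assuming the TaskConfig fields that appear in the evaluator diff (eval_batch_size, limit, generation_config) and a run_task entry point in evalscope/run.py; the model and datasets field names are assumptions, not confirmed by this diff.

from evalscope.config import TaskConfig  # import path taken from the diff below
from evalscope.run import run_task       # assumed entry point; run.py is listed among the changed files

# Hedged sketch: field names not visible in the diff (model, datasets) are assumptions.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',        # assumed field
    datasets=['math_500', 'gpqa'],           # adapters newly added in 0.11.0
    eval_batch_size=8,                       # sizes the thread pool / local batches in get_answers()
    limit=50,                                # per-subset cap, now applied in load_dataset()
    generation_config={'temperature': 0.0},  # forwarded to get_answers() as infer_cfg
)
run_task(task_cfg)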
evalscope/evaluator/evaluator.py
CHANGED
@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@ class Evaluator(object):
                 **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -63,15 +63,20 @@ class Evaluator(object):
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
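The rewritten load_dataset above now applies task_cfg.limit per subset and stamps each surviving prompt with an 'index' before any inference happens. A small self-contained sketch of that pattern (the demo data is made up):

from collections import defaultdict
from typing import Optional

def limit_and_index(prompts: dict, limit: Optional[int] = None) -> dict:
    # Cap each subset and record a stable, per-subset position on every prompt.
    limited = defaultdict(list)
    for subset_name, prompts_list in prompts.items():
        n = limit or len(prompts_list)
        for index, prompt in enumerate(prompts_list[:n]):
            prompt['index'] = index
            limited[subset_name].append(prompt)
    return limited

demo = {'main': [{'data': 'q1'}, {'data': 'q2'}, {'data': 'q3'}]}
print(limit_and_index(demo, limit=2))
# {'main': [{'data': 'q1', 'index': 0}, {'data': 'q2', 'index': 1}]}

That per-prompt index is what the new filter_answer cache logic further down keys on.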
@@ -87,12 +92,38 @@ class Evaluator(object):
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def
-
-
-
-
-
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
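filter_answer above resumes an interrupted run: it reads the existing predictions JSONL, collects the 'index' stored on each cached answer's origin prompt (falling back to file position when no indices exist), and drops those prompts from the work list. A standalone sketch of the same idea, using plain dicts instead of evalscope's AnswerKeys constants:

import json
import os

def resume_from_jsonl(prompts, pred_file_path):
    # Returns (cached_answers, remaining_prompts); with no cache file, everything is remaining.
    if not os.path.exists(pred_file_path):
        return [], prompts
    with open(pred_file_path, encoding='utf-8') as f:
        cached = [json.loads(line) for line in f if line.strip()]
    answered = {a.get('origin_prompt', {}).get('index') for a in cached}
    if answered == {None}:  # no indices recorded: assume the first N prompts are done
        answered = set(range(len(cached)))
    answered.discard(None)
    remaining = [p for i, p in enumerate(prompts) if i not in answered]
    return cached, remaining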
@@ -110,7 +141,6 @@ class Evaluator(object):
                 max_length: int, the max length of the sequence to be generated.
                 max_new_tokens: int, the max number of new tokens to be generated.
                 repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.
@@ -119,41 +149,35 @@ class Evaluator(object):
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-
-
-
-
-
-
-
-
-
-
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
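The new get_answers above picks one of two execution paths: when eval_type is SERVICE it submits single-prompt calls to a ThreadPoolExecutor of size eval_batch_size and drains them with as_completed, otherwise it slices the prompt list into batches of eval_batch_size and calls the model adapter batch by batch. A stripped-down sketch of those two paths with a stand-in predict function:

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def predict(batch):  # stand-in for model_adapter.predict
    return [{'answer': f'echo: {p}'} for p in batch]

def run_service(prompts, batch_size):
    # API path: one request per prompt, bounded concurrency, results arrive in completion order.
    results = []
    with tqdm(total=len(prompts)) as pbar, ThreadPoolExecutor(max_workers=batch_size) as pool:
        futures = [pool.submit(predict, [p]) for p in prompts]
        for future in as_completed(futures):
            results.extend(future.result())
            pbar.update(1)
    return results

def run_local(prompts, batch_size):
    # Local path: sequential batched inference, results stay in prompt order.
    results = []
    with tqdm(total=len(prompts)) as pbar:
        for i in range(0, len(prompts), batch_size):
            batch = prompts[i:i + batch_size]
            results.extend(predict(batch))
            pbar.update(len(batch))
    return results

Because as_completed yields results out of submission order, the predictions JSONL is unordered in service mode, which is why cache resumption relies on the per-prompt 'index' rather than file position.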
@@ -200,17 +224,13 @@ class Evaluator(object):
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict],
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.
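The simplified reviewer_spec above still feeds a deterministic ID: the spec is serialized with sorted keys and hashed together with the answer ID, so re-running a review with the same metrics reproduces the same review_id. A standalone sketch of that idea, using hashlib as a stand-in for evalscope's gen_hash helper (whose actual algorithm is not shown in this diff):

import hashlib
import json
from collections import OrderedDict

def stable_review_id(answer_id: str, reviewer_spec: dict) -> str:
    # Sorting keys makes the serialization, and therefore the hash, order-independent.
    spec_str = json.dumps(OrderedDict(sorted(reviewer_spec.items())), ensure_ascii=False)
    return 'review-' + hashlib.md5((answer_id + spec_str).encode('utf-8')).hexdigest()

spec = {'metric': ['AverageAccuracy'], 'reviewer': ['Evaluator'], 'revision': ['default']}
print(stable_review_id('answer-abc123', spec))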
@@ -218,7 +238,6 @@ class Evaluator(object):
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.
@@ -237,8 +256,7 @@ class Evaluator(object):
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-
-            logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews
@@ -315,7 +333,7 @@ class Evaluator(object):
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self,
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -329,7 +347,6 @@ class Evaluator(object):
 
         Args:
             infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.
@@ -347,17 +364,14 @@ class Evaluator(object):
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res