evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/cli/start_app.py
CHANGED
@@ -21,13 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
-        from evalscope.
+        from evalscope.app import add_argument

         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        from evalscope.
+        from evalscope.app import create_app

         create_app(self.args)
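The `app` subcommand now imports its helpers from the new `evalscope.app` package rather than `evalscope.report`. A minimal sketch of how the relocated helpers fit together, assuming only the `add_argument`/`create_app` calls visible in the diff (everything else here is illustrative):

```python
from argparse import ArgumentParser

from evalscope.app import add_argument, create_app  # new import location in 0.16.2

parser = ArgumentParser(prog='evalscope app')
add_argument(parser)            # registers the visualization app's CLI options on the parser
args = parser.parse_args([])    # fall back to the registered defaults
# create_app(args)              # would launch the app, mirroring StartAppCMD.execute()
```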
evalscope/collections/__init__.py
CHANGED

@@ -1,3 +1,35 @@
-
-from
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .evaluator import EvaluatorCollection
+    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'evaluator': [
+            'EvaluatorCollection',
+        ],
+        'sampler': [
+            'StratifiedSampler',
+            'UniformSampler',
+            'WeightedSampler',
+        ],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
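`evalscope/collections/__init__.py` now routes attribute access through `_LazyModule`, so importing the package no longer eagerly imports every submodule. A minimal sketch of the underlying idea, assuming a PEP 562 style module `__getattr__` (this is illustrative, not evalscope's `_LazyModule` implementation):

```python
# lazy_pkg/__init__.py -- defer submodule imports until an attribute is first accessed.
import importlib

_import_structure = {'evaluator': ['EvaluatorCollection']}
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}


def __getattr__(name):
    # Called only for attributes not found the normal way (PEP 562).
    if name in _attr_to_module:
        module = importlib.import_module(f'.{_attr_to_module[name]}', __name__)
        return getattr(module, name)
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
```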
evalscope/collections/evaluator.py
CHANGED

@@ -32,11 +32,22 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)

-    def get_answer(self, samples, infer_cfg) -> List[dict]:
+    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
         input_prompts = [sample.prompt for sample in samples]
         subset_name = samples[0].subset_name
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return [None] * len(samples), samples
+            else:
+                raise e
+        # process answers
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

@@ -66,13 +77,17 @@ class EvaluatorCollection:
         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()

-    def load(self) -> tuple[
+    def load(self) -> tuple[List[DatasetEntry], str]:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset
-
-
-
+        limit = len(raw_dataset)
+        if self.task_cfg.limit is not None:
+            if isinstance(self.task_cfg.limit, int):
+                limit = self.task_cfg.limit
+            elif isinstance(self.task_cfg.limit, float):
+                limit = int(len(raw_dataset) * self.task_cfg.limit)
+        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
         # index dataset
         datasets = []
         for sample in raw_dataset:

@@ -82,7 +97,7 @@ class EvaluatorCollection:
         return datasets, dataset_name

     @staticmethod
-    def _init_name_map(dataset):
+    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
         dataset_name_map = defaultdict(lambda: defaultdict(list))
         for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name

@@ -90,13 +105,13 @@ class EvaluatorCollection:
         return dataset_name_map

     @staticmethod
-    def _init_id_map(dataset):
+    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
         dataset_id_map = {}
         for sample in dataset:
             dataset_id_map[sample.index] = sample
         return dataset_id_map

-    def _initialize_evaluators(self):
+    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
         evaluators = {}
         # load dataset args
         dataset_args = deepcopy(self.task_cfg.dataset_args)

@@ -114,6 +129,8 @@ class EvaluatorCollection:
         return evaluators

     def get_report(self, scores):
+        if not scores:
+            return

         def get_dataframe(scores):
             data = []

@@ -179,11 +196,19 @@ class EvaluatorCollection:
             logger.info(f'{level} Report:\n{table}')

         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
         # save report to JSON file
         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-
-
-
+        report.to_json(report_file_path)
+
+        logger.info(f'Report saved to {report_file_path}')
         return report

     def _filter_answer(self, pred_file_path):

@@ -229,9 +254,12 @@ class EvaluatorCollection:
                 # Process completed tasks
                 for future in as_completed(futures):
                     answer_list, samples = future.result()
-
-
-
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(1)
         else:
             for dataset_name, data_map in dataset_name_map.items():
                 # get evaluator for the dataset

@@ -241,13 +269,14 @@ class EvaluatorCollection:
                     # get batch samples
                     batch_ids = ids[i:i + eval_batch_size]
                     batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                    answer_list,
+                    answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                     # update answers
-                    for
-
-
-
-
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(1)
         return answers

     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:

@@ -277,19 +306,22 @@ class EvaluatorCollection:

         reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                    # Use cached review if available
+                    review_d = review_history_map[file_name][sample.index]
+                else:
+                    # Generate new review
+                    evaluator = self.evaluators[sample.dataset_name]
+                    review_d = evaluator.get_review(answers[sample.index])
+                    # Only save the review if it's not in the cache
+                    self._save_review(review_file_path, file_name, review_d)
+
+                reviews[sample.index] = review_d
+            except Exception as e:
+                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')

         return reviews

@@ -327,6 +359,8 @@ class EvaluatorCollection:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
             evaluator = self.evaluators[sample.dataset_name]
+            if sample.index not in reviews:
+                continue
             review_d = reviews[sample.index]
             score = evaluator.get_score(review_d)
             scores[sample.index] = score
evalscope/config.py
CHANGED
@@ -13,6 +13,7 @@ from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import parse_int_or_float

 logger = get_logger()

@@ -45,7 +46,7 @@ class TaskConfig:
     eval_backend: str = EvalBackend.NATIVE
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
-    limit: Optional[int] = None
+    limit: Optional[Union[int, float]] = None
     eval_batch_size: Optional[int] = None

     # Cache and working directory arguments

@@ -67,7 +68,8 @@ class TaskConfig:
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
     judge_worker_num: int = 1
-    judge_model_args: Optional[Dict] = field(default_factory=
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    analysis_report: bool = False

     def __post_init__(self):
         if self.model is None:

@@ -86,6 +88,10 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
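Taken together, `limit` now accepts either an absolute sample count or a fraction, and `analysis_report` toggles the judge-written report summary. An illustrative `TaskConfig` (model and dataset names are placeholders):

```python
from evalscope import TaskConfig

cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model id
    datasets=['gsm8k'],           # placeholder dataset
    limit=0.1,                    # float: evaluate 10% of each subset; an int would be an absolute cap
    analysis_report=True,         # generate an LLM-written analysis of the final report
)
```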
evalscope/constants.py
CHANGED
@@ -146,7 +146,7 @@ class EvalType:


 class OutputType:
-    LOGITS = 'logits'  # for
+    LOGITS = 'logits'  # for logits output tasks
     GENERATION = 'generation'  # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
     CONTINUOUS = 'continuous_logits'  # for continuous tasks
evalscope/evaluator/evaluator.py
CHANGED
@@ -46,7 +46,6 @@ class Evaluator(object):
         self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
-        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

         self.data_adapter = data_adapter
         self.model_adapter = model_adapter

@@ -79,8 +78,16 @@ class Evaluator(object):
         # Limit and index prompts
         limited_prompts = defaultdict(list)
         for subset_name, prompts_list in prompts.items():
-
-
+            # If limit is None, use all prompts
+            if self.task_cfg.limit is None:
+                limit = len(prompts_list)
+            else:
+                if isinstance(self.task_cfg.limit, int):
+                    limit = self.task_cfg.limit
+                elif isinstance(self.task_cfg.limit, float):
+                    limit = int(len(prompts_list) * self.task_cfg.limit)
+            # Limit the number of prompts
+            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
                 prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)

@@ -101,7 +108,6 @@ class Evaluator(object):
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        answers_list = []
         try:
             # get answer from model
             answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)

@@ -110,10 +116,11 @@ class Evaluator(object):
             # if ignore_errors is True, continue to next input
             if self.task_cfg.ignore_errors:
                 logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return
+                return []
             else:
                 raise e
         # process answer
+        answers_list = []
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

@@ -371,41 +378,46 @@ class Evaluator(object):

         return metric_score

-    def dump_report(self, reviews_score_all: List[dict]
+    def dump_report(self, reviews_score_all: List[dict]):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.

         Args:
             reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.

         Returns: None
         """
+        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+        os.makedirs(report_path, exist_ok=True)
         # Get report map
         report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
+            subset_score_map=reviews_score_all, model_name=self.model_name)

-        #
-
-            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
+        # Post process report
+        self.data_adapter.post_process_report(report_map, report_path=report_path)

         # Make table
-
-
-
-
-
-
+        try:
+            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
+            logger.info(f'\n{self.dataset_name_or_path} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Dump report
+        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+        report_map.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+
         return report_map

     def eval(self, **kwargs) -> dict:

@@ -431,7 +443,7 @@ class Evaluator(object):
             stage == 'review': return the reviews_map
         """

-        logger.info(f'
+        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')

         reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}

@@ -461,6 +473,6 @@ class Evaluator(object):
         # Generate report
         report_map = self.dump_report(reviews_score_all)

-        logger.info(f'
+        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')

         return report_map
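The limit handling added above (and mirrored in `EvaluatorCollection.load`) reduces to one rule, shown here as a standalone helper for clarity; the helper itself is illustrative and not part of evalscope:

```python
def resolve_limit(total: int, limit) -> int:
    """None -> everything, int -> absolute cap, float -> fraction of the dataset."""
    if limit is None:
        return total
    if isinstance(limit, float):
        return min(int(total * limit), total)
    return min(limit, total)


assert resolve_limit(100, None) == 100
assert resolve_limit(100, 25) == 25
assert resolve_limit(100, 0.1) == 10
```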
evalscope/metrics/__init__.py
CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
     from .named_metrics import Metric, metric_registry
-    from .rouge_metric import compute_rouge_score_one_sample_zh
+    from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

 else:
     _import_structure = {

@@ -28,6 +28,8 @@ else:
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
+            'compute_rouge_score',
+            'compute_rouge_score_one_sample',
         ],
         'llm_judge': [
             'LLMJudge',

evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED

@@ -88,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
     """

     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-        check_nltk_data()
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
         else:
+            check_nltk_data()
             self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
             logging.info('Using default tokenizer.')

evalscope/metrics/llm_judge.py
CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501

+DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+

 class LLMJudge:
     """

@@ -47,12 +50,12 @@ class LLMJudge:
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
         """
-        self.api_key = api_key or os.environ.get('
-        self.api_url = api_url or os.environ.get('
-        self.model_id = model_id or os.environ.get('
+        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
-        self.generation_config = generation_config
+        self.generation_config = generation_config or {}

         from evalscope.models import ServerModelAdapter

@@ -74,6 +77,10 @@ class LLMJudge:
         if self.generation_config:
             infer_cfg.update(self.generation_config)

+        if self.model_id == DEFAULT_JUDGE_MODEL:
+            # Disable thinking for the default judge model
+            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
         try:
             # Send request using ServerModelAdapter
             response = self.server_adapter.process_single_input(input_data, infer_cfg)

@@ -82,7 +89,7 @@ class LLMJudge:
             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
             return llm_response
         except Exception as e:
-            logger.error(f'Error during LLM evaluation: {e}')
+            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
             return ''

     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
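The judge now falls back to the ModelScope inference endpoint and the `Qwen/Qwen3-235B-A22B` model when nothing is configured explicitly. An illustrative way to steer it through the environment variables read above (the token value is a placeholder):

```python
import os

os.environ.setdefault('MODELSCOPE_SDK_TOKEN', '<your-modelscope-token>')
os.environ.setdefault('MODELSCOPE_API_BASE', 'https://api-inference.modelscope.cn/v1/')
os.environ.setdefault('MODELSCOPE_JUDGE_LLM', 'Qwen/Qwen3-235B-A22B')

from evalscope.metrics import LLMJudge  # exported via the lazy _import_structure shown earlier

judge = LLMJudge()  # picks up the env vars / built-in defaults from the diff above
```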
evalscope/metrics/math_parser.py
CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
 # flake8: noqa
 import re
 import regex
-from
+from latex2sympy2_extended import latex2sympy
 from math import isclose
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
evalscope/metrics/t2v_metrics/__init__.py
CHANGED

@@ -1,66 +1,52 @@
-from __future__ import absolute_import, division, print_function
-
-from .clipscore import CLIPScore, list_all_clipscore_models
-from .constants import CACHE_DIR
-from .itmscore import ITMScore, list_all_itmscore_models
-from .vqascore import VQAScore, list_all_vqascore_models
-
-
-def list_all_models():
-    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
-
-
-def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
-    if model in list_all_vqascore_models():
-        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_clipscore_models():
-        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_itmscore_models():
-        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    else:
-        raise NotImplementedError()
-
-
 def clip_flant5_score():
+    from .vqascore import VQAScore
     clip_flant5_score = VQAScore(model='clip-flant5-xxl')
     return clip_flant5_score


 def pick_score():
+    from .clipscore import CLIPScore
     pick_score = CLIPScore(model='pickscore-v1')
     return pick_score


 def clip_score():
+    from .clipscore import CLIPScore
     clip_score = CLIPScore(model='openai:ViT-L-14-336')
     return clip_score


 def blip2_score():
+    from .itmscore import ITMScore
     blip_itm_score = ITMScore(model='blip2-itm')
     return blip_itm_score


 def hpsv2_score():
+    from .clipscore import CLIPScore
     hpsv2_score = CLIPScore(model='hpsv2')
     return hpsv2_score


 def hpsv2_1_score():
+    from .clipscore import CLIPScore
     hpsv2_1_score = CLIPScore(model='hpsv2.1')
     return hpsv2_1_score


 def image_reward_score():
+    from .itmscore import ITMScore
     image_reward_score = ITMScore(model='image-reward-v1')
     return image_reward_score


 def fga_blip2_score():
+    from .itmscore import ITMScore
     fga_blip2_score = ITMScore(model='fga_blip2')
     return fga_blip2_score


 def mps_score():
+    from .clipscore import CLIPScore
     mps_score = CLIPScore(model='mps')
     return mps_score
evalscope/models/adapters/__init__.py
CHANGED

@@ -1,4 +1,5 @@
 from .base_adapter import BaseModelAdapter, initialize_model_adapter
+from .bfcl_adapter import BFCLAdapter
 from .chat_adapter import ChatGenerationModelAdapter
 from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
 from .custom_adapter import CustomModelAdapter

@@ -13,5 +14,6 @@ __all__ = [
     'MultiChoiceModelAdapter',
     'CustomModelAdapter',
     'ServerModelAdapter',
+    'BFCLAdapter',
     'T2IModelAdapter',
 ]
evalscope/models/adapters/base_adapter.py
CHANGED

@@ -44,35 +44,39 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-        from evalscope.models import ServerModelAdapter
-
-        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
-            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
-                           'Setting output type to generation by default.')
-            benchmark.model_adapter = OutputType.GENERATION
-
-        return ServerModelAdapter(
-            api_url=task_cfg.api_url,
-            model_id=task_cfg.model,
-            api_key=task_cfg.api_key,
-            seed=task_cfg.seed,
-            timeout=task_cfg.timeout,
-            stream=task_cfg.stream,
-        )
     else:
         from ..register import get_model_adapter

-        #
+        # we need to determine the model adapter class based on the output type
         model_adapter_cls_str = benchmark.model_adapter
-        if model_adapter_cls_str not in benchmark.output_types:
-            logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
-                           f'Using {benchmark.output_types[0]} instead.')
-            model_adapter_cls_str = benchmark.output_types[0]

-
-
-
-
-
-
+        if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
+
+            if 'server' not in model_adapter_cls_str:
+                model_adapter_cls_str = 'server'
+                logger.info(
+                    f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
+
+            # init server model adapter
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+
+            return model_adapter_cls(
+                api_url=task_cfg.api_url,
+                model_id=task_cfg.model,
+                api_key=task_cfg.api_key,
+                seed=task_cfg.seed,
+                timeout=task_cfg.timeout,
+                stream=task_cfg.stream,
+            )
+        else:
+            if model_adapter_cls_str not in benchmark.output_types:
+                logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
+                               f'Using {benchmark.output_types[0]} instead.')
+                model_adapter_cls_str = benchmark.output_types[0]
+
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+            return model_adapter_cls(
+                model=base_model,
+                generation_config=task_cfg.generation_config,
+                chat_template=task_cfg.chat_template,
+                task_cfg=task_cfg)
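Adapter classes are now resolved uniformly through the registry, including for API ('server') evaluation, which is what lets the new `BFCLAdapter` plug in without special-casing in `initialize_model_adapter`. A conceptual sketch of the lookup (the `'server'` key appears in the diff; any other key is hypothetical):

```python
from evalscope.models.register import get_model_adapter

server_adapter_cls = get_model_adapter('server')  # the server-mode adapter class used for API models
# get_model_adapter('bfcl_server')  # hypothetical key; the exact registry name for BFCLAdapter
#                                   # is not shown in this diff
```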