evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
 
 
 class ParseStrArgsAction(argparse.Action):
@@ -35,6 +35,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
     parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
     parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+    parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.')  # noqa: E501
 
     # Template-related arguments
     parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
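Note: a minimal standalone sketch of how the new --model-task flag behaves. The ModelTask class below is a hypothetical stand-in for evalscope.constants.ModelTask; the string values are assumptions and may differ from the real constants.

import argparse


class ModelTask:
    # Hypothetical stand-in for evalscope.constants.ModelTask; actual values may differ.
    TEXT_GENERATION = 'text_generation'
    IMAGE_GENERATION = 'image_generation'


parser = argparse.ArgumentParser()
parser.add_argument(
    '--model-task',
    type=str,
    default=ModelTask.TEXT_GENERATION,
    choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION],
    help='The model task for model id.')

# Omitting the flag falls back to text generation; an unknown value raises an argparse error.
args = parser.parse_args(['--model-task', ModelTask.IMAGE_GENERATION])
print(args.model_task)  # image_generation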
evalscope/benchmarks/__init__.py
CHANGED
@@ -10,8 +10,8 @@ from evalscope.utils import get_logger
 logger = get_logger()
 
 # Using glob to find all files matching the pattern
-pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
-files = glob.glob(pattern, recursive=
+pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
+files = glob.glob(pattern, recursive=True)
 
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
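Note: a small self-contained sketch of what the recursive pattern picks up that the old flat pattern missed, namely adapters nested one level deeper such as aigc/t2i/*_adapter.py. The directory layout is a hypothetical illustration created in a temp dir.

import glob
import os
import tempfile

# Hypothetical layout mirroring evalscope/benchmarks: one top-level and one nested adapter.
base_dir = tempfile.mkdtemp()
os.makedirs(os.path.join(base_dir, 'aime'))
os.makedirs(os.path.join(base_dir, 'aigc', 't2i'))
open(os.path.join(base_dir, 'aime', 'aime24_adapter.py'), 'w').close()
open(os.path.join(base_dir, 'aigc', 't2i', 'hpdv2_adapter.py'), 'w').close()

flat = glob.glob(os.path.join(base_dir, '*', '*_adapter.py'))
nested = glob.glob(os.path.join(base_dir, '*', '**', '*_adapter.py'), recursive=True)

print(len(flat))    # 1: only the top-level adapter
print(len(nested))  # 2: the nested t2i adapter is now discovered as well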
|
File without changes
|
|
File without changes
|
|
evalscope/benchmarks/aigc/t2i/base.py
ADDED
@@ -0,0 +1,56 @@
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import DataAdapter
+from evalscope.metrics import mean, metric_registry
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class T2IBaseAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+        logger.info(f'Initializing metrics: {self.metric_list}')
+        self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # dummy prompt for general t2i
+        return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # dummy gold answer for general t2i
+        return input_d.get('prompt', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        # dummy parse pred result for general t2i
+        return result or raw_input_d.get('image_path', '')
+
+    def match(self, gold: str, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold])[0][0]
+            if isinstance(score, dict):
+                for k, v in score.items():
+                    res[f'{metric_name}_{k}'] = v.cpu().item()
+            else:
+                res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
+        return res
+
+    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: List[dict]
+
+        """
+        items = super().compute_dict_metric(review_res_list, **kwargs)
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
import os.path
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
|
|
6
|
+
from evalscope.benchmarks import Benchmark
|
|
7
|
+
from evalscope.constants import OutputType
|
|
8
|
+
from evalscope.metrics import mean
|
|
9
|
+
from evalscope.utils.io_utils import jsonl_to_list
|
|
10
|
+
from evalscope.utils.logger import get_logger
|
|
11
|
+
from .base import T2IBaseAdapter
|
|
12
|
+
|
|
13
|
+
logger = get_logger()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@Benchmark.register(
|
|
17
|
+
name='evalmuse',
|
|
18
|
+
dataset_id='AI-ModelScope/T2V-Eval-Prompts',
|
|
19
|
+
model_adapter=OutputType.IMAGE_GENERATION,
|
|
20
|
+
output_types=[OutputType.IMAGE_GENERATION],
|
|
21
|
+
subset_list=['EvalMuse'],
|
|
22
|
+
metric_list=['FGA_BLIP2Score'],
|
|
23
|
+
few_shot_num=0,
|
|
24
|
+
train_split=None,
|
|
25
|
+
eval_split='test',
|
|
26
|
+
)
|
|
27
|
+
class EvalMuseAdapter(T2IBaseAdapter):
|
|
28
|
+
|
|
29
|
+
def __init__(self, **kwargs):
|
|
30
|
+
super().__init__(**kwargs)
|
|
31
|
+
|
|
32
|
+
def load(self, **kwargs) -> dict:
|
|
33
|
+
if os.path.isfile(self.dataset_id):
|
|
34
|
+
data_list = jsonl_to_list(self.dataset_id)
|
|
35
|
+
data_dict = {self.subset_list[0]: {'test': data_list}}
|
|
36
|
+
return data_dict
|
|
37
|
+
else:
|
|
38
|
+
return super().load(**kwargs)
|
|
39
|
+
|
|
40
|
+
def get_gold_answer(self, input_d: dict) -> dict:
|
|
41
|
+
# return prompt and elements dict
|
|
42
|
+
return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
|
|
43
|
+
|
|
44
|
+
def match(self, gold: dict, pred: str) -> dict:
|
|
45
|
+
# dummy match for general t2i
|
|
46
|
+
# pred is the image path, gold is the prompt
|
|
47
|
+
res = {}
|
|
48
|
+
for metric_name, metric_func in self.metrics.items():
|
|
49
|
+
if metric_name == 'FGA_BLIP2Score':
|
|
50
|
+
# For FGA_BLIP2Score, we need to pass the dictionary
|
|
51
|
+
score = metric_func(images=[pred], texts=[gold])[0][0]
|
|
52
|
+
else:
|
|
53
|
+
score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
|
|
54
|
+
if isinstance(score, dict):
|
|
55
|
+
for k, v in score.items():
|
|
56
|
+
res[f'{metric_name}:{k}'] = v.cpu().item()
|
|
57
|
+
else:
|
|
58
|
+
res[metric_name] = score.cpu().item()
|
|
59
|
+
return res
|
|
60
|
+
|
|
61
|
+
def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
|
|
62
|
+
"""
|
|
63
|
+
compute weighted mean of the bleu score of all samples
|
|
64
|
+
"""
|
|
65
|
+
items = super().compute_dict_metric(review_res_list, **kwargs)
|
|
66
|
+
# add statistics for each metric
|
|
67
|
+
new_items = defaultdict(list)
|
|
68
|
+
for metric_name, value_list in items.items():
|
|
69
|
+
if 'FGA_BLIP2Score' in metric_name and '(' in metric_name: # FGA_BLIP2Score element score
|
|
70
|
+
metrics_prefix = metric_name.split(':')[0]
|
|
71
|
+
category = metric_name.rpartition('(')[-1].split(')')[0]
|
|
72
|
+
new_items[f'{metrics_prefix}:{category}'].extend(value_list)
|
|
73
|
+
else:
|
|
74
|
+
new_items[metric_name].extend(value_list)
|
|
75
|
+
|
|
76
|
+
# calculate mean for each metric
|
|
77
|
+
return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
|
|
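Note: the grouping in compute_metric() above relies on element-level FGA_BLIP2Score keys that carry a parenthesised category. A minimal sketch of that string parsing follows; the metric-name strings are hypothetical examples of the assumed '<metric>:<element> (<category>)' shape.

# Hypothetical element-level metric names.
metric_names = [
    'FGA_BLIP2Score:overall',
    'FGA_BLIP2Score:blue (color)',
    'FGA_BLIP2Score:running (activity)',
]

for metric_name in metric_names:
    if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:
        metrics_prefix = metric_name.split(':')[0]
        category = metric_name.rpartition('(')[-1].split(')')[0]
        print(f'{metric_name} -> {metrics_prefix}:{category}')
    else:
        print(f'{metric_name} -> {metric_name}')
# FGA_BLIP2Score:overall -> FGA_BLIP2Score:overall
# FGA_BLIP2Score:blue (color) -> FGA_BLIP2Score:color
# FGA_BLIP2Score:running (activity) -> FGA_BLIP2Score:activity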
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py
ADDED
@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='genai_bench',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['GenAI-Bench-1600'],
+    metric_list=['VQAScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class GenAIBenchAdapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
+
+    def get_gold_answer(self, input_d: dict) -> dict:
+        # return prompt and elements dict
+        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+    def match(self, gold: dict, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+            res[metric_name] = score.cpu().item()
+
+            # fine-granular metrics
+            if gold['tags'].get('advanced'):
+                res[f'{metric_name}_advanced'] = score.cpu().item()
+            else:
+                res[f'{metric_name}_basic'] = score.cpu().item()
+
+        return res
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py
ADDED
@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='general_t2i',
+    dataset_id='general_t2i',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['default'],
+    metric_list=['PickScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class GeneralT2IAdapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
+
+        data_file_dict = defaultdict(str)
+        data_item_dict = defaultdict(list)
+
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
+        try:
+            for subset_name, file_path in data_file_dict.items():
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
+        except Exception as e:
+            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
+
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
+
+        return data_dict
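Note: for the single-file case above, the subset name is derived from the file name. A short sketch of that path handling and the resulting data_dict shape; the path and records below are hypothetical.

import os

# Hypothetical local dataset file passed as the dataset path.
dataset_name_or_path = '/data/prompts/my_t2i_prompts.jsonl'

cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
print(cur_subset_name)  # my_t2i_prompts

# Each JSONL record is then grouped under {subset: {'test': [records...]}}, e.g.:
records = [{'prompt': 'a red bicycle leaning against a wall', 'id': 0}]
data_dict = {cur_subset_name: {'test': records}}
print(data_dict)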
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py
ADDED
@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='hpdv2',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['HPDv2'],
+    metric_list=['HPSv2.1Score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class HPDv2Adapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
+
+    def get_gold_answer(self, input_d: dict) -> dict:
+        # return prompt and elements dict
+        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+    def match(self, gold: dict, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+            res[metric_name] = score.cpu().item()
+
+            # fine-granular metrics
+            category = gold['tags'].get('category')
+            if category:
+                res[f'{metric_name}_{category}'] = score.cpu().item()
+
+        return res
evalscope/benchmarks/aigc/t2i/tifa_adapter.py
ADDED
@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tifa160',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['TIFA-160'],
+    metric_list=['PickScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class TIFA_Adapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
evalscope/benchmarks/aime/aime24_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/aime/aime25_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -11,12 +11,12 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
-    dataset_id='
-    subset_list=['
+    dataset_id='opencompass/AIME2025',
+    subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
-    eval_split='
+    eval_split='test',  # Only train set is available
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class AIME25Adapter(DataAdapter):
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
CHANGED
@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/arc/arc_adapter.py
CHANGED
@@ -18,7 +18,7 @@ logger = get_logger()
     name='arc',
     pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/arena_hard/arena_hard_adapter.py
CHANGED
@@ -3,9 +3,7 @@ from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -127,7 +127,7 @@ SUBJECT_MAPPING = {
     name='ceval',
     pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
CHANGED
@@ -1,10 +1,8 @@
 import re
-from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED
@@ -104,7 +104,7 @@ SUBJECT_MAPPING = {
     name='cmmlu',
     pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED
@@ -6,8 +6,7 @@ import os
 from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/data_adapter.py
CHANGED
@@ -3,12 +3,11 @@ import os.path
 import random
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics
-from evalscope.metrics.named_metrics import metric_registry
+from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
 
@@ -24,6 +23,7 @@ class DataAdapter(ABC):
                  subset_list: list,
                  metric_list: List[str],
                  llm_as_a_judge: bool = False,
+                 output_types: Optional[List[str]] = None,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -63,6 +63,7 @@ class DataAdapter(ABC):
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.output_types = output_types or [model_adapter]
         self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
@@ -190,7 +191,7 @@ class DataAdapter(ABC):
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'Use
+        logger.info(f'Use settings: '
                     f'> few_shot_num: {self.few_shot_num}, '
                     f'> few_shot_split: {self.train_split}, '
                     f'> target_eval_split: {self.eval_split}')
@@ -245,7 +246,8 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
 
-    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+                            **kwargs) -> Dict[str, List[float]]:
         """
         compute weighted mean of the bleu score of all samples
 
@@ -253,7 +255,7 @@ class DataAdapter(ABC):
             review_res_list: [score1, score2, ...]
 
         Returns:
-            avg_res: List[
+            avg_res: Dict[str, List[float]]
 
         """
         if isinstance(review_res_list[0], list):
@@ -318,11 +320,16 @@ class DataAdapter(ABC):
                         prompt: str,
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
+                        index: Optional[Union[int, str]] = None,
+                        id: Optional[Union[int, str]] = None,
                         **kwargs) -> dict:
-        if not isinstance(prompt, list)
-            prompt = [prompt]
+        data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
-            data=
+            data=data,
+            multi_choices=choices or self.choices,
+            system_prompt=system_prompt or self.system_prompt,
+            index=index or 0,
+            id=id)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
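Note: compute_dict_metric() now returns Dict[str, List[float]]. A rough sketch of how per-sample review dicts are grouped by metric name and then averaged by callers such as compute_metric() in the t2i adapters above; the metric names and scores below are made up, and statistics.mean stands in for evalscope.metrics.mean.

from collections import defaultdict
from statistics import mean  # stand-in for evalscope.metrics.mean

# Hypothetical per-sample review results, one dict of metric scores per sample.
review_res_list = [
    {'PickScore': 20.5, 'HPSv2.1Score': 0.27},
    {'PickScore': 22.1, 'HPSv2.1Score': 0.31},
]

grouped = defaultdict(list)  # Dict[str, List[float]]
for sample_scores in review_res_list:
    for metric_name, score in sample_scores.items():
        grouped[metric_name].append(score)

# Callers then reduce each list to a mean for the report.
report = [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in grouped.items()]
print(report)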
evalscope/benchmarks/data_collection/data_collection_adapter.py
CHANGED
@@ -48,14 +48,16 @@ class DataCollectionAdapter(DataAdapter):
             if len(dataset) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-            from modelscope
+            from modelscope import dataset_snapshot_download
 
             # Load dataset from remote
             logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
 
-
-
-
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
+            # find the jsonl file
+            dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
+            dataset = jsonl_to_list(dataset_files[0])
 
         return dataset
 
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -17,7 +17,7 @@ logger = get_logger()
     name='general_mcq',
     pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list
 
         data_file_dict = defaultdict(str)
-
+        data_item_dict = defaultdict(list)
 
         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {subset_name: {'test':
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
 
         return data_dict
 