evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py
CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
 
 
 class ParseStrArgsAction(argparse.Action):
@@ -35,6 +35,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
     parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
     parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+    parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.')  # noqa: E501
 
     # Template-related arguments
     parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
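For orientation, a minimal sketch of how the new argument behaves when parsed. The literal task strings are an assumption; the diff only exposes the ModelTask constants, not their values.

    # Sketch only: the new --model-task flag with assumed string values.
    import argparse

    class ModelTask:  # stand-in for evalscope.constants.ModelTask
        TEXT_GENERATION = 'text_generation'    # assumed value
        IMAGE_GENERATION = 'image_generation'  # assumed value

    parser = argparse.ArgumentParser()
    parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION,
                        choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION])
    args = parser.parse_args(['--model-task', ModelTask.IMAGE_GENERATION])
    print(args.model_task)  # -> 'image_generation'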
evalscope/backend/rag_eval/__init__.py
CHANGED

@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
 from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py
CHANGED

@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class Tools:
+    MTEB = 'mteb'
+    RAGAS = 'ragas'
+    CLIP_BENCHMARK = 'clip_benchmark'
+
+
 class RAGEvalBackendManager(BackendManager):
 
     def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
        from evalscope.backend.rag_eval.ragas.tasks import generate_testset
 
        if testset_args is not None:
-
+            if isinstance(testset_args, dict):
+                generate_testset(TestsetGenerationArguments(**testset_args))
+            elif isinstance(testset_args, TestsetGenerationArguments):
+                generate_testset(testset_args)
+            else:
+                raise ValueError('Please provide the testset generation arguments.')
        if eval_args is not None:
-
+            if isinstance(eval_args, dict):
+                rag_eval(EvaluationArguments(**eval_args))
+            elif isinstance(eval_args, EvaluationArguments):
+                rag_eval(eval_args)
+            else:
+                raise ValueError('Please provide the evaluation arguments.')
 
     @staticmethod
     def run_clip_benchmark(args):
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
 
     def run(self, *args, **kwargs):
         tool = self.config_d.pop('tool')
-        if tool.lower() ==
+        if tool.lower() == Tools.MTEB:
             self._check_env('mteb')
             model_args = self.config_d['model']
             eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.RAGAS:
             self._check_env('ragas')
             testset_args = self.config_d.get('testset_generation', None)
             eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.CLIP_BENCHMARK:
             self._check_env('webdataset')
             self.run_clip_benchmark(self.config_d['eval'])
         else:
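As a rough illustration of the dispatch above: run() pops 'tool' and compares it against the new Tools constants, then reads the remaining keys. A sketch of the config shapes this implies follows; the nested values are assumptions, only the keys read by run()/run_mteb()/run_ragas() are grounded in the diff.

    # Hypothetical RAG-eval config dicts matching the dispatch in run().
    mteb_config = {
        'tool': 'mteb',            # compared against Tools.MTEB after .lower()
        'model': {},               # passed to run_mteb() as model_args (shape assumed)
        'eval': {},                # passed to run_mteb() as eval_args (shape assumed)
    }
    ragas_config = {
        'tool': 'ragas',           # compared against Tools.RAGAS
        'testset_generation': {},  # expanded into TestsetGenerationArguments(**...)
        'eval': {},                # expanded into EvaluationArguments(**...)
    }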
evalscope/backend/rag_eval/cmteb/arguments.py
CHANGED

@@ -20,6 +20,12 @@ class ModelArguments:
     encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
     hub: str = 'modelscope'  # modelscope or huggingface
 
+    # for API embedding model
+    model_name: Optional[str] = None
+    api_base: Optional[str] = None
+    api_key: Optional[str] = None
+    dimensions: Optional[int] = None
+
     def to_dict(self) -> Dict[str, Any]:
         return {
             'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
             'hub': self.hub,
+            'model_name': self.model_name,
+            'api_base': self.api_base,
+            'api_key': self.api_key,
+            'dimensions': self.dimensions,
         }
 
 
evalscope/backend/rag_eval/ragas/arguments.py
CHANGED

@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
CHANGED

@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
 
 
 def load_data(file_path):
-
+    import nltk
+    from langchain_unstructured import UnstructuredLoader
 
-
+    if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+        # need to download nltk data for the first time
+        nltk.download('averaged_perceptron_tagger_eng')
+
+    loader = UnstructuredLoader(file_path)
     data = loader.load()
     return data
 
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
CHANGED

@@ -2,7 +2,6 @@ import asyncio
 import os
 from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin, PydanticPrompt
-from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
 from typing import List
 
 from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
     llm: BaseRagasLLM,
     adapt_instruction: bool = False,
 ):
-    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
-        return
-
     if not issubclass(type(prompt_user), PromptMixin):
         logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
         return
evalscope/backend/rag_eval/utils/embedding.py
CHANGED

@@ -1,10 +1,12 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
 
     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
         prompt: str = '',
-        revision: Optional[str] =
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
         )
 
-    def predict(self, sentences: List[List[str]], **kwargs) ->
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
         return embeddings
 
 
+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=False)
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+    def encode_queries(self, queries, **kwargs):
+        return self.encode(queries, **kwargs)
+
+    def encode_corpus(self, corpus, **kwargs):
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+        return self.encode(input_texts, **kwargs)
+
+
 class EmbeddingModel:
     """Custom embeddings"""
 
@@ -165,6 +207,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
         **kwargs,
     ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
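A rough usage sketch of the new API-embedding path, driven directly through APIEmbeddingModel as added above. The endpoint, key, and model name are placeholders, and it assumes the surrounding BaseModel plumbing (encode_kwargs, default batch size of 10) behaves as the rest of the file implies.

    # Sketch: API-backed embeddings via an OpenAI-compatible endpoint (placeholders).
    from evalscope.backend.rag_eval.utils.embedding import APIEmbeddingModel

    emb = APIEmbeddingModel(
        model_name='text-embedding-v3',      # served model name (placeholder)
        api_base='https://example.com/v1',   # OpenAI-compatible endpoint (placeholder)
        api_key='EMPTY',
        dimensions=1024,
    )
    # encode() batches requests (default batch size 10) and returns a torch.Tensor.
    vectors = emb.encode(['a query about retrieval evaluation'])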
evalscope/backend/rag_eval/utils/llm.py
CHANGED

@@ -2,7 +2,7 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from
+from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from evalscope.constants import DEFAULT_MODEL_REVISION
@@ -16,9 +16,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-
-
-
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
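For context, the api_base branch above amounts to constructing a langchain ChatOpenAI client from the user-supplied keyword arguments; a minimal equivalent sketch with placeholder values:

    # Sketch: what the api_base branch above boils down to (placeholders only).
    from langchain_openai import ChatOpenAI

    kw = {'model_name': 'qwen-plus', 'api_base': 'https://example.com/v1', 'api_key': 'EMPTY'}
    judge = ChatOpenAI(
        model=kw.get('model_name', ''),
        base_url=kw['api_base'],
        api_key=kw.get('api_key', 'EMPTY'),
    )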
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED

@@ -1,4 +1,5 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
            del remain_cfg['name']  # remove not used args
            del remain_cfg['type']  # remove not used args
 
-
-
+            norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+            new_model_names.append(norm_model_type)
         else:
             remain_cfg = copy.deepcopy(model_cfg)
             del remain_cfg['name']  # remove not used args
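A quick illustration of the normalization added above: model identifiers containing path separators, colons, or dots are flattened into a registry-safe key. The identifiers below are made up for illustration.

    # Demonstration of the key normalization used above (example names invented).
    import os

    for model_type in ['Qwen/Qwen2.5-VL-7B-Instruct', 'ollama:llava:7b']:
        norm = os.path.basename(model_type).replace(':', '-').replace('.', '_')
        print(model_type, '->', norm)
    # Qwen/Qwen2.5-VL-7B-Instruct -> Qwen2_5-VL-7B-Instruct
    # ollama:llava:7b -> ollama-llava-7b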
evalscope/benchmarks/__init__.py
CHANGED
@@ -10,8 +10,8 @@ from evalscope.utils import get_logger
 logger = get_logger()
 
 # Using glob to find all files matching the pattern
-pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
-files = glob.glob(pattern, recursive=
+pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
+files = glob.glob(pattern, recursive=True)
 
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):

evalscope/benchmarks/aigc/__init__.py
File without changes

evalscope/benchmarks/aigc/t2i/__init__.py
File without changes
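The widened glob pattern above is what lets the new nested aigc/t2i adapters be auto-discovered. A small self-contained check of the pattern change (the directory layout is made up for the demo):

    # Demo: only the recursive '**' pattern sees adapters nested two levels deep.
    import glob, os, tempfile

    root = tempfile.mkdtemp()
    os.makedirs(os.path.join(root, 'aigc', 't2i'))
    open(os.path.join(root, 'aigc', 't2i', 'hpdv2_adapter.py'), 'w').close()

    old = glob.glob(os.path.join(root, '*', '*_adapter.py'))
    new = glob.glob(os.path.join(root, '*', '**', '*_adapter.py'), recursive=True)
    print(len(old), len(new))  # 0 1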
evalscope/benchmarks/aigc/t2i/base.py
ADDED

@@ -0,0 +1,56 @@
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import DataAdapter
+from evalscope.metrics import mean, metric_registry
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class T2IBaseAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+        logger.info(f'Initializing metrics: {self.metric_list}')
+        self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # dummy prompt for general t2i
+        return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # dummy gold answer for general t2i
+        return input_d.get('prompt', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        # dummy parse pred result for general t2i
+        return result or raw_input_d.get('image_path', '')
+
+    def match(self, gold: str, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold])[0][0]
+            if isinstance(score, dict):
+                for k, v in score.items():
+                    res[f'{metric_name}_{k}'] = v.cpu().item()
+            else:
+                res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
+        return res
+
+    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: List[dict]
+
+        """
+        items = super().compute_dict_metric(review_res_list, **kwargs)
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
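To make the scoring flow above concrete: match() returns one score dict per generated image, and compute_metric() averages each key across samples. A hedged sketch of that aggregation (the numbers are invented, and the grouping loop is only a rough stand-in for compute_dict_metric()):

    # Sketch: per-sample match() results averaged into the final report entries.
    review_res_list = [
        {'PickScore': 0.21, 'HPSv2.1Score': 0.27},
        {'PickScore': 0.25, 'HPSv2.1Score': 0.31},
    ]
    grouped = {}
    for res in review_res_list:            # roughly what compute_dict_metric() does
        for name, value in res.items():
            grouped.setdefault(name, []).append(value)
    report = [{'metric_name': k, 'score': sum(v) / len(v), 'num': len(v)} for k, v in grouped.items()]
    print(report)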
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py
ADDED

@@ -0,0 +1,77 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.metrics import mean
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='evalmuse',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['EvalMuse'],
+    metric_list=['FGA_BLIP2Score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class EvalMuseAdapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
+
+    def get_gold_answer(self, input_d: dict) -> dict:
+        # return prompt and elements dict
+        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+    def match(self, gold: dict, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            if metric_name == 'FGA_BLIP2Score':
+                # For FGA_BLIP2Score, we need to pass the dictionary
+                score = metric_func(images=[pred], texts=[gold])[0][0]
+            else:
+                score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+            if isinstance(score, dict):
+                for k, v in score.items():
+                    res[f'{metric_name}:{k}'] = v.cpu().item()
+            else:
+                res[metric_name] = score.cpu().item()
+        return res
+
+    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+        """
+        items = super().compute_dict_metric(review_res_list, **kwargs)
+        # add statistics for each metric
+        new_items = defaultdict(list)
+        for metric_name, value_list in items.items():
+            if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
+                metrics_prefix = metric_name.split(':')[0]
+                category = metric_name.rpartition('(')[-1].split(')')[0]
+                new_items[f'{metrics_prefix}:{category}'].extend(value_list)
+            else:
+                new_items[metric_name].extend(value_list)
+
+        # calculate mean for each metric
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
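The compute_metric() grouping above pools element-level scores by the category in parentheses; the exact element naming comes from the FGA-BLIP2 metric and is not shown in this diff. A tiny demo of the parsing with a hypothetical key:

    # Hypothetical element-score key, parsed the same way as in compute_metric().
    metric_name = 'FGA_BLIP2Score:red car(object)'            # example key, invented
    metrics_prefix = metric_name.split(':')[0]                # 'FGA_BLIP2Score'
    category = metric_name.rpartition('(')[-1].split(')')[0]  # 'object'
    print(f'{metrics_prefix}:{category}')                     # FGA_BLIP2Score:object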
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py
ADDED

@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='genai_bench',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['GenAI-Bench-1600'],
+    metric_list=['VQAScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class GenAIBenchAdapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
+
+    def get_gold_answer(self, input_d: dict) -> dict:
+        # return prompt and elements dict
+        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+    def match(self, gold: dict, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+            res[metric_name] = score.cpu().item()
+
+            # fine-granular metrics
+            if gold['tags'].get('advanced'):
+                res[f'{metric_name}_advanced'] = score.cpu().item()
+            else:
+                res[f'{metric_name}_basic'] = score.cpu().item()
+
+        return res
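Judging only from the fields read by get_gold_answer() and match() above, a GenAI-Bench record is expected to carry a prompt plus a tags mapping with an 'advanced' flag. A hypothetical JSONL line with invented values:

    # Hypothetical record shape inferred from the adapter above (not from the dataset).
    import json

    record = {'prompt': 'a robot painting a sunset', 'tags': {'advanced': True}}
    print(json.dumps(record))  # the 'advanced' flag routes the score to VQAScore_advanced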
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py
ADDED

@@ -0,0 +1,58 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='general_t2i',
+    dataset_id='general_t2i',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['default'],
+    metric_list=['PickScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class GeneralT2IAdapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
+
+        data_file_dict = defaultdict(str)
+        data_list = []
+
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
+        try:
+            for subset_name, file_path in data_file_dict.items():
+                data_list.extend(jsonl_to_list(file_path))
+        except Exception as e:
+            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
+
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+
+        return data_dict
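A hedged sketch of preparing a local prompt file for the general_t2i benchmark, based on the load() behaviour above; the 'prompt'/'id' fields come from T2IBaseAdapter, while the file name and contents are invented.

    # Sketch: a minimal local JSONL file that GeneralT2IAdapter.load() accepts when
    # the dataset path points at a .jsonl file (or a directory of <subset>.jsonl files).
    import json

    records = [
        {'id': 0, 'prompt': 'a watercolor painting of a lighthouse at dawn'},
        {'id': 1, 'prompt': 'a macro photo of a snowflake on blue wool'},
    ]
    with open('default.jsonl', 'w') as f:
        for r in records:
            f.write(json.dumps(r) + '\n')
    # Passing this file as the dataset path yields {'default': {'test': [...]}} from
    # load(), and the registered PickScore metric is computed per generated image.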
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py
ADDED

@@ -0,0 +1,57 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='hpdv2',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['HPDv2'],
+    metric_list=['HPSv2.1Score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class HPDv2Adapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
+
+    def get_gold_answer(self, input_d: dict) -> dict:
+        # return prompt and elements dict
+        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+    def match(self, gold: dict, pred: str) -> dict:
+        # dummy match for general t2i
+        # pred is the image path, gold is the prompt
+        res = {}
+        for metric_name, metric_func in self.metrics.items():
+            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+            res[metric_name] = score.cpu().item()
+
+            # fine-granular metrics
+            category = gold['tags'].get('category')
+            if category:
+                res[f'{metric_name}_{category}'] = score.cpu().item()
+
+        return res
evalscope/benchmarks/aigc/t2i/tifa_adapter.py
ADDED

@@ -0,0 +1,37 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import OutputType
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from .base import T2IBaseAdapter
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tifa160',
+    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+    model_adapter=OutputType.IMAGE_GENERATION,
+    output_types=[OutputType.IMAGE_GENERATION],
+    subset_list=['TIFA-160'],
+    metric_list=['PickScore'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class TIFA_Adapter(T2IBaseAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs) -> dict:
+        if os.path.isfile(self.dataset_id):
+            data_list = jsonl_to_list(self.dataset_id)
+            data_dict = {self.subset_list[0]: {'test': data_list}}
+            return data_dict
+        else:
+            return super().load(**kwargs)
evalscope/benchmarks/aime/aime24_adapter.py
CHANGED

@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa