evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -2,11 +2,10 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
-from evalscope.
-from evalscope.
+from evalscope.api.model import GenerateConfig, Model, get_model
+from evalscope.constants import DEFAULT_MODEL_REVISION, EvalType
 
 
 class LLM:
@@ -30,16 +29,19 @@ class LocalLLM(BaseLLM):
     model_name_or_path: str
     model_revision: str = DEFAULT_MODEL_REVISION
     template_type: Optional[str] = None
-    model_name: Optional[str]
-    model: Optional[
-    generation_config: Optional[Dict]
+    model_name: Optional[str] = None
+    model: Optional[Model] = None
+    generation_config: Optional[Dict] = {}
 
     def __init__(self, **kw):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
-
-
-
+
+        # Create and initialize the local model
+        self.model = get_model(
+            model=self.model_name_or_path,
+            eval_type=EvalType.CHECKPOINT,
+            config=GenerateConfig(**self.generation_config),
         )
 
     def _call(
@@ -50,10 +52,9 @@ class LocalLLM(BaseLLM):
         **kwargs: Any,
     ) -> str:
         """Run the LLM on the given input."""
-        infer_cfg = {'stop': stop}
 
-        response
-        return response
+        response = self.model.generate(input=prompt)
+        return response.completion
 
     @property
     def _identifying_params(self) -> Dict[str, Any]:
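For orientation, here is a minimal sketch of the 1.0.0 model API that this hunk migrates `LocalLLM` onto, assuming an evalscope>=1.0.0 install; the model id and prompt are placeholders.

```python
# Minimal sketch of the evalscope 1.0.0 model API used in the hunk above.
# Assumes evalscope>=1.0.0; the model id and prompt are placeholders.
from evalscope.api.model import GenerateConfig, get_model
from evalscope.constants import EvalType

# get_model() loads a local checkpoint when eval_type is CHECKPOINT
model = get_model(
    model='path/or/model-id',  # placeholder
    eval_type=EvalType.CHECKPOINT,
    config=GenerateConfig(),  # defaults; pass generation kwargs here
)

# generate() returns an output object whose .completion holds the text
response = model.generate(input='Hello!')
print(response.completion)
```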
evalscope/benchmarks/__init__.py CHANGED
evalscope/benchmarks/aigc/i2i/__init__.py ADDED
File without changes
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py ADDED
@@ -0,0 +1,44 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class GeneralI2IAdapter:
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
+
+        data_file_dict = defaultdict(str)
+        data_item_dict = defaultdict(list)
+
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
+        try:
+            for subset_name, file_path in data_file_dict.items():
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
+        except Exception as e:
+            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
+
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
+
+        return data_dict
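To make the `load()` contract above concrete, a stdlib-only sketch of the same mapping: a directory of `{subset}.jsonl` files (or a single `.jsonl` file) becomes `{subset_name: {'test': [record, ...]}}`. The file name and record contents are made-up examples.

```python
# Stdlib-only illustration of the load() contract above; the file name and
# record contents are made-up examples.
import json
import os
import tempfile

with tempfile.TemporaryDirectory() as root:
    # one '{subset}.jsonl' file per subset
    with open(os.path.join(root, 'default.jsonl'), 'w') as f:
        f.write(json.dumps({'prompt': 'a cat', 'image_path': 'cat.png'}) + '\n')

    data_dict = {}
    for subset_name in ('default',):
        path = os.path.join(root, f'{subset_name}.jsonl')
        with open(path) as f:
            data_dict[subset_name] = {'test': [json.loads(line) for line in f]}

    print(data_dict)
    # {'default': {'test': [{'prompt': 'a cat', 'image_path': 'cat.png'}]}}
```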
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py CHANGED
@@ -1,78 +1,76 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
 from collections import defaultdict
 from typing import List, Optional, Union
 
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.metric.scorer import AggScore, Score
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
 from evalscope.metrics import mean
-from evalscope.utils.
+from evalscope.utils.function_utils import thread_safe
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='evalmuse',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='EvalMuse Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['EvalMuse'],
+        metric_list=['FGA_BLIP2Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class EvalMuseAdapter(
+class EvalMuseAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        assert len(self.metric_list
+                   ) == 1 and self.metric_list[0] == 'FGA_BLIP2Score', 'Only FGA_BLIP2Score is supported for EvalMuse'
 
-
-
-
-
-            return data_dict
-        else:
-            return super().load(**kwargs)
+    @thread_safe
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state):
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get('image_path', original_prediction)
 
-
-
-
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
 
-
-
-
-
-
-
-            # For FGA_BLIP2Score, we need to pass the dictionary
-            score = metric_func(images=[pred], texts=[gold])[0][0]
-        else:
-            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-        if isinstance(score, dict):
-            for k, v in score.items():
-                res[f'{metric_name}:{k}'] = v.cpu().item()
-        else:
-            res[metric_name] = score.cpu().item()
-        return res
+        # Calculate scores for each configured metric
+        try:
+            metric_name = self.metric_list[0]
+            metric_cls = get_metric(metric_name)
+            metric_func = metric_cls()  # Initialize with parameters
+            metric_score = metric_func(image_path, task_state.metadata)[0]
 
-
-
-
-
-
-
+            for k, v in metric_score.items():
+                score.value[f'{metric_name}:{k}'] = v.cpu().item()
+        except Exception as e:
+            logger.error(f'Error calculating metric {metric_name}: {e}')
+            score.value[metric_name] = 0
+            score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def aggregate_scores(self, sample_scores) -> List[AggScore]:
         new_items = defaultdict(list)
-
-
+        agg_list = []
+        for sample_score in sample_scores:
+            for metric_name, value in sample_score.score.value.items():
                 metrics_prefix = metric_name.split(':')[0]
                 category = metric_name.rpartition('(')[-1].split(')')[0]
                 category = category.split('-')[0].lower()  # remove the suffix if exists
-                new_items[f'{metrics_prefix}:{category}'].
-
-
+                new_items[f'{metrics_prefix}:{category}'].append(value)
+
+        for k, v in new_items.items():
+            agg_list.append(AggScore(metric_name=k, score=mean(v), num=len(v)))
 
-
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
+        return agg_list
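The aggregation above groups per-sample metric keys of the form `prefix:label (category-N)` into one mean per `prefix:category`. A stdlib-only worked sketch with hypothetical keys:

```python
# Stdlib-only sketch of the aggregate_scores() key parsing above;
# the metric keys and values are hypothetical.
from collections import defaultdict
from statistics import mean

sample_values = [
    {'FGA_BLIP2Score:cat (animal-1)': 0.8, 'FGA_BLIP2Score:red (color-1)': 0.7},
    {'FGA_BLIP2Score:dog (animal-2)': 0.6},
]

new_items = defaultdict(list)
for values in sample_values:
    for metric_name, value in values.items():
        metrics_prefix = metric_name.split(':')[0]
        category = metric_name.rpartition('(')[-1].split(')')[0]
        category = category.split('-')[0].lower()  # drop the '-N' suffix
        new_items[f'{metrics_prefix}:{category}'].append(value)

for k, v in new_items.items():
    print(k, round(mean(v), 2), len(v))
# FGA_BLIP2Score:animal 0.7 2
# FGA_BLIP2Score:color 0.7 1
```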
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py CHANGED
@@ -1,58 +1,53 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-
-from
-
-from evalscope.
-from evalscope.
-from evalscope.
+import os
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='genai_bench',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='GenAI-Bench Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['GenAI-Bench-1600'],
+        metric_list=['VQAScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class GenAIBenchAdapter(
+class GenAIBenchAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
+    def load_from_disk(self, **kwargs):
         if os.path.isfile(self.dataset_id):
-
-
-
-
-
-
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-        # fine-granular metrics
-        if gold['tags'].get('advanced'):
-            res[f'{metric_name}_advanced'] = score.cpu().item()
-        else:
-            res[f'{metric_name}_basic'] = score.cpu().item()
-
-        return res
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        advanced = record['tags'].get('advanced')
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': 'advanced' if advanced else 'basic',
+                'tags': record.get('tags', []),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
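A usage sketch of the new `record_to_sample` contract, assuming evalscope>=1.0.0; the record is a made-up example in the dataset's apparent shape:

```python
# Sketch of record_to_sample() from the hunk above (assumes evalscope>=1.0.0);
# the record is a made-up example in the dataset's apparent shape.
from evalscope.api.dataset import Sample
from evalscope.api.messages import ChatMessageUser

record = {'id': 1, 'prompt': 'a red cube on a blue sphere', 'tags': {'advanced': True}}

advanced = record['tags'].get('advanced')
sample = Sample(
    input=[ChatMessageUser(content=record['prompt'])],
    metadata={
        'id': record['id'],
        'prompt': record['prompt'],
        'category': 'advanced' if advanced else 'basic',
        'tags': record.get('tags', []),
        'image_path': record.get('image_path', ''),
    },
)
```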
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py CHANGED
@@ -1,58 +1,42 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-from collections import defaultdict
-from typing import List, Optional, Union
+import os
 
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_t2i',
+        dataset_id='general_t2i',
+        description='General Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['default'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class GeneralT2IAdapter(
+class GeneralT2IAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
 
         super().__init__(**kwargs)
 
-    def
-
-
-
-
-
-
-
-
-
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict
+    def load_from_disk(self, **kwargs):
+        if os.path.isfile(self.dataset_id):
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(input=[ChatMessageUser(content=record['prompt'])], metadata={'image_path': record['image_path']})
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py CHANGED
@@ -1,57 +1,47 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-from collections import defaultdict
-from typing import List, Optional, Union
+import os
 
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='hpdv2',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='HPDv2 Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['HPDv2'],
+        metric_list=['HPSv2.1Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class HPDv2Adapter(
+class HPDv2Adapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
+    def load_from_disk(self, **kwargs):
         if os.path.isfile(self.dataset_id):
-
-
-
-
-
-
-    def
-
-
-
-
-
-
-
-        for metric_name, metric_func in self.metrics.items():
-            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-            res[metric_name] = score.cpu().item()
-
-            # fine-granular metrics
-            category = gold['tags'].get('category')
-            if category:
-                res[f'{metric_name}_{category}'] = score.cpu().item()
-
-        return res
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'category': record.get('tags', {}).get('category', ''),
+                'tags': record.get('tags', {})
+            }
+        )
evalscope/benchmarks/aigc/t2i/tifa_adapter.py CHANGED
@@ -1,37 +1,26 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import
-from
-from
-
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='tifa160',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='TIFA-160 Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['TIFA-160'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class TIFA_Adapter(
+class TIFA_Adapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-
-    def load(self, **kwargs) -> dict:
-        if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)
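All of the t2i adapters above follow the same 1.0.0 registration pattern; a minimal sketch for a custom benchmark, assuming evalscope>=1.0.0 (the name, dataset path, and metric are placeholders):

```python
# Minimal sketch of the @register_benchmark pattern shared by the adapters
# above (assumes evalscope>=1.0.0; name, dataset_id, and metric are placeholders).
from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_t2i',  # hypothetical benchmark name
        dataset_id='/path/to/prompts.jsonl',  # placeholder dataset path or hub id
        description='Custom Text-to-Image Benchmark',
        tags=[Tags.TEXT_TO_IMAGE],
        subset_list=['default'],
        metric_list=['PickScore'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
    )
)
class MyT2IAdapter(Text2ImageAdapter):
    """Inherits loading, generation, and scoring from Text2ImageAdapter."""
```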