evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py
CHANGED

@@ -2,7 +2,6 @@ import json
 import multiprocessing
 import numpy as np
 from collections import defaultdict
-from concurrent.futures import ProcessPoolExecutor, as_completed

 from evalscope.utils.logger import get_logger
 from .pass_k_utils import compute_metrics_from_results
@@ -31,7 +30,10 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         args=(sample, generation, debug, result, metadata_list, timeout),
     )
     p.start()
-
+    global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
+    if debug:
+        logger.info(f'global timeout = {global_timeout}')
+    p.join(timeout=global_timeout)
     if p.is_alive():
         p.kill()
     if not result:
@@ -39,7 +41,7 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         # consider that all tests failed
         result = [[-1 for i in range(len(in_outs['inputs']))]]
         if debug:
-            logger.info('global timeout')
+            logger.info('global timeout occured: alarm went off')
     return result[0], metadata_list[0]


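The join timeout now scales with the number of test cases instead of being fixed: for example, with the default per-test timeout of 6 seconds and a sample whose input_output field lists 10 inputs, the worker process is given (6 + 1) * 10 = 70 seconds before it is killed.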
@@ -99,7 +101,7 @@ evaluate_generations(
     samples_list: list,
     generations_list: list[list[str]],
     debug: bool = False,
-    num_process_evaluate: int = 16,
+    num_process_evaluate: int = 16,  # This parameter will be unused
     timeout=6,
 ):
     """We take the list of code generations and try to compile them and the run
@@ -117,26 +119,19 @@ evaluate_generations(
     [-2] = compile error, [-1] = runtime error [False] = failed test
     case [True] = passed test case
     """
+    results = {}
+    metadata = {}

-
-
-
-              for index in range(len(generations_list))]
-
-    with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
-        futures = {
-            executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
-            for (problem_generations, sample, debug, timeout), index in inputs
-        }
+    for index in range(len(generations_list)):
+        problem_generations = generations_list[index]
+        sample = samples_list[index]

-
-
-
-            index = futures[future]
-            results[index], metadata[index] = future.result()
+        result, meta = evaluate_generations_by_problem(problem_generations, sample, debug, timeout)
+        results[index] = result
+        metadata[index] = meta

-    assert len(results) == len(
-
+    assert len(results) == len(
+        generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'

     return results, metadata

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
CHANGED

@@ -18,7 +18,8 @@ logger = get_logger()
    extra_params={
        'start_date': None,
        'end_date': None,
-        'timeout': 6
+        'timeout': 6,
+        'debug': False
    },
    system_prompt=
    'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.',  # noqa: E501
@@ -33,6 +34,7 @@ class LiveCodeBenchAdapter(DataAdapter):
        extra_params = kwargs.get('extra_params', {})

        self.timeout = extra_params.get('timeout', 6)
+        self.debug = extra_params.get('debug', False)
        self.start_date = extra_params.get('start_date')
        self.end_date = extra_params.get('end_date')

@@ -84,5 +86,6 @@ class LiveCodeBenchAdapter(DataAdapter):
            k_list=[1],
            num_process_evaluate=1,
            timeout=self.timeout,
+            debug=self.debug,
        )
        return metrics['pass@1'] / 100  # convert to point scale
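The adapter now reads a debug flag next to timeout from extra_params. A minimal sketch of turning it on from a task configuration, assuming the usual dataset_args plumbing forwards extra_params into the adapter's kwargs (the top-level import and the model id below are assumptions, not taken from this diff):

from evalscope import TaskConfig, run_task  # assumed top-level API

task = TaskConfig(
    model='qwen2.5-coder-7b-instruct',  # placeholder model id
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'timeout': 6,    # per-test-case limit, in seconds
                'debug': True,   # new in 0.15.x: log alarm / global-timeout events
                'start_date': None,
                'end_date': None,
            },
        },
    },
)
run_task(task)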
evalscope/benchmarks/live_code_bench/testing_util.py
CHANGED

@@ -12,6 +12,7 @@ import time
 from datetime import datetime
 from decimal import Decimal
 from enum import Enum
+from functools import partial
 from io import StringIO
 # from pyext import RuntimeModule
 from types import ModuleType
@@ -46,8 +47,9 @@ class TimeoutException(Exception):
    pass


-def timeout_handler(signum, frame):
-
+def timeout_handler(debug, signum, frame):
+    if debug:
+        logger.info('timeout occured: alarm went off')
    raise TimeoutException


@@ -381,7 +383,8 @@ def run_test(sample, test=None, debug=False, timeout=6):
    if test(generated_code) is not None it'll try to run the code.
    otherwise it'll just return an input and output pair.
    """
-
+    timeout_handler_wrapper = partial(timeout_handler, debug)
+    signal.signal(signal.SIGALRM, timeout_handler_wrapper)

    # Disable functionalities that can make destructive changes to the test.
    # max memory is set to 4GB
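The partial here works around the fixed (signum, frame) signature that the signal module imposes on handlers: any extra context, such as the debug flag, has to be bound in advance. A self-contained sketch of the same pattern (illustrative only, not evalscope code; SIGALRM is POSIX-only):

import signal
import time
from functools import partial

class TimeoutException(Exception):
    pass

def timeout_handler(debug, signum, frame):
    # signal only ever passes (signum, frame); `debug` is pre-bound via partial
    if debug:
        print('alarm went off')
    raise TimeoutException

signal.signal(signal.SIGALRM, partial(timeout_handler, True))
signal.alarm(2)  # deliver SIGALRM in ~2 seconds
try:
    time.sleep(10)  # stands in for code that runs too long
except TimeoutException:
    print('timed out')
finally:
    signal.alarm(0)  # always cancel any pending alarm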
@@ -1,5 +1,5 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED

@@ -137,7 +137,7 @@ SUBJECT_MAPPING = {
    name='mmlu',
    pretty_name='MMLU',
    dataset_id='modelscope/mmlu',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=SUBSET_LIST,
    metric_list=['AverageAccuracy'],
@@ -263,6 +263,8 @@ class MMLUAdapter(DataAdapter):

        if include_answer:
            example += f"\nAnswer: {input_d['target']}\n\n"
+        else:
+            example += '\nAnswer: \n\n'

        return example

@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/utils.py
CHANGED

@@ -1,6 +1,6 @@
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from functools import wraps
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 from evalscope.constants import EvalType
 from evalscope.utils.filters import Filter
@@ -9,30 +9,21 @@ from evalscope.utils.filters import Filter
 @dataclass
 class PromptData:
    data: List[str]
-    index: Optional[int] = 0
+    index: Optional[Union[int, str]] = 0
    system_prompt: Optional[str] = None
    multi_choices: Optional[List[str]] = None
+    id: Optional[str] = None

    def to_dict(self) -> Dict:
-
-            return {
-                'data': self.data,
-                'index': self.index,
-                'system_prompt': self.system_prompt,
-            }
-        else:
-            return {
-                'data': self.data,
-                'index': self.index,
-                'system_prompt': self.system_prompt,
-                'multi_choices': self.multi_choices,
-            }
+        return {k: v for k, v in asdict(self).items() if v is not None}


 def preprocess_decorator(func):

    @wraps(func)
    def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+        if result is None:
+            result = ''
        filters = self.config_kwargs.get('filters', None)
        if filters:
            # Apply filters to the result
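The new to_dict leans on dataclasses.asdict and simply drops unset fields, so a serialized prompt only carries the keys that were actually populated. A quick illustration of that behavior with the fields from this diff (the values are made up):

from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Union

@dataclass
class PromptData:
    data: List[str]
    index: Optional[Union[int, str]] = 0
    system_prompt: Optional[str] = None
    multi_choices: Optional[List[str]] = None
    id: Optional[str] = None

    def to_dict(self) -> Dict:
        # asdict() walks every field; the comprehension drops the ones left at None
        return {k: v for k, v in asdict(self).items() if v is not None}

print(PromptData(data=['Q: 1 + 1?'], index='math#3').to_dict())
# {'data': ['Q: 1 + 1?'], 'index': 'math#3'}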
evalscope/cli/start_app.py
CHANGED

@@ -21,7 +21,7 @@ class StartAppCMD(CLICommand):
    def define_args(parsers: ArgumentParser):
        """ define args for create pipeline template command.
        """
-        from evalscope.report
+        from evalscope.report import add_argument

        parser = parsers.add_parser(StartAppCMD.name)
        add_argument(parser)
evalscope/collections/evaluator.py
CHANGED

@@ -1,8 +1,10 @@
 import json
 import os
 import pandas as pd
+import random
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
@@ -10,7 +12,7 @@ from typing import List
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalType
+from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -67,9 +69,10 @@ class EvaluatorCollection:
    def load(self) -> tuple[list[DatasetEntry], str]:
        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
        raw_dataset = self.data_adapter.load()
-        # limit the dataset
+        # random limit the dataset
        if self.task_cfg.limit:
-            raw_dataset = raw_dataset
+            raw_dataset = random.sample(raw_dataset,
+                                        self.task_cfg.limit) if len(raw_dataset) > self.task_cfg.limit else raw_dataset
        # index dataset
        datasets = []
        for sample in raw_dataset:
@@ -95,10 +98,17 @@ class EvaluatorCollection:

    def _initialize_evaluators(self):
        evaluators = {}
+        # load dataset args
+        dataset_args = deepcopy(self.task_cfg.dataset_args)
+        common_args = dataset_args.get(DataCollection.NAME, {})
        for dataset_name in self.dataset_name_map.keys():
            benchmark = Benchmark.get(dataset_name)
            model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
-
+            # update dataset args
+            cur_dataset_args = dataset_args.get(dataset_name, {})
+            cur_dataset_args.update(common_args)
+            # get data adapter
+            data_adapter = benchmark.get_data_adapter(cur_dataset_args)
            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
                                                       self.outputs)
        return evaluators
@@ -185,12 +195,14 @@ class EvaluatorCollection:
                index = answer.get(AnswerKeys.INDEX)
                answer_dict[index] = answer
                indices.add(index)
+
            data = []
            for sample in self.dataset:
                if sample.index not in indices:
                    data.append(sample)
            data_map = self._init_name_map(data)

+            logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
            return answer_dict, data, data_map
        return answer_dict, self.dataset, self.dataset_name_map

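_initialize_evaluators now merges collection-wide settings into each dataset's own arguments, and because update() is called with the common args last, a key set under the DataCollection entry overrides a per-dataset value. A small sketch of that merge; the 'data_collection' key stands in for DataCollection.NAME and the argument names are made up:

from copy import deepcopy

dataset_args = {
    'data_collection': {'few_shot_num': 0},  # stand-in for dataset_args[DataCollection.NAME]
    'mmlu': {'few_shot_num': 5, 'subset_list': ['abstract_algebra']},
}

common_args = deepcopy(dataset_args)['data_collection']
cur_dataset_args = deepcopy(dataset_args)['mmlu']
cur_dataset_args.update(common_args)  # collection-wide args win on conflicting keys
print(cur_dataset_args)               # {'few_shot_num': 0, 'subset_list': ['abstract_algebra']}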
evalscope/config.py
CHANGED

@@ -4,13 +4,12 @@ import copy
 import json
 import os
 from argparse import Namespace
-from collections import OrderedDict
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
-                                 JudgeStrategy, OutputType)
-from evalscope.models
+                                 JudgeStrategy, ModelTask, OutputType)
+from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
@@ -36,6 +35,7 @@ class TaskConfig:
    model: Union[str, 'CustomModel', None] = None
    model_id: Optional[str] = None
    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+    model_task: Optional[str] = ModelTask.TEXT_GENERATION

    # Template-related arguments
    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -79,6 +79,10 @@ class TaskConfig:
    judge_model_args: Optional[Dict] = field(default_factory=lambda: {})

    def __post_init__(self):
+        if self.model is None:
+            self.model = DummyCustomModel()
+            self.eval_type = EvalType.CUSTOM
+
        if (not self.model_id) and self.model:
            if isinstance(self.model, CustomModel):
                self.model_id = self.model.config.get('model_id', 'custom_model')
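TaskConfig now carries a model_task field (defaulting to text generation), which is what routes a task toward the new text-to-image adapters and t2v metrics. A hedged sketch of a T2I-style configuration; the model id, dataset name, and the assumption that metric_list can be overridden through dataset_args are illustrative and not confirmed by this diff:

from evalscope.config import TaskConfig
from evalscope.constants import ModelTask

task = TaskConfig(
    model='stable-diffusion-v1-5',           # placeholder diffusion model id
    model_task=ModelTask.IMAGE_GENERATION,   # new field; default is ModelTask.TEXT_GENERATION
    datasets=['general_t2i'],                # illustrative dataset name
    dataset_args={'general_t2i': {'metric_list': ['PickScore', 'CLIPScore']}},
)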
evalscope/constants.py
CHANGED

@@ -1,4 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+import os
+
+os.environ['MODELSCOPE_LOG_LEVEL'] = '40'  # Set default log level to ERROR
+
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

@@ -145,6 +150,7 @@ class OutputType:
    GENERATION = 'generation'  # for text generation tasks and general tasks
    MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
    CONTINUOUS = 'continuous_logits'  # for continuous tasks
+    IMAGE_GENERATION = 'image_generation'  # for image generation tasks


 class EvalBackend:
@@ -164,3 +170,8 @@ class JudgeStrategy:
    RULE = 'rule'
    LLM = 'llm'
    LLM_RECALL = 'llm_recall'
+
+
+class ModelTask:
+    TEXT_GENERATION = 'text_generation'
+    IMAGE_GENERATION = 'image_generation'
evalscope/evaluator/evaluator.py
CHANGED

@@ -66,7 +66,7 @@ class Evaluator(object):
        if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
            self.judge = None
        else:
-            from evalscope.metrics
+            from evalscope.metrics import LLMJudge
            self.judge = LLMJudge(**self.task_cfg.judge_model_args)

    def load_dataset(self):
@@ -281,7 +281,7 @@ class Evaluator(object):
        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

        if self.use_cache and os.path.exists(review_file_path):
-            logger.
+            logger.info(f'Updating the review file: {review_file_path} ...')
            os.remove(review_file_path)

        def process_single_review(answer_d):
@@ -317,6 +317,8 @@ class Evaluator(object):
        """

        review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
        for review_d in reviews_list:
            if not review_d[ReviewKeys.REVIEWED]:
                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@ class Evaluator(object):
            if len(review_d[AnswerKeys.CHOICES]) == 0:
                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
            else:
                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')

            review_res_list.append(review_res)

evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED

@@ -11,7 +11,7 @@ from functools import partial
 from typing import Any, List, Tuple

 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models
+from evalscope.models import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
evalscope/metrics/__init__.py
CHANGED

@@ -1,5 +1,50 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from
-
-from evalscope.
-
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .llm_judge import LLMJudge
+    from .math_parser import extract_answer, math_equal, strip_answer_string
+    from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
+                          weighted_mean)
+    from .named_metrics import Metric, metric_registry
+    from .rouge_metric import compute_rouge_score_one_sample_zh
+
+else:
+    _import_structure = {
+        'metrics': [
+            'bleu_ngram_one_sample',
+            'exact_match',
+            'macro_mean',
+            'mean',
+            'micro_mean',
+            'simple_f1_score',
+            'weighted_mean',
+        ],
+        'named_metrics': [
+            'Metric',
+            'metric_registry',
+        ],
+        'rouge_metric': [
+            'compute_rouge_score_one_sample_zh',
+        ],
+        'llm_judge': [
+            'LLMJudge',
+        ],
+        'math_parser': [
+            'extract_answer',
+            'math_equal',
+            'strip_answer_string',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
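metrics/__init__.py now defers its submodule imports through the new _LazyModule helper in evalscope/utils/import_utils.py, so importing evalscope.metrics no longer pulls in heavy dependencies until a name is actually used. The helper itself is not shown in this diff; a minimal sketch of the same idea using module-level __getattr__ (PEP 562), with a hypothetical package layout:

# mypackage/__init__.py -- hypothetical package, not evalscope's actual _LazyModule
import importlib

# maps public name -> submodule that defines it
_import_structure = {
    'mean': 'metrics',
    'LLMJudge': 'llm_judge',
}

def __getattr__(name):
    # Called only when `name` is missing from the module namespace,
    # i.e. on first access such as `mypackage.LLMJudge`.
    if name in _import_structure:
        module = importlib.import_module(f'.{_import_structure[name]}', __name__)
        value = getattr(module, name)
        globals()[name] = value  # cache so later lookups skip this hook
        return value
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

def __dir__():
    return sorted(list(globals()) + list(_import_structure))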
evalscope/metrics/llm_judge.py
CHANGED

@@ -54,7 +54,7 @@ class LLMJudge:
        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
        self.generation_config = generation_config

-        from evalscope.models
+        from evalscope.models import ServerModelAdapter

        # Initialize ServerModelAdapter
        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
evalscope/metrics/named_metrics.py
CHANGED

@@ -3,6 +3,8 @@ from functools import partial
 from typing import Callable, Dict

 from evalscope.metrics.metrics import mean, pass_at_k, weighted_mean
+from evalscope.metrics.t2v_metrics import (blip2_score, clip_flant5_score, clip_score, fga_blip2_score, hpsv2_1_score,
+                                           hpsv2_score, image_reward_score, mps_score, pick_score)


 @dataclass
@@ -40,3 +42,14 @@ metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean
 metric_registry.register(Metric(name='AveragePass@1', object=mean))
 for k in range(1, 17):
    metric_registry.register(Metric(name=f'Pass@{k}', object=partial(pass_at_k, k=k)))
+
+# t2v_metrics
+metric_registry.register(Metric(name='VQAScore', object=clip_flant5_score))
+metric_registry.register(Metric(name='PickScore', object=pick_score))
+metric_registry.register(Metric(name='CLIPScore', object=clip_score))
+metric_registry.register(Metric(name='BLIPv2Score', object=blip2_score))
+metric_registry.register(Metric(name='HPSv2Score', object=hpsv2_score))
+metric_registry.register(Metric(name='HPSv2.1Score', object=hpsv2_1_score))
+metric_registry.register(Metric(name='ImageRewardScore', object=image_reward_score))
+metric_registry.register(Metric(name='FGA_BLIP2Score', object=fga_blip2_score))
+metric_registry.register(Metric(name='MPS', object=mps_score))
evalscope/metrics/t2v_metrics/__init__.py
ADDED

@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, print_function
+
+from .clipscore import CLIPScore, list_all_clipscore_models
+from .constants import CACHE_DIR
+from .itmscore import ITMScore, list_all_itmscore_models
+from .vqascore import VQAScore, list_all_vqascore_models
+
+
+def list_all_models():
+    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
+
+
+def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
+    if model in list_all_vqascore_models():
+        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    elif model in list_all_clipscore_models():
+        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    elif model in list_all_itmscore_models():
+        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
+    else:
+        raise NotImplementedError()
+
+
+def clip_flant5_score():
+    clip_flant5_score = VQAScore(model='clip-flant5-xxl')
+    return clip_flant5_score
+
+
+def pick_score():
+    pick_score = CLIPScore(model='pickscore-v1')
+    return pick_score
+
+
+def clip_score():
+    clip_score = CLIPScore(model='openai:ViT-L-14-336')
+    return clip_score
+
+
+def blip2_score():
+    blip_itm_score = ITMScore(model='blip2-itm')
+    return blip_itm_score
+
+
+def hpsv2_score():
+    hpsv2_score = CLIPScore(model='hpsv2')
+    return hpsv2_score
+
+
+def hpsv2_1_score():
+    hpsv2_1_score = CLIPScore(model='hpsv2.1')
+    return hpsv2_1_score
+
+
+def image_reward_score():
+    image_reward_score = ITMScore(model='image-reward-v1')
+    return image_reward_score
+
+
+def fga_blip2_score():
+    fga_blip2_score = ITMScore(model='fga_blip2')
+    return fga_blip2_score
+
+
+def mps_score():
+    mps_score = CLIPScore(model='mps')
+    return mps_score
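The new package exposes one entry point, get_score_model, which dispatches a model name to the right scorer family (VQAScore, CLIPScore or ITMScore). A brief usage sketch; how the returned scorer is then invoked is defined by the Score base class in score.py, which this diff lists but does not show:

from evalscope.metrics.t2v_metrics import get_score_model, list_all_models

print(list_all_models())                  # every registered t2v scoring model name
scorer = get_score_model('pickscore-v1')  # dispatched to CLIPScore under the hood
# scorer = get_score_model('blip2-itm')   # would return an ITMScore instead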
evalscope/metrics/t2v_metrics/clipscore.py
ADDED

@@ -0,0 +1,14 @@
+from typing import List
+
+from .constants import CACHE_DIR
+from .models.clipscore_models import get_clipscore_model, list_all_clipscore_models
+from .score import Score
+
+
+class CLIPScore(Score):
+
+    def prepare_scoremodel(self, model='openai:ViT-L/14', device='cuda', cache_dir=CACHE_DIR):
+        return get_clipscore_model(model, device=device, cache_dir=cache_dir)
+
+    def list_all_models(self) -> List[str]:
+        return list_all_clipscore_models()
evalscope/metrics/t2v_metrics/constants.py
ADDED

@@ -0,0 +1,12 @@
+import os
+from modelscope.utils.file_utils import get_model_cache_root
+
+CACHE_DIR = get_model_cache_root()
+os.environ['TORCH_HOME'] = CACHE_DIR  # set timm cache dir
+
+# For CLIP-FlanT5
+CONTEXT_LEN = 2048
+SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = '<image>'
evalscope/metrics/t2v_metrics/itmscore.py
ADDED

@@ -0,0 +1,14 @@
+from typing import List
+
+from .constants import CACHE_DIR
+from .models.itmscore_models import get_itmscore_model, list_all_itmscore_models
+from .score import Score
+
+
+class ITMScore(Score):
+
+    def prepare_scoremodel(self, model='blip2-itm', device='cuda', cache_dir=CACHE_DIR):
+        return get_itmscore_model(model, device=device, cache_dir=cache_dir)
+
+    def list_all_models(self) -> List[str]:
+        return list_all_itmscore_models()

evalscope/metrics/t2v_metrics/models/__init__.py
File without changes
evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py
ADDED

@@ -0,0 +1,30 @@
+from ...constants import CACHE_DIR
+from .clip_model import CLIP_MODELS, CLIPScoreModel
+from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel
+from .mps_model import MPS_MODELS, MPSModel
+from .pickscore_model import PICKSCORE_MODELS, PickScoreModel
+
+ALL_CLIP_MODELS = [
+    CLIP_MODELS,
+    HPSV2_MODELS,
+    PICKSCORE_MODELS,
+    MPS_MODELS,
+]
+
+
+def list_all_clipscore_models():
+    return [model for models in ALL_CLIP_MODELS for model in models]
+
+
+def get_clipscore_model(model_name, device='cuda', cache_dir=CACHE_DIR):
+    assert model_name in list_all_clipscore_models()
+    if model_name in CLIP_MODELS:
+        return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in HPSV2_MODELS:
+        return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in PICKSCORE_MODELS:
+        return PickScoreModel(model_name, device=device, cache_dir=cache_dir)
+    elif model_name in MPS_MODELS:
+        return MPSModel(model_name, device=device, cache_dir=cache_dir)
+    else:
+        raise NotImplementedError()

evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py
File without changes