evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py
@@ -0,0 +1,127 @@
+import os
+import torch
+from modelscope import AutoTokenizer
+from PIL import Image
+
+from ...constants import CACHE_DIR, IMAGE_TOKEN_INDEX
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
+def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+    input_ids = []
+    offset = 0
+    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+
+    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+        input_ids.extend(x[offset:])
+
+    if return_tensors is not None:
+        if return_tensors == 'pt':
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f'Unsupported tensor type: {return_tensors}')
+    return input_ids
+
+
+def t5_tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+    input_ids = []
+    # Since there's no bos_token_id, simply concatenate the tokenized prompt_chunks with the image_token_index
+    for x in insert_separator(prompt_chunks, [image_token_index]):
+        input_ids.extend(x)
+
+    if return_tensors is not None:
+        if return_tensors == 'pt':
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f'Unsupported tensor type: {return_tensors}')
+    return input_ids
+
+
+def load_pretrained_model(
+        model_cls,
+        model_args,
+        model_path=None,
+        tokenizer_path=None,
+        model_max_length=None,
+        padding_side=None,
+        image_aspect_ratio='pad',  # or 'square'
+        mmprojector_repo=None,
+        mmprojector_name=None,
+        device='cuda',
+        cache_dir=CACHE_DIR):
+    tokenizer_dict = {}
+    if model_max_length:
+        tokenizer_dict['model_max_length'] = model_max_length
+    if padding_side:
+        tokenizer_dict['padding_side'] = padding_side
+
+    from ..utils import download_file
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, **tokenizer_dict)
+    # tokenizer.pad_token = tokenizer.unk_token  # could be redundant
+
+    model_path = download_file(model_path, cache_dir=cache_dir)
+    model = model_cls.from_pretrained(model_path, cache_dir=cache_dir)
+
+    if mmprojector_repo:
+        from huggingface_hub import hf_hub_download
+        model_base_name = mmprojector_repo.split('/')[-1]
+
+        if cache_dir is not None:
+            local_dir = os.path.join(cache_dir, model_base_name)
+        elif os.environ.get('HF_HOME') is not None:
+            local_dir = os.path.join(os.environ.get('HF_HOME'), model_base_name)
+        else:
+            local_dir = os.path.join(os.path.expanduser('~'), model_base_name)
+        print(f'Downloading projector weights to {local_dir}')
+        hf_hub_download(
+            repo_id=mmprojector_repo,
+            filename=mmprojector_name,
+            local_dir=local_dir,
+        )
+        pretrain_mm_mlp_adapter = os.path.join(local_dir, mmprojector_name)
+        model_args.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter  # important to set to correct path
+
+        model.get_model().initialize_vision_modules(
+            model_args)  # This will load the CLIP vision encoder and MLP projector
+    else:
+        model.resize_token_embeddings(len(tokenizer))  # perhaps not needed
+
+    if not model.get_vision_tower().is_loaded:
+        model.get_vision_tower().load_model()
+    model.to(device=device, dtype=torch.bfloat16)
+    image_processor = model.get_vision_tower().image_processor
+
+    model.requires_grad_(False)
+
+    # below might be redundant
+    model.config.image_aspect_ratio = image_aspect_ratio
+    model.config.use_cache = False
+    model.config.image_grid_pinpoints = None
+    model.config.freeze_mm_mlp_adapter = True
+
+    model = model.eval()
+    return tokenizer, model, image_processor
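A small illustration of what `tokenizer_image_token` produces (not part of the diff; the toy tokenizer below is hypothetical and only mimics the `input_ids`/`bos_token_id` interface the helper touches, and `-200` is just an example sentinel for `image_token_index`):

from types import SimpleNamespace

from evalscope.metrics.t2v_metrics.models.vqascore_models.mm_utils import tokenizer_image_token


class ToyTokenizer:
    bos_token_id = 1

    def __call__(self, text):
        # one pseudo-token id per word, prefixed with bos, mimicking a HF tokenizer output
        return SimpleNamespace(input_ids=[self.bos_token_id] + [10 + i for i in range(len(text.split()))])


ids = tokenizer_image_token('Is there a dog in this photo? <image>', ToyTokenizer(), image_token_index=-200)
print(ids)  # -> [1, 10, 11, 12, 13, 14, 15, 16, -200]: each '<image>' placeholder becomes the sentinel id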
evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py
@@ -0,0 +1,17 @@
+import torch
+from abc import abstractmethod
+from typing import List
+
+from ..model import ScoreModel
+
+
+class VQAScoreModel(ScoreModel):
+
+    @abstractmethod
+    def forward(self, images: List[str], texts: List[str], question_template: str,
+                answer_template: str) -> torch.Tensor:
+        """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+        question_template: a string with optional {} to be replaced with the 'text'
+        answer_template: a string with optional {} to be replaced with the 'text'
+        """
+        pass
evalscope/metrics/t2v_metrics/score.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn as nn
+from abc import abstractmethod
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from typing import List, TypedDict, Union
+
+from .constants import CACHE_DIR
+
+
+class ImageTextDict(TypedDict):
+    images: List[str]
+    texts: List[str]
+
+
+class Score(nn.Module):
+
+    def __init__(self, model: str, device: str = 'cuda', cache_dir: str = CACHE_DIR, **kwargs):
+        """Initialize the ScoreModel
+        """
+        super().__init__()
+        assert model in self.list_all_models()
+        self.device = device
+        self.model = self.prepare_scoremodel(model, device, cache_dir, **kwargs)
+
+    @abstractmethod
+    def prepare_scoremodel(self, model: str, device: str, cache_dir: str, **kwargs):
+        """Prepare the ScoreModel
+        """
+        pass
+
+    @abstractmethod
+    def list_all_models(self) -> List[str]:
+        """List all available models
+        """
+        pass
+
+    def forward(self, images: Union[str, List[str]], texts: Union[str, List[str]], **kwargs) -> List[float]:
+        """Return the similarity score(s) between the image(s) and the text(s)
+        If there are m images and n texts, return a m x n tensor
+        """
+        if type(images) == str:
+            images = [images]
+        if type(texts) == str:
+            texts = [texts]
+        assert len(images) == len(texts), 'Number of images and texts must match'
+        scores = []
+        for i, image in enumerate(images):
+            scores.append(self.model.forward([image] * len(texts), texts, **kwargs))
+        return scores
+
+    def batch_forward(self, dataset: List[ImageTextDict], batch_size: int = 16, **kwargs) -> torch.Tensor:
+        """Return the similarity score(s) between the image(s) and the text(s)
+        If there are m images and n texts, return a m x n tensor
+        """
+        num_samples = len(dataset)
+        num_images = len(dataset[0]['images'])
+        num_texts = len(dataset[0]['texts'])
+        scores = torch.zeros(num_samples, num_images, num_texts).to(self.device)
+
+        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+        counter = 0
+        for batch_idx, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
+            cur_batch_size = len(batch['images'][0])
+            assert len(batch['images']) == num_images, \
+                f"Number of image options in batch {batch_idx} is {len(batch['images'])}. Expected {num_images} images."
+            assert len(batch['texts']) == num_texts, \
+                f"Number of text options in batch {batch_idx} is {len(batch['texts'])}. Expected {num_texts} texts."
+
+            for image_idx in range(num_images):
+                images = batch['images'][image_idx]
+                for text_idx in range(num_texts):
+                    texts = batch['texts'][text_idx]
+                    scores[counter:counter+cur_batch_size, image_idx, text_idx] = \
+                        self.model.forward(images, texts, **kwargs)
+
+            counter += cur_batch_size
+        return scores
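`batch_forward` above requires every sample to carry the same number of image and text options; a minimal sketch of the expected dataset layout (the file names and captions are placeholders, not from the diff):

# Hypothetical ImageTextDict dataset for Score.batch_forward; the result tensor has
# shape (num_samples, num_images, num_texts).
dataset = [
    {'images': ['img_0.jpg'], 'texts': ['a photo of a dog', 'a photo of a cat']},
    {'images': ['img_1.jpg'], 'texts': ['a photo of a dog', 'a photo of a cat']},
]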
evalscope/metrics/t2v_metrics/vqascore.py
@@ -0,0 +1,14 @@
+from typing import List
+
+from .constants import CACHE_DIR
+from .models.vqascore_models import get_vqascore_model, list_all_vqascore_models
+from .score import Score
+
+
+class VQAScore(Score):
+
+    def prepare_scoremodel(self, model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
+        return get_vqascore_model(model, device=device, cache_dir=cache_dir, **kwargs)
+
+    def list_all_models(self) -> List[str]:
+        return list_all_vqascore_models()
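A minimal usage sketch for the new scorer (not taken from the diff; it assumes `VQAScore` is re-exported from `evalscope.metrics.t2v_metrics`, and the image path and caption are placeholders):

from evalscope.metrics.t2v_metrics import VQAScore

scorer = VQAScore(model='clip-flant5-xxl', device='cuda')  # downloads the underlying checkpoint on first use
scores = scorer(images=['outputs/sample_0.jpeg'], texts=['a corgi wearing sunglasses'])
print(scores)  # one alignment score per (image, text) pair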
evalscope/models/__init__.py
CHANGED
@@ -1,17 +1,53 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING

-from evalscope.
-from evalscope.models.chat_adapter import ChatGenerationModelAdapter
-from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
-from evalscope.models.custom import CustomModel
-from evalscope.models.custom_adapter import CustomModelAdapter
-from evalscope.models.local_model import LocalModel, get_local_model
-from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
-from evalscope.models.register import get_model_adapter
-from evalscope.models.server_adapter import ServerModelAdapter
+from evalscope.utils.import_utils import _LazyModule

-
-
-
-
-
+if TYPE_CHECKING:
+    from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+                           CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
+                           initialize_model_adapter)
+    from .custom import CustomModel, DummyCustomModel
+    from .local_model import LocalModel, get_local_model
+    from .model import BaseModel, ChatBaseModel, OpenAIModel
+    from .register import get_model_adapter
+
+else:
+    _import_structure = {
+        'adapters': [
+            'BaseModelAdapter',
+            'initialize_model_adapter',
+            'ChatGenerationModelAdapter',
+            'ContinuationLogitsModelAdapter',
+            'MultiChoiceModelAdapter',
+            'CustomModelAdapter',
+            'ServerModelAdapter',
+            'T2IModelAdapter',
+        ],
+        'custom': [
+            'CustomModel',
+            'DummyCustomModel',
+        ],
+        'local_model': [
+            'LocalModel',
+            'get_local_model',
+        ],
+        'model': [
+            'BaseModel',
+            'ChatBaseModel',
+            'OpenAIModel',
+        ],
+        'register': [
+            'get_model_adapter',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
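The rewritten `__init__` keeps the public surface but defers the heavy submodule imports; a short sketch of the intended effect, assuming `_LazyModule` resolves attributes on first access as the structure above suggests:

import evalscope.models as models  # no torch-heavy adapter code is imported yet

adapter_cls = models.ServerModelAdapter  # first attribute access lazily imports evalscope.models.adapters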
evalscope/models/adapters/__init__.py
@@ -0,0 +1,17 @@
+from .base_adapter import BaseModelAdapter, initialize_model_adapter
+from .chat_adapter import ChatGenerationModelAdapter
+from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
+from .custom_adapter import CustomModelAdapter
+from .server_adapter import ServerModelAdapter
+from .t2i_adapter import T2IModelAdapter
+
+__all__ = [
+    'initialize_model_adapter',
+    'BaseModelAdapter',
+    'ChatGenerationModelAdapter',
+    'ContinuationLogitsModelAdapter',
+    'MultiChoiceModelAdapter',
+    'CustomModelAdapter',
+    'ServerModelAdapter',
+    'T2IModelAdapter',
+]
evalscope/models/{base_adapter.py → adapters/base_adapter.py}
@@ -3,19 +3,17 @@ from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, List, Optional, Union

 from evalscope.constants import EvalType, OutputType
-from evalscope.models.custom import CustomModel
-from evalscope.models.local_model import LocalModel
-from evalscope.models.register import get_model_adapter, register_model_adapter
 from evalscope.utils.logger import get_logger
+from ..custom import CustomModel
+from ..local_model import LocalModel

 logger = get_logger()

 if TYPE_CHECKING:
-    from evalscope.benchmarks import
+    from evalscope.benchmarks import DataAdapter
     from evalscope.config import TaskConfig


-@register_model_adapter('base')
 class BaseModelAdapter(ABC):

     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
@@ -39,12 +37,9 @@ class BaseModelAdapter(ABC):
         raise NotImplementedError


-def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: '
+def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', base_model: 'LocalModel'):
     """Initialize the model adapter based on the task configuration."""
-    if task_cfg.
-        from evalscope.models.model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
+    if task_cfg.eval_type == EvalType.CUSTOM:
         if not isinstance(task_cfg.model, CustomModel):
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
@@ -66,13 +61,18 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta',
             stream=task_cfg.stream,
         )
     else:
+        from ..register import get_model_adapter
+
         # for local model, we need to determine the model adapter class based on the output type
-
-        if
-            logger.warning(f'Output type {
+        model_adapter_cls_str = benchmark.model_adapter
+        if model_adapter_cls_str not in benchmark.output_types:
+            logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
                            f'Using {benchmark.output_types[0]} instead.')
-
+            model_adapter_cls_str = benchmark.output_types[0]

-
-        return
-            model=base_model,
+        model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+        return model_adapter_cls(
+            model=base_model,
+            generation_config=task_cfg.generation_config,
+            chat_template=task_cfg.chat_template,
+            task_cfg=task_cfg)
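For local models, `initialize_model_adapter` now looks the adapter class up by name via `get_model_adapter`; a generic sketch of that registry pattern (an illustration only, not the actual contents of `evalscope/models/register.py`):

# Minimal name -> class registry, the pattern the adapter lookup relies on.
_MODEL_ADAPTERS = {}


def register_model_adapter(name):
    def decorator(cls):
        _MODEL_ADAPTERS[name] = cls
        return cls
    return decorator


def get_model_adapter(name):
    return _MODEL_ADAPTERS[name]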
evalscope/models/{chat_adapter.py → adapters/chat_adapter.py}
@@ -3,18 +3,15 @@ import time
 import torch
 from typing import Any, Dict, List, Tuple, Union

-from evalscope.constants import OutputType
-from evalscope.models.base_adapter import BaseModelAdapter
-from evalscope.models.local_model import LocalModel
-from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
+from ..local_model import LocalModel
+from .base_adapter import BaseModelAdapter

 logger = get_logger()


-@register_model_adapter(OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.
@@ -102,8 +99,14 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 messages = [ChatMessage(role='user', content=query)]
                 if i < len(system_prompts) and system_prompts[i]:
                     messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
-
-
+                # whether thinking is needed
+                chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+                if chat_template_kwargs is not None:
+                    prompts = self.tokenizer.apply_chat_template(
+                        messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+                else:
+                    prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                formatted_prompts.append(prompts)
         else:
             # For base model, use the queries as the input
             formatted_prompts = queries
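The new `chat_template_kwargs` entry is read from the inference config and forwarded to `tokenizer.apply_chat_template`; a hedged example of how a caller might set it (the `enable_thinking` switch is a tokenizer-specific kwarg, e.g. for Qwen3-style chat templates, not something the adapter itself defines):

generation_config = {
    'max_new_tokens': 512,
    'chat_template_kwargs': {'enable_thinking': False},  # forwarded as **kwargs to apply_chat_template
}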
evalscope/models/{choice_adapter.py → adapters/choice_adapter.py}
@@ -3,14 +3,11 @@ import time
 import torch
 from typing import List

-from evalscope.constants import OutputType
-from evalscope.models.base_adapter import BaseModelAdapter
-from evalscope.models.local_model import LocalModel
-from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from ..local_model import LocalModel
+from .base_adapter import BaseModelAdapter


-@register_model_adapter(OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """

@@ -113,7 +110,6 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}


-@register_model_adapter(OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/{custom_adapter.py → adapters/custom_adapter.py}
@@ -1,11 +1,9 @@
 from typing import Any, Dict, List, Union

-from
-from
-from evalscope.models.register import register_model_adapter
+from ..custom import CustomModel
+from .base_adapter import BaseModelAdapter


-@register_model_adapter('custom')
 class CustomModelAdapter(BaseModelAdapter):

     def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/{server_adapter.py → adapters/server_adapter.py}
@@ -5,14 +5,12 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

-from evalscope.models.base_adapter import BaseModelAdapter
-from evalscope.models.register import register_model_adapter
 from evalscope.utils.logger import get_logger
+from .base_adapter import BaseModelAdapter

 logger = get_logger()


-@register_model_adapter('server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.
evalscope/models/adapters/t2i_adapter.py
@@ -0,0 +1,76 @@
+import os
+import time
+import torch
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.io_utils import OutputsStructure
+from evalscope.utils.logger import get_logger
+from ..local_model import LocalModel
+from .base_adapter import BaseModelAdapter
+
+logger = get_logger()
+
+
+class T2IModelAdapter(BaseModelAdapter):
+    """
+    Text to image model adapter.
+    """
+
+    def __init__(self, model: LocalModel, **kwargs):
+        super().__init__(model)
+
+        self.task_config = kwargs.get('task_cfg', None)
+        assert self.task_config is not None, 'Task config is required for T2I model adapter.'
+
+        self.save_path = os.path.join(self.task_config.work_dir, OutputsStructure.PREDICTIONS_DIR,
+                                      self.task_config.model_id, 'images')
+        os.makedirs(self.save_path, exist_ok=True)
+
+    def _model_generate(self, prompt, infer_cfg=None) -> List:
+        """
+        Generate images from the model.
+        Args:
+            prompt: The input prompt.
+            infer_cfg: The inference configuration.
+        Returns:
+            The generated images.
+        """
+        infer_cfg = infer_cfg or {}
+
+        sample = self.model(prompt=prompt, **infer_cfg).images
+        return sample
+
+    @torch.no_grad()
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+        """
+        Args:
+            inputs: The input data.
+            infer_cfg: The inference configuration.
+        Returns:
+            The prediction results.
+        """
+        results = []
+        for input_item in inputs:
+            prompt = input_item['data'][0]
+            image_id = input_item.get('id') or input_item.get('index')
+
+            samples = self._model_generate(prompt, infer_cfg)
+
+            choices_list = []
+            for index, sample in enumerate(samples):
+                image_file_path = os.path.join(self.save_path, f'{image_id}_{index}.jpeg')
+                sample.save(image_file_path)
+                logger.debug(f'Saved image to {image_file_path}')
+
+                choice = ChatCompletionResponseChoice(
+                    index=index, message=ChatMessage(content=image_file_path, role='assistant'), finish_reason='stop')
+                choices_list.append(choice)
+
+            res_d = ChatCompletionResponse(
+                model=self.model_id, choices=choices_list, object='images.generations',
+                created=int(time.time())).model_dump(exclude_unset=True)
+
+            results.append(res_d)
+
+        return results
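`T2IModelAdapter` only assumes that calling the local model with `prompt=...` returns an object exposing `.images` (a list of PIL images), which matches the diffusers pipeline convention; a hypothetical stub of that interface for illustration (not part of evalscope):

from PIL import Image


class FakeT2IPipeline:
    """Hypothetical stand-in for a diffusers-style pipeline accepted by T2IModelAdapter."""

    class _Output:
        def __init__(self, images):
            self.images = images  # list of PIL.Image objects

    def __call__(self, prompt, **infer_cfg):
        # A real pipeline would run the denoising loop here.
        return self._Output([Image.new('RGB', (64, 64), color='gray')])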
evalscope/models/custom/dummy_model.py
@@ -1,11 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
 import time
+from typing import List

-from evalscope.models.custom import CustomModel
-from evalscope.run import run_task
-from evalscope.utils.io_utils import yaml_to_dict
 from evalscope.utils.logger import get_logger
+from .custom_model import CustomModel

 logger = get_logger()
 """
@@ -15,29 +14,25 @@ This script is used to rewrite the evaluation results without re-running the mod

 class DummyCustomModel(CustomModel):

-    def __init__(self, config: dict, **kwargs):
+    def __init__(self, config: dict = {'model_id': 'dummy-model'}, **kwargs):
         super(DummyCustomModel, self).__init__(config=config, **kwargs)

-    def predict(self, prompts:
+    def predict(self, prompts: List[dict], **kwargs):
         # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.

-        response = '
+        response = ''

         res_d: dict = {
             'choices': [{
                 'index': 0,
                 'message': {
-                    # 'content': f'The answer is B. Raw prompt: {prompt}',
                     'content': response,
                     'role': 'assistant'
                 }
             }],
-            'created':
-
-            '
-            self.config.get('model_id'),  # should be model_id
-            'object':
-            'chat.completion',
+            'created': time.time(),
+            'model': self.config.get('model_id'),  # should be model_id
+            'object': 'chat.completion',
             'usage': {
                 'completion_tokens': 0,
                 'prompt_tokens': 0,
@@ -49,6 +44,9 @@ class DummyCustomModel(CustomModel):


 if __name__ == '__main__':
+    from evalscope.run import run_task
+    from evalscope.utils.io_utils import yaml_to_dict
+
     # step1: if the outputs directory has been moved, update the path settings in configs/task_output_config.yaml under outputs/eval_xxx
     # step2: run this script; use_cache=True is used by default, so eval results are refreshed without re-running inference

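The `__main__` comments above describe refreshing existing eval results without re-running inference; a hedged sketch of that flow (the YAML path is a placeholder copied from the comment):

from evalscope.models.custom import DummyCustomModel
from evalscope.run import run_task
from evalscope.utils.io_utils import yaml_to_dict

task_cfg = yaml_to_dict('outputs/eval_xxx/configs/task_output_config.yaml')  # placeholder path
task_cfg['model'] = DummyCustomModel()  # no real inference; cached predictions are reused
run_task(task_cfg)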