evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
Files changed (181)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
  24. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  25. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  26. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  27. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  29. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  30. evalscope/benchmarks/utils.py +7 -16
  31. evalscope/cli/start_app.py +1 -1
  32. evalscope/collections/evaluator.py +16 -4
  33. evalscope/config.py +7 -3
  34. evalscope/constants.py +11 -0
  35. evalscope/evaluator/evaluator.py +9 -3
  36. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  37. evalscope/metrics/__init__.py +49 -4
  38. evalscope/metrics/llm_judge.py +1 -1
  39. evalscope/metrics/named_metrics.py +13 -0
  40. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  41. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  42. evalscope/metrics/t2v_metrics/constants.py +12 -0
  43. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  44. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  53. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  60. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  61. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  62. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  139. evalscope/metrics/t2v_metrics/score.py +78 -0
  140. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  141. evalscope/models/__init__.py +50 -14
  142. evalscope/models/adapters/__init__.py +17 -0
  143. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  144. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  145. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  146. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  147. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  148. evalscope/models/adapters/t2i_adapter.py +76 -0
  149. evalscope/models/custom/__init__.py +2 -1
  150. evalscope/models/custom/dummy_model.py +11 -13
  151. evalscope/models/local_model.py +82 -33
  152. evalscope/models/model.py +2 -42
  153. evalscope/models/register.py +26 -0
  154. evalscope/perf/benchmark.py +4 -3
  155. evalscope/perf/main.py +4 -2
  156. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  157. evalscope/perf/utils/benchmark_util.py +2 -2
  158. evalscope/perf/utils/db_util.py +16 -8
  159. evalscope/report/__init__.py +1 -0
  160. evalscope/report/app.py +117 -67
  161. evalscope/report/app_arguments.py +11 -0
  162. evalscope/report/generator.py +1 -1
  163. evalscope/run.py +3 -3
  164. evalscope/third_party/thinkbench/eval.py +19 -7
  165. evalscope/utils/chat_service.py +2 -2
  166. evalscope/utils/import_utils.py +66 -0
  167. evalscope/utils/utils.py +12 -4
  168. evalscope/version.py +2 -2
  169. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
  170. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
  171. tests/aigc/__init__.py +1 -0
  172. tests/aigc/test_t2i.py +87 -0
  173. tests/cli/test_run.py +20 -7
  174. tests/perf/test_perf.py +6 -3
  175. evalscope/metrics/code_metric.py +0 -98
  176. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  177. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
  179. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
  180. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
  181. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,127 @@
+ import os
+ import torch
+ from modelscope import AutoTokenizer
+ from PIL import Image
+
+ from ...constants import CACHE_DIR, IMAGE_TOKEN_INDEX
+
+
+ def expand2square(pil_img, background_color):
+     width, height = pil_img.size
+     if width == height:
+         return pil_img
+     elif width > height:
+         result = Image.new(pil_img.mode, (width, width), background_color)
+         result.paste(pil_img, (0, (width - height) // 2))
+         return result
+     else:
+         result = Image.new(pil_img.mode, (height, height), background_color)
+         result.paste(pil_img, ((height - width) // 2, 0))
+         return result
+
+
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+     prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+     def insert_separator(X, sep):
+         return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+     input_ids = []
+     offset = 0
+     if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+         offset = 1
+         input_ids.append(prompt_chunks[0][0])
+
+     for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+         input_ids.extend(x[offset:])
+
+     if return_tensors is not None:
+         if return_tensors == 'pt':
+             return torch.tensor(input_ids, dtype=torch.long)
+         raise ValueError(f'Unsupported tensor type: {return_tensors}')
+     return input_ids
+
+
+ def t5_tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+     prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+     def insert_separator(X, sep):
+         return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+     input_ids = []
+     # Since there's no bos_token_id, simply concatenate the tokenized prompt_chunks with the image_token_index
+     for x in insert_separator(prompt_chunks, [image_token_index]):
+         input_ids.extend(x)
+
+     if return_tensors is not None:
+         if return_tensors == 'pt':
+             return torch.tensor(input_ids, dtype=torch.long)
+         raise ValueError(f'Unsupported tensor type: {return_tensors}')
+     return input_ids
+
+
+ def load_pretrained_model(
+         model_cls,
+         model_args,
+         model_path=None,
+         tokenizer_path=None,
+         model_max_length=None,
+         padding_side=None,
+         image_aspect_ratio='pad',  # or 'square'
+         mmprojector_repo=None,
+         mmprojector_name=None,
+         device='cuda',
+         cache_dir=CACHE_DIR):
+     tokenizer_dict = {}
+     if model_max_length:
+         tokenizer_dict['model_max_length'] = model_max_length
+     if padding_side:
+         tokenizer_dict['padding_side'] = padding_side
+
+     from ..utils import download_file
+
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, **tokenizer_dict)
+     # tokenizer.pad_token = tokenizer.unk_token  # could be redundant
+
+     model_path = download_file(model_path, cache_dir=cache_dir)
+     model = model_cls.from_pretrained(model_path, cache_dir=cache_dir)
+
+     if mmprojector_repo:
+         from huggingface_hub import hf_hub_download
+         model_base_name = mmprojector_repo.split('/')[-1]
+
+         if cache_dir is not None:
+             local_dir = os.path.join(cache_dir, model_base_name)
+         elif os.environ.get('HF_HOME') is not None:
+             local_dir = os.path.join(os.environ.get('HF_HOME'), model_base_name)
+         else:
+             local_dir = os.path.join(os.path.expanduser('~'), model_base_name)
+         print(f'Downloading projector weights to {local_dir}')
+         hf_hub_download(
+             repo_id=mmprojector_repo,
+             filename=mmprojector_name,
+             local_dir=local_dir,
+         )
+         pretrain_mm_mlp_adapter = os.path.join(local_dir, mmprojector_name)
+         model_args.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter  # important to set to correct path
+
+         model.get_model().initialize_vision_modules(
+             model_args)  # This will load the CLIP vision encoder and MLP projector
+     else:
+         model.resize_token_embeddings(len(tokenizer))  # perhaps not needed
+
+     if not model.get_vision_tower().is_loaded:
+         model.get_vision_tower().load_model()
+     model.to(device=device, dtype=torch.bfloat16)
+     image_processor = model.get_vision_tower().image_processor
+
+     model.requires_grad_(False)
+
+     # below might be redundant
+     model.config.image_aspect_ratio = image_aspect_ratio
+     model.config.use_cache = False
+     model.config.image_grid_pinpoints = None
+     model.config.freeze_mm_mlp_adapter = True
+
+     model = model.eval()
+     return tokenizer, model, image_processor
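The `tokenizer_image_token` helper above splits the prompt on the `<image>` placeholder, tokenizes each chunk, and splices a sentinel image-token index between the chunks while keeping a single BOS token. A minimal standalone sketch of that splice, with invented token ids and a placeholder sentinel value (the real constant lives in the new t2v_metrics constants module):

    def insert_separator(chunks, sep):
        return [ele for sublist in zip(chunks, [sep] * len(chunks)) for ele in sublist][:-1]

    prompt_chunks = [[1, 20], [1, 9]]   # tokenized text around '<image>'; bos id assumed to be 1
    IMAGE_TOKEN_INDEX = -200            # placeholder sentinel value
    input_ids, offset = [1], 1          # keep one leading bos token
    for x in insert_separator(prompt_chunks, [IMAGE_TOKEN_INDEX] * (offset + 1)):
        input_ids.extend(x[offset:])
    print(input_ids)                    # -> [1, 20, -200, 9]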
@@ -0,0 +1,17 @@
+ import torch
+ from abc import abstractmethod
+ from typing import List
+
+ from ..model import ScoreModel
+
+
+ class VQAScoreModel(ScoreModel):
+
+     @abstractmethod
+     def forward(self, images: List[str], texts: List[str], question_template: str,
+                 answer_template: str) -> torch.Tensor:
+         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+         question_template: a string with optional {} to be replaced with the 'text'
+         answer_template: a string with optional {} to be replaced with the 'text'
+         """
+         pass
@@ -0,0 +1,78 @@
+ import torch
+ import torch.nn as nn
+ from abc import abstractmethod
+ from torch.utils.data import DataLoader
+ from tqdm import tqdm
+ from typing import List, TypedDict, Union
+
+ from .constants import CACHE_DIR
+
+
+ class ImageTextDict(TypedDict):
+     images: List[str]
+     texts: List[str]
+
+
+ class Score(nn.Module):
+
+     def __init__(self, model: str, device: str = 'cuda', cache_dir: str = CACHE_DIR, **kwargs):
+         """Initialize the ScoreModel
+         """
+         super().__init__()
+         assert model in self.list_all_models()
+         self.device = device
+         self.model = self.prepare_scoremodel(model, device, cache_dir, **kwargs)
+
+     @abstractmethod
+     def prepare_scoremodel(self, model: str, device: str, cache_dir: str, **kwargs):
+         """Prepare the ScoreModel
+         """
+         pass
+
+     @abstractmethod
+     def list_all_models(self) -> List[str]:
+         """List all available models
+         """
+         pass
+
+     def forward(self, images: Union[str, List[str]], texts: Union[str, List[str]], **kwargs) -> List[float]:
+         """Return the similarity score(s) between the image(s) and the text(s)
+         If there are m images and n texts, return a m x n tensor
+         """
+         if type(images) == str:
+             images = [images]
+         if type(texts) == str:
+             texts = [texts]
+         assert len(images) == len(texts), 'Number of images and texts must match'
+         scores = []
+         for i, image in enumerate(images):
+             scores.append(self.model.forward([image] * len(texts), texts, **kwargs))
+         return scores
+
+     def batch_forward(self, dataset: List[ImageTextDict], batch_size: int = 16, **kwargs) -> torch.Tensor:
+         """Return the similarity score(s) between the image(s) and the text(s)
+         If there are m images and n texts, return a m x n tensor
+         """
+         num_samples = len(dataset)
+         num_images = len(dataset[0]['images'])
+         num_texts = len(dataset[0]['texts'])
+         scores = torch.zeros(num_samples, num_images, num_texts).to(self.device)
+
+         dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+         counter = 0
+         for batch_idx, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
+             cur_batch_size = len(batch['images'][0])
+             assert len(batch['images']) == num_images, \
+                 f"Number of image options in batch {batch_idx} is {len(batch['images'])}. Expected {num_images} images."
+             assert len(batch['texts']) == num_texts, \
+                 f"Number of text options in batch {batch_idx} is {len(batch['texts'])}. Expected {num_texts} texts."
+
+             for image_idx in range(num_images):
+                 images = batch['images'][image_idx]
+                 for text_idx in range(num_texts):
+                     texts = batch['texts'][text_idx]
+                     scores[counter:counter+cur_batch_size, image_idx, text_idx] = \
+                         self.model.forward(images, texts, **kwargs)
+
+             counter += cur_batch_size
+         return scores
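The `batch_forward` contract is easiest to read off from the shapes: every dataset entry must carry the same number of candidate images and texts, and the result is a `[num_samples, num_images, num_texts]` score tensor. A small shape sketch with placeholder paths and prompts, assuming `scorer` is a concrete `Score` subclass:

    dataset = [
        {'images': ['gen/0.jpeg'], 'texts': ['a red cube', 'a blue sphere']},
        {'images': ['gen/1.jpeg'], 'texts': ['a cat on a mat', 'a dog on a log']},
    ]
    scores = scorer.batch_forward(dataset, batch_size=2)
    print(scores.shape)  # torch.Size([2, 1, 2])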
@@ -0,0 +1,14 @@
+ from typing import List
+
+ from .constants import CACHE_DIR
+ from .models.vqascore_models import get_vqascore_model, list_all_vqascore_models
+ from .score import Score
+
+
+ class VQAScore(Score):
+
+     def prepare_scoremodel(self, model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
+         return get_vqascore_model(model, device=device, cache_dir=cache_dir, **kwargs)
+
+     def list_all_models(self) -> List[str]:
+         return list_all_vqascore_models()
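For orientation, a hedged usage sketch of the new scorer; it assumes the default `clip-flant5-xxl` weights can be fetched into the cache directory and that `VQAScore` is re-exported from the package's `__init__.py`:

    from evalscope.metrics.t2v_metrics import VQAScore  # export path assumed

    scorer = VQAScore(model='clip-flant5-xxl', device='cuda')
    # forward() asserts len(images) == len(texts); each image is scored against every text.
    scores = scorer(images=['outputs/cat.jpeg'], texts=['a photo of a cat'])
    print(scores)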
@@ -1,17 +1,53 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING

- from evalscope.models.base_adapter import BaseModelAdapter, initialize_model_adapter
- from evalscope.models.chat_adapter import ChatGenerationModelAdapter
- from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
- from evalscope.models.custom import CustomModel
- from evalscope.models.custom_adapter import CustomModelAdapter
- from evalscope.models.local_model import LocalModel, get_local_model
- from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
- from evalscope.models.register import get_model_adapter
- from evalscope.models.server_adapter import ServerModelAdapter
+ from evalscope.utils.import_utils import _LazyModule

- __all__ = [
-     'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
-     'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
-     'LocalModel', 'get_local_model', 'initialize_model_adapter', 'get_model_adapter'
- ]
+ if TYPE_CHECKING:
+     from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+                            CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
+                            initialize_model_adapter)
+     from .custom import CustomModel, DummyCustomModel
+     from .local_model import LocalModel, get_local_model
+     from .model import BaseModel, ChatBaseModel, OpenAIModel
+     from .register import get_model_adapter
+
+ else:
+     _import_structure = {
+         'adapters': [
+             'BaseModelAdapter',
+             'initialize_model_adapter',
+             'ChatGenerationModelAdapter',
+             'ContinuationLogitsModelAdapter',
+             'MultiChoiceModelAdapter',
+             'CustomModelAdapter',
+             'ServerModelAdapter',
+             'T2IModelAdapter',
+         ],
+         'custom': [
+             'CustomModel',
+             'DummyCustomModel',
+         ],
+         'local_model': [
+             'LocalModel',
+             'get_local_model',
+         ],
+         'model': [
+             'BaseModel',
+             'ChatBaseModel',
+             'OpenAIModel',
+         ],
+         'register': [
+             'get_model_adapter',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
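The net effect is that `import evalscope.models` becomes cheap: nothing under `adapters`, `custom`, and the other submodules is imported until an attribute is first accessed. A minimal PEP 562-style sketch of the mechanism (not the actual `_LazyModule` implementation from `evalscope.utils.import_utils`):

    import importlib

    _import_structure = {'adapters': ['ChatGenerationModelAdapter', 'T2IModelAdapter']}
    _attr_to_module = {name: mod for mod, names in _import_structure.items() for name in names}

    def __getattr__(name):  # module-level hook, called only for names not found normally
        module = _attr_to_module.get(name)
        if module is None:
            raise AttributeError(name)
        return getattr(importlib.import_module(f'.{module}', __name__), name)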
@@ -0,0 +1,17 @@
+ from .base_adapter import BaseModelAdapter, initialize_model_adapter
+ from .chat_adapter import ChatGenerationModelAdapter
+ from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
+ from .custom_adapter import CustomModelAdapter
+ from .server_adapter import ServerModelAdapter
+ from .t2i_adapter import T2IModelAdapter
+
+ __all__ = [
+     'initialize_model_adapter',
+     'BaseModelAdapter',
+     'ChatGenerationModelAdapter',
+     'ContinuationLogitsModelAdapter',
+     'MultiChoiceModelAdapter',
+     'CustomModelAdapter',
+     'ServerModelAdapter',
+     'T2IModelAdapter',
+ ]
@@ -3,19 +3,17 @@ from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING, Any, List, Optional, Union

  from evalscope.constants import EvalType, OutputType
- from evalscope.models.custom import CustomModel
- from evalscope.models.local_model import LocalModel
- from evalscope.models.register import get_model_adapter, register_model_adapter
  from evalscope.utils.logger import get_logger
+ from ..custom import CustomModel
+ from ..local_model import LocalModel

  logger = get_logger()

  if TYPE_CHECKING:
-     from evalscope.benchmarks import BenchmarkMeta
+     from evalscope.benchmarks import DataAdapter
      from evalscope.config import TaskConfig


- @register_model_adapter('base')
  class BaseModelAdapter(ABC):

      def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
@@ -39,12 +37,9 @@ class BaseModelAdapter(ABC):
          raise NotImplementedError


- def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta', base_model: 'LocalModel'):
+ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', base_model: 'LocalModel'):
      """Initialize the model adapter based on the task configuration."""
-     if task_cfg.dry_run:
-         from evalscope.models.model import DummyChatModel
-         return DummyChatModel(model_cfg=dict())
-     elif task_cfg.eval_type == EvalType.CUSTOM:
+     if task_cfg.eval_type == EvalType.CUSTOM:
          if not isinstance(task_cfg.model, CustomModel):
              raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
          from evalscope.models import CustomModelAdapter
@@ -66,13 +61,18 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta',
              stream=task_cfg.stream,
          )
      else:
+         from ..register import get_model_adapter
+
          # for local model, we need to determine the model adapter class based on the output type
-         model_adapter_cls = benchmark.model_adapter
-         if model_adapter_cls not in benchmark.output_types:
-             logger.warning(f'Output type {model_adapter_cls} is not supported for benchmark {benchmark.name}. '
+         model_adapter_cls_str = benchmark.model_adapter
+         if model_adapter_cls_str not in benchmark.output_types:
+             logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
                             f'Using {benchmark.output_types[0]} instead.')
-             model_adapter_cls = benchmark.output_types[0]
+             model_adapter_cls_str = benchmark.output_types[0]

-         model_adapter = get_model_adapter(model_adapter_cls)
-         return model_adapter(
-             model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
+         model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+         return model_adapter_cls(
+             model=base_model,
+             generation_config=task_cfg.generation_config,
+             chat_template=task_cfg.chat_template,
+             task_cfg=task_cfg)
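With the `@register_model_adapter(...)` decorators gone from the adapter modules, `benchmark.model_adapter` is now treated as a plain string that `get_model_adapter` resolves through the registry added in `evalscope/models/register.py`. A minimal registry of that shape (a sketch; the real module may differ):

    MODEL_ADAPTERS = {}

    def register_model_adapter(name):
        def decorator(cls):
            MODEL_ADAPTERS[name] = cls
            return cls
        return decorator

    def get_model_adapter(name):
        if name not in MODEL_ADAPTERS:
            raise ValueError(f'Unknown model adapter: {name}')
        return MODEL_ADAPTERS[name]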
@@ -3,18 +3,15 @@ import time
  import torch
  from typing import Any, Dict, List, Tuple, Union

- from evalscope.constants import OutputType
- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.local_model import LocalModel
- from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
  from evalscope.utils.logger import get_logger
  from evalscope.utils.model_utils import fix_do_sample_warning
+ from ..local_model import LocalModel
+ from .base_adapter import BaseModelAdapter

  logger = get_logger()


- @register_model_adapter(OutputType.GENERATION)
  class ChatGenerationModelAdapter(BaseModelAdapter):
      """
      Chat generation model adapter.
@@ -102,8 +99,14 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                  messages = [ChatMessage(role='user', content=query)]
                  if i < len(system_prompts) and system_prompts[i]:
                      messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
-                 formatted_prompts.append(
-                     self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+                 # whether thinking is needed
+                 chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+                 if chat_template_kwargs is not None:
+                     prompts = self.tokenizer.apply_chat_template(
+                         messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
+                 else:
+                     prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                 formatted_prompts.append(prompts)
          else:
              # For base model, use the queries as the input
              formatted_prompts = queries
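In practice this means anything placed under `chat_template_kwargs` in the inference config is forwarded verbatim to `tokenizer.apply_chat_template`. A hedged example of such a config (the Qwen3-style `enable_thinking` flag is shown only as one kwarg that some chat templates accept):

    infer_cfg = {
        'max_new_tokens': 512,
        'temperature': 0.0,
        # forwarded by the adapter as apply_chat_template(..., enable_thinking=False)
        'chat_template_kwargs': {'enable_thinking': False},
    }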
@@ -3,14 +3,11 @@ import time
  import torch
  from typing import List

- from evalscope.constants import OutputType
- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.local_model import LocalModel
- from evalscope.models.register import register_model_adapter
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+ from ..local_model import LocalModel
+ from .base_adapter import BaseModelAdapter


- @register_model_adapter(OutputType.MULTIPLE_CHOICE)
  class MultiChoiceModelAdapter(BaseModelAdapter):
      """ The multi-choice model adapter. """

@@ -113,7 +110,6 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          return log_probs, {'tokens': tokens}


- @register_model_adapter(OutputType.CONTINUOUS)
  class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
      """
      Continuation-logits model adapter.
@@ -1,11 +1,9 @@
  from typing import Any, Dict, List, Union

- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.custom import CustomModel
- from evalscope.models.register import register_model_adapter
+ from ..custom import CustomModel
+ from .base_adapter import BaseModelAdapter


- @register_model_adapter('custom')
  class CustomModelAdapter(BaseModelAdapter):

      def __init__(self, custom_model: CustomModel, **kwargs):
@@ -5,14 +5,12 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.register import register_model_adapter
  from evalscope.utils.logger import get_logger
+ from .base_adapter import BaseModelAdapter

  logger = get_logger()


- @register_model_adapter('server')
  class ServerModelAdapter(BaseModelAdapter):
      """
      Server model adapter to request remote API model and generate results.
@@ -0,0 +1,76 @@
+ import os
+ import time
+ import torch
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+ from evalscope.utils.io_utils import OutputsStructure
+ from evalscope.utils.logger import get_logger
+ from ..local_model import LocalModel
+ from .base_adapter import BaseModelAdapter
+
+ logger = get_logger()
+
+
+ class T2IModelAdapter(BaseModelAdapter):
+     """
+     Text to image model adapter.
+     """
+
+     def __init__(self, model: LocalModel, **kwargs):
+         super().__init__(model)
+
+         self.task_config = kwargs.get('task_cfg', None)
+         assert self.task_config is not None, 'Task config is required for T2I model adapter.'
+
+         self.save_path = os.path.join(self.task_config.work_dir, OutputsStructure.PREDICTIONS_DIR,
+                                       self.task_config.model_id, 'images')
+         os.makedirs(self.save_path, exist_ok=True)
+
+     def _model_generate(self, prompt, infer_cfg=None) -> List:
+         """
+         Generate images from the model.
+         Args:
+             prompt: The input prompt.
+             infer_cfg: The inference configuration.
+         Returns:
+             The generated images.
+         """
+         infer_cfg = infer_cfg or {}
+
+         sample = self.model(prompt=prompt, **infer_cfg).images
+         return sample
+
+     @torch.no_grad()
+     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+         """
+         Args:
+             inputs: The input data.
+             infer_cfg: The inference configuration.
+         Returns:
+             The prediction results.
+         """
+         results = []
+         for input_item in inputs:
+             prompt = input_item['data'][0]
+             image_id = input_item.get('id') or input_item.get('index')
+
+             samples = self._model_generate(prompt, infer_cfg)
+
+             choices_list = []
+             for index, sample in enumerate(samples):
+                 image_file_path = os.path.join(self.save_path, f'{image_id}_{index}.jpeg')
+                 sample.save(image_file_path)
+                 logger.debug(f'Saved image to {image_file_path}')
+
+                 choice = ChatCompletionResponseChoice(
+                     index=index, message=ChatMessage(content=image_file_path, role='assistant'), finish_reason='stop')
+                 choices_list.append(choice)
+
+             res_d = ChatCompletionResponse(
+                 model=self.model_id, choices=choices_list, object='images.generations',
+                 created=int(time.time())).model_dump(exclude_unset=True)
+
+             results.append(res_d)
+
+         return results
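Seen from the caller's side, `predict` takes prompt records and returns chat-completion style dicts whose message content is the path of each saved image. A hedged sketch (the `t2i_adapter` instance and the diffusers-style pipeline behind `self.model` are assumptions, not part of the diff):

    inputs = [{'id': 'sample_0', 'data': ['a watercolor painting of a lighthouse at dusk']}]
    results = t2i_adapter.predict(inputs, infer_cfg={'num_inference_steps': 30})

    # image saved under <work_dir>/<predictions_dir>/<model_id>/images/sample_0_0.jpeg
    print(results[0]['choices'][0]['message']['content'])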
@@ -1,3 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.models.custom.custom_model import *
+ from .custom_model import CustomModel
+ from .dummy_model import DummyCustomModel
@@ -1,11 +1,10 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import os
  import time
+ from typing import List

- from evalscope.models.custom import CustomModel
- from evalscope.run import run_task
- from evalscope.utils.io_utils import yaml_to_dict
  from evalscope.utils.logger import get_logger
+ from .custom_model import CustomModel

  logger = get_logger()
  """
@@ -15,29 +14,25 @@ This script is used to rewrite the evaluation results without re-running the mod


  class DummyCustomModel(CustomModel):

-     def __init__(self, config: dict, **kwargs):
+     def __init__(self, config: dict = {'model_id': 'dummy-model'}, **kwargs):
          super(DummyCustomModel, self).__init__(config=config, **kwargs)

-     def predict(self, prompts: str, **kwargs):
+     def predict(self, prompts: List[dict], **kwargs):
          # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.

-         response = 'The answer is C. NOTE: ONLY FOR TEST'
+         response = ''

          res_d: dict = {
              'choices': [{
                  'index': 0,
                  'message': {
-                     # 'content': f'The answer is B. Raw prompt: {prompt}',
                      'content': response,
                      'role': 'assistant'
                  }
              }],
-             'created':
-             time.time(),
-             'model':
-             self.config.get('model_id'),  # should be model_id
-             'object':
-             'chat.completion',
+             'created': time.time(),
+             'model': self.config.get('model_id'),  # should be model_id
+             'object': 'chat.completion',
              'usage': {
                  'completion_tokens': 0,
                  'prompt_tokens': 0,
@@ -49,6 +44,9 @@ class DummyCustomModel(CustomModel):


  if __name__ == '__main__':
+     from evalscope.run import run_task
+     from evalscope.utils.io_utils import yaml_to_dict
+
      # step1: if the outputs directory has been moved, update the path settings in configs/task_output_config.yaml under outputs/eval_xxx
      # step2: run this script; use_cache=True is the default, so eval results are refreshed without re-running model inference