evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.

This release has been flagged as potentially problematic.

Files changed (181)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
  24. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  25. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  26. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  27. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  29. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  30. evalscope/benchmarks/utils.py +7 -16
  31. evalscope/cli/start_app.py +1 -1
  32. evalscope/collections/evaluator.py +16 -4
  33. evalscope/config.py +7 -3
  34. evalscope/constants.py +11 -0
  35. evalscope/evaluator/evaluator.py +9 -3
  36. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  37. evalscope/metrics/__init__.py +49 -4
  38. evalscope/metrics/llm_judge.py +1 -1
  39. evalscope/metrics/named_metrics.py +13 -0
  40. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  41. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  42. evalscope/metrics/t2v_metrics/constants.py +12 -0
  43. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  44. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  53. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  60. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  61. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  62. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  139. evalscope/metrics/t2v_metrics/score.py +78 -0
  140. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  141. evalscope/models/__init__.py +50 -14
  142. evalscope/models/adapters/__init__.py +17 -0
  143. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  144. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  145. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  146. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  147. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  148. evalscope/models/adapters/t2i_adapter.py +76 -0
  149. evalscope/models/custom/__init__.py +2 -1
  150. evalscope/models/custom/dummy_model.py +11 -13
  151. evalscope/models/local_model.py +82 -33
  152. evalscope/models/model.py +2 -42
  153. evalscope/models/register.py +26 -0
  154. evalscope/perf/benchmark.py +4 -3
  155. evalscope/perf/main.py +4 -2
  156. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  157. evalscope/perf/utils/benchmark_util.py +2 -2
  158. evalscope/perf/utils/db_util.py +16 -8
  159. evalscope/report/__init__.py +1 -0
  160. evalscope/report/app.py +117 -67
  161. evalscope/report/app_arguments.py +11 -0
  162. evalscope/report/generator.py +1 -1
  163. evalscope/run.py +3 -3
  164. evalscope/third_party/thinkbench/eval.py +19 -7
  165. evalscope/utils/chat_service.py +2 -2
  166. evalscope/utils/import_utils.py +66 -0
  167. evalscope/utils/utils.py +12 -4
  168. evalscope/version.py +2 -2
  169. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
  170. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
  171. tests/aigc/__init__.py +1 -0
  172. tests/aigc/test_t2i.py +87 -0
  173. tests/cli/test_run.py +20 -7
  174. tests/perf/test_perf.py +6 -3
  175. evalscope/metrics/code_metric.py +0 -98
  176. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  177. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
  179. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
  180. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
  181. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py (new file)
@@ -0,0 +1,218 @@
+ import torch
+ from typing import List
+
+ from ...constants import CACHE_DIR, CONTEXT_LEN, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, SYSTEM_MSG
+ from .clip_t5.model import CLIPT5ForConditionalGeneration, ModelArguments
+ from .mm_utils import expand2square, load_pretrained_model, t5_tokenizer_image_token
+ from .vqa_model import VQAScoreModel
+
+ default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+ default_answer_template = 'Yes'
+
+
+ def format_question(question, conversation_style='plain'):
+     if conversation_style == 't5_plain':  # for 1st stage t5 model
+         question = DEFAULT_IMAGE_TOKEN + question
+     elif conversation_style == 't5_chat':  # for 2nd stage t5 model
+         question = SYSTEM_MSG + ' USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+     elif conversation_style == 't5_chat_no_system':  # for 2nd stage t5 model
+         question = 'USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+     elif conversation_style == 't5_chat_no_system_no_user':  # for 2nd stage t5 model
+         question = '' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' : '
+     # elif conversation_style == 't5_chat_ood_system':  # for 2nd stage t5 model
+     #     question = SYSTEM_MSG + " HUMAN: " + DEFAULT_IMAGE_TOKEN + "\n" + question + " GPT: "
+     else:
+         raise NotImplementedError()
+     return question
+
+
+ def format_answer(answer, conversation_style='plain'):
+     return answer
+
+
+ CLIP_T5_MODELS = {
+     # We recommend using 'clip-flant5-xxl' for maximal performance.
+     # If you want to use a smaller model, we recommend using 'clip-flant5-xl'.
+     'clip-flant5-xxl': {
+         'tokenizer': {
+             'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+             'model_max_length': CONTEXT_LEN,
+         },
+         'model': {
+             'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+             'conversation': 't5_chat',
+             'image_aspect_ratio': 'pad',
+         },
+     },
+     'clip-flant5-xl': {
+         'tokenizer': {
+             'path': 'zhiqiulin/clip-flant5-xl',
+             'model_max_length': CONTEXT_LEN,
+         },
+         'model': {
+             'path': 'zhiqiulin/clip-flant5-xl',
+             'conversation': 't5_chat',
+             'image_aspect_ratio': 'pad',
+         },
+     },
+ }
+
+
+ class CLIPT5Model(VQAScoreModel):
+     """A wrapper for the CLIP-FlanT5 or CLIP-T5 models"""
+
+     def __init__(self, model_name='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR):
+         assert model_name in CLIP_T5_MODELS
+         super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+     def load_model(self):
+         """Load the model, tokenizer, image transform
+         """
+         model_args = ModelArguments()
+         model_max_length = CLIP_T5_MODELS[self.model_name]['tokenizer']['model_max_length'] \
+             if 'model_max_length' in CLIP_T5_MODELS[self.model_name]['tokenizer'] else None
+         padding_side = CLIP_T5_MODELS[self.model_name]['tokenizer']['padding_side'] \
+             if 'padding_side' in CLIP_T5_MODELS[self.model_name]['tokenizer'] else None
+         mmprojector_repo = CLIP_T5_MODELS[self.model_name]['model']['mmprojector_repo'] \
+             if 'mmprojector_repo' in CLIP_T5_MODELS[self.model_name]['model'] else None
+         mmprojector_name = CLIP_T5_MODELS[self.model_name]['model']['mmprojector_name'] \
+             if 'mmprojector_name' in CLIP_T5_MODELS[self.model_name]['model'] else None
+
+         # default is 'pad'
+         # stage-1 models use 'square'
+         self.image_aspect_ratio = CLIP_T5_MODELS[self.model_name]['model']['image_aspect_ratio'] \
+             if 'image_aspect_ratio' in CLIP_T5_MODELS[self.model_name]['model'] else 'pad'
+
+         self.conversational_style = CLIP_T5_MODELS[self.model_name]['model']['conversation']
+
+         self.context_len = CONTEXT_LEN
+
+         self.tokenizer, self.model, self.image_processor = load_pretrained_model(
+             CLIPT5ForConditionalGeneration,
+             model_args,
+             model_path=CLIP_T5_MODELS[self.model_name]['model']['path'],
+             tokenizer_path=CLIP_T5_MODELS[self.model_name]['tokenizer']['path'],
+             model_max_length=model_max_length,
+             padding_side=padding_side,
+             image_aspect_ratio=self.image_aspect_ratio,
+             mmprojector_repo=mmprojector_repo,
+             mmprojector_name=mmprojector_name,
+             device=self.device,
+             cache_dir=self.cache_dir)
+
+     def load_images(self, image: List[str]) -> torch.Tensor:
+         """Load the image(s), and return a tensor (after preprocessing) put on self.device
+         """
+         image = [self.image_loader(x) for x in image]
+         if self.image_aspect_ratio == 'pad':
+             image = [
+                 expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean)) for image in image
+             ]
+         image = [self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in image]
+         assert all(x.shape == image[0].shape for x in image)
+         image = torch.stack(image, dim=0).to(self.device)
+         return image
+
+     @torch.no_grad()
+     @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+     def forward(self,
+                 images: List[str],
+                 texts: List[str],
+                 question_template: str = default_question_template,
+                 answer_template: str = default_answer_template) -> torch.Tensor:
+         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+         """
+         assert len(images) == len(texts), 'Number of images and texts must match'
+         # Turn "a photo of a dog" into
+         # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+         # A: "Yes"
+         questions = [question_template.format(text) for text in texts]
+         answers = [answer_template.format(text) for text in texts]
+
+         # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+         questions = [format_question(question, conversation_style=self.conversational_style) for question in questions]
+         answers = [format_answer(answer, conversation_style=self.conversational_style) for answer in answers]
+
+         images = self.load_images(images)
+
+         input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+         labels = [t5_tokenizer_image_token(ans, self.tokenizer, return_tensors='pt') for ans in answers]
+
+         input_ids = torch.nn.utils.rnn.pad_sequence(
+             input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+         labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+         input_ids = input_ids[:, :self.tokenizer.model_max_length]
+         labels = labels[:, :self.tokenizer.model_max_length]
+
+         attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+         decoder_attention_mask = labels.ne(IGNORE_INDEX)
+
+         input_ids, attention_mask, decoder_attention_mask, labels = input_ids.to(self.device), \
+             attention_mask.to(self.device), decoder_attention_mask.to(self.device), labels.to(self.device)
+         model_input_kwargs = {
+             'input_ids': input_ids,
+             'attention_mask': attention_mask,
+             'decoder_attention_mask': decoder_attention_mask,
+             'labels': labels,
+             'images': images,
+             'past_key_values': None,
+             'inputs_embeds': None,
+             'use_cache': None,
+             'output_attentions': None,
+             'output_hidden_states': None,
+             'return_dict': True,
+         }
+
+         outputs = self.model(**model_input_kwargs)
+
+         logits = outputs.logits
+         lm_prob = torch.zeros(logits.shape[0])
+         loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
+         for k in range(lm_prob.shape[0]):
+             lm_prob[k] = (
+                 -loss_fct(logits[k], labels[k])).exp()  # exp to cancel the log and get raw prob between 0 and 1
+         return lm_prob
+
+     @torch.no_grad()
+     @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+     def generate(
+         self,
+         images: List[str],
+         prompts: List[str],
+         temperature: float = 0.2,
+     ):
+         """Forward pass of the model to return n strings for n (image, prompt) pairs
+         """
+         assert len(images) == len(prompts), 'Number of images and texts must match'
+
+         # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+         questions = [format_question(prompt, conversation_style=self.conversational_style) for prompt in prompts]
+         images = self.load_images(images)
+
+         input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+         input_ids = torch.nn.utils.rnn.pad_sequence(
+             input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+         input_ids = input_ids[:, :self.tokenizer.model_max_length]
+
+         attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+
+         input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
+         model_input_kwargs = {
+             'inputs': input_ids,
+             'images': images,
+             'attention_mask': attention_mask,
+             'do_sample': True if temperature > 0 else False,
+             'temperature': temperature,
+             'top_p': None,
+             'num_beams': 1,
+             'max_new_token': 1024,
+             'use_cache': True,
+         }
+
+         outputs = self.model.generate(**model_input_kwargs)
+         outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+         for i in range(len(outputs)):
+             if outputs[i].endswith(' '):
+                 outputs[i] = outputs[i][:-1]
+             outputs[i] = outputs[i].strip()
+         return outputs
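
For context, the CLIPT5Model added above scores each (image, text) pair by asking the CLIP-FlanT5 decoder 'Does this figure show "{text}"? Please answer yes or no.' and returning exp of the negative mean cross-entropy over the templated answer tokens, i.e. roughly the probability of the model answering 'Yes'. The following is a minimal, hedged usage sketch of the wrapper exactly as shown in this hunk; the image path and prompt are illustrative, and a CUDA GPU plus the optional t2v dependencies are assumed (in practice the higher-level entry points in evalscope/metrics/t2v_metrics/ may be preferable):

    # Hedged sketch: direct use of the wrapper from the hunk above.
    from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5_model import CLIPT5Model

    model = CLIPT5Model(model_name='clip-flant5-xl', device='cuda')  # smaller of the two registered checkpoints
    scores = model.forward(
        images=['outputs/sample_0.png'],   # illustrative path to a generated image
        texts=['a photo of a dog'],        # the text-to-image prompt being verified
    )
    print(scores)  # 1-D tensor of probabilities in [0, 1], one per (image, text) pair
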
evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py (new file)
@@ -0,0 +1,150 @@
+ import base64
+ import os
+ import tiktoken
+ import torch
+ from openai import OpenAI
+ from typing import List
+
+ from .vqa_model import VQAScoreModel
+
+ default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+ default_answer_template = 'Yes'
+
+ GPT4V_MODELS = {
+     # We recommend using 'gpt-4-turbo' for optimal performance.
+     'gpt-4-turbo': {},
+     'gpt-4o': {},
+ }
+
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, 'rb') as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def get_image_type(image_path):
+     image_type = image_path.split('.')[-1]
+     assert image_type in ['png', 'jpeg', 'jpg', 'gif', 'bmp', 'webp']
+     return image_type
+
+
+ class GPT4VModel(VQAScoreModel):
+     """A wrapper for the GPT4V models"""
+
+     def __init__(self, model_name='gpt-4-turbo', device='cuda', cache_dir=None, openai_key=None, top_logprobs=2):
+         assert model_name in GPT4V_MODELS
+         assert openai_key is not None, 'Please provide an OpenAI API key'
+         self.openai_key = openai_key
+         self.top_logprobs = top_logprobs
+         super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+     def load_model(self):
+         """Load the model, tokenizer, image transform
+         """
+         self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+         self.client = OpenAI(api_key=self.openai_key)
+         # self.candidate_answers = GPT4V_MODELS[self.model_name]['candidate_answers']
+         # assert GPT4V_MODELS[self.model_name]['answer'] in self.candidate_answers
+         # self.candidate_tokens = []
+         # for ans in self.candidate_answers:
+         #     token = self.tokenizer.encode(ans)
+         #     assert len(token) == 1, "Currently only support single token answers"
+         #     self.candidate_tokens.append(token[0])
+
+     def load_images(self, image: List[str]) -> torch.Tensor:
+         """Load the image(s), and return the string
+         """
+         image = [{'path': img, 'type': get_image_type(img), 'base64': encode_image(img)} for img in image]
+         return image
+
+     def forward_single(self, image, question, answer):
+         try:
+             completion = self.client.chat.completions.create(
+                 model=self.model_name,
+                 messages=[{
+                     'role':
+                     'user',
+                     'content': [{
+                         'type': 'text',
+                         'text': question
+                     }, {
+                         'type': 'image_url',
+                         'image_url': {
+                             'url': f"data:image/{image['type']};base64,{image['base64']}"
+                         }
+                     }]
+                 }],
+                 logprobs=True,
+                 top_logprobs=self.top_logprobs,
+                 # logit_bias={yes_token:50, no_token:50}
+             )
+         except:
+             print(
+                 f"Warning: completion not generated for image: {image['path']} and question: {question} and answer: {answer}"
+             )
+             print(f'Trying again with the same image')
+             try:
+                 completion = self.client.chat.completions.create(
+                     model=self.model_name,
+                     messages=[{
+                         'role':
+                         'user',
+                         'content': [{
+                             'type': 'text',
+                             'text': question
+                         }, {
+                             'type': 'image_url',
+                             'image_url': {
+                                 'url': f"data:image/{image['type']};base64,{image['base64']}"
+                             }
+                         }]
+                     }],
+                     logprobs=True,
+                     top_logprobs=self.top_logprobs,
+                 )
+             except:
+                 print(f"Failed image: {image['path']} and question: {question} and answer: {answer}")
+                 return torch.Tensor([0.0])
+
+         # print(completion.choices[0].message)
+         # print(completion.choices[0].logprobs)
+         # print(completion.choices[0].logprobs.content[0])
+         is_generated = False
+         for top_logprob in completion.choices[0].logprobs.content[0].top_logprobs:
+             if top_logprob.token == answer:
+                 is_generated = True
+                 return torch.Tensor([top_logprob.logprob]).exp()
+         if not is_generated:
+             print(
+                 f"Warning: answer not generated for image: {image['path']} and question: {question} and answer: {answer}"
+             )
+             print(completion.choices[0].logprobs.content[0].top_logprobs)
+             return torch.Tensor([0.0])
+
+     def forward(self,
+                 images: List[str],
+                 texts: List[str],
+                 question_template: str = default_question_template,
+                 answer_template: str = default_answer_template) -> torch.Tensor:
+         """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
+         """
+         assert len(images) == len(texts), 'Number of images and texts must match'
+         # Turn "a photo of a dog" into
+         # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+         # A: "Yes"
+         questions = [question_template.format(text) for text in texts]
+         answers = [answer_template.format(text) for text in texts]
+
+         for ans in answers:
+             ans_tokens = self.tokenizer.encode(ans)
+             assert len(ans_tokens) == 1, 'Currently only support single token answers'
+
+         images = self.load_images(images)
+
+         lm_prob = torch.zeros(len(images))
+
+         for idx, (image, question, answer) in enumerate(zip(images, questions, answers)):
+             lm_prob[idx] = self.forward_single(image, question, answer)
+
+         return lm_prob
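
GPT4VModel implements the same yes/no probe through the OpenAI Chat Completions API: it sends the templated question together with the base64-encoded image, requests logprobs, and returns exp(logprob) of the single-token answer when it appears among the top_logprobs of the first generated token, or 0.0 otherwise. A hedged usage sketch of the class as shown in this hunk (requires a valid OpenAI key and network access; paths and key are placeholders):

    # Hedged sketch of the API-backed scorer above; each call costs real API usage.
    from evalscope.metrics.t2v_metrics.models.vqascore_models.gpt4v_model import GPT4VModel

    model = GPT4VModel(model_name='gpt-4o', openai_key='sk-...', top_logprobs=5)
    scores = model.forward(images=['outputs/sample_0.png'], texts=['a photo of a dog'])
    # scores[i] is exp(logprob) of the single-token answer 'Yes', or 0.0 when that
    # token is absent from the top_logprobs returned for the first generated token.
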
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py (new file)
@@ -0,0 +1,26 @@
+ """
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+
+ import os
+ import sys
+ from omegaconf import OmegaConf
+
+ from .common.registry import registry
+ from .models import *
+ from .processors import *
+
+ root_dir = os.path.dirname(os.path.abspath(__file__))
+ default_cfg = OmegaConf.load(os.path.join(root_dir, 'configs/default.yaml'))
+
+ registry.register_path('library_root', root_dir)
+ repo_root = os.path.join(root_dir, '..')
+ registry.register_path('repo_root', repo_root)
+ cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+ registry.register_path('cache_root', cache_root)
+
+ registry.register('MAX_INT', sys.maxsize)
+ registry.register('SPLIT_NAMES', ['train', 'val', 'test'])
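
The vendored lavis/__init__.py registers library-wide paths and constants in a global registry at import time so that the bundled model and processor builders can resolve them without passing configuration around. A small sketch of reading those values back, assuming the standard LAVIS registry API (get_path / get) is present in the vendored copy of registry.py:

    # Hedged sketch: reading back values registered at import time.
    from evalscope.metrics.t2v_metrics.models.vqascore_models.lavis.common.registry import registry

    cache_root = registry.get_path('cache_root')   # resolved relative to the vendored package
    max_int = registry.get('MAX_INT')              # sys.maxsize, registered above
    print(cache_root, max_int)
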