evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py
@@ -0,0 +1,306 @@
+ # Copyright 2023 Zhiqiu Lin
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ from dataclasses import dataclass, field
+ from transformers import AutoConfig, AutoModelForSeq2SeqLM, T5Config, T5ForConditionalGeneration
+ from transformers.modeling_outputs import Seq2SeqLMOutput
+ from typing import List, Optional, Tuple, Union
+
+ from ..multimodal_encoder.builder import build_vision_tower
+ from ..multimodal_projector.builder import build_vision_projector
+
+ IMAGE_TOKEN_INDEX = -200
+
+
+ @dataclass
+ class ModelArguments:
+     tune_mm_mlp_adapter: bool = field(default=False)
+     vision_tower: Optional[str] = field(default='openai/clip-vit-large-patch14-336')
+     mm_vision_select_layer: Optional[int] = field(default=-2)  # default to the second last layer in llava1.5
+     pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
+     mm_projector_type: Optional[str] = field(default='mlp2x_gelu')
+     mm_vision_select_feature: Optional[str] = field(default='patch')
+
+
+ class CLIPT5Config(T5Config):
+     model_type = 'clip_t5'
+
+
+ class CLIPT5ForConditionalGeneration(T5ForConditionalGeneration):
+     # This class supports both T5 and FlanT5
+     config_class = CLIPT5Config
+
+     def __init__(self, config):
+         super(CLIPT5ForConditionalGeneration, self).__init__(config)
+         self.embed_tokens = self.encoder.embed_tokens
+         if hasattr(config, 'mm_vision_tower'):
+             self.vision_tower = build_vision_tower(config, delay_load=False)
+             self.mm_projector = build_vision_projector(config)
+
+     def get_vision_tower(self):
+         vision_tower = getattr(self, 'vision_tower', None)
+         if type(vision_tower) is list:
+             vision_tower = vision_tower[0]
+         return vision_tower
+
+     def get_model(self):
+         return self  # for compatibility with LlavaMetaForCausalLM
+
+     def prepare_inputs_labels_for_multimodal(
+         self, input_ids, attention_mask, decoder_attention_mask, past_key_values, labels, images
+     ):
+         # The labels are now separated from the input_ids.
+         vision_tower = self.get_vision_tower()
+         if vision_tower is None or images is None or input_ids.shape[1] == 1:
+             raise NotImplementedError()
+
+         if type(images) is list or images.ndim == 5:
+             concat_images = torch.cat([image for image in images], dim=0)
+             image_features = self.encode_images(concat_images)
+             split_sizes = [image.shape[0] for image in images]
+             image_features = torch.split(image_features, split_sizes, dim=0)
+             image_features = [x.flatten(0, 1) for x in image_features]
+         else:
+             image_features = self.encode_images(images)
+
+         new_input_embeds = []
+         cur_image_idx = 0
+         for _, cur_input_ids in enumerate(input_ids):
+             if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
+                 # multimodal LLM, but the current sample is not multimodal
+                 raise NotImplementedError()
+             image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+             cur_new_input_embeds = []
+             while image_token_indices.numel() > 0:
+                 cur_image_features = image_features[cur_image_idx]
+                 image_token_start = image_token_indices[0]
+                 cur_new_input_embeds.append(self.embed_tokens(cur_input_ids[:image_token_start]))
+                 cur_new_input_embeds.append(cur_image_features)
+                 cur_image_idx += 1
+                 cur_input_ids = cur_input_ids[image_token_start + 1:]
+                 image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+             if cur_input_ids.numel() > 0:
+                 cur_new_input_embeds.append(self.embed_tokens(cur_input_ids))
+             cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
+             cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
+             new_input_embeds.append(cur_new_input_embeds)
+
+         if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
+             max_len = max(x.shape[0] for x in new_input_embeds)
+
+             new_input_embeds_align = []
+             _input_embeds_lengths = []
+             for cur_new_embed in new_input_embeds:
+                 _input_embeds_lengths.append(cur_new_embed.shape[0])
+                 cur_new_embed = torch.cat((
+                     cur_new_embed,
+                     torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]),
+                                 dtype=cur_new_embed.dtype,
+                                 device=cur_new_embed.device)
+                 ),
+                                           dim=0)
+                 new_input_embeds_align.append(cur_new_embed)
+             new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
+
+             if attention_mask is not None:
+                 new_attention_mask = []
+                 for cur_attention_mask, _input_embeds_length in zip(attention_mask, _input_embeds_lengths):
+                     new_attn_mask_pad_left = torch.full((_input_embeds_length - input_ids.shape[1], ),
+                                                         True,
+                                                         dtype=attention_mask.dtype,
+                                                         device=attention_mask.device)
+                     new_attn_mask_pad_right = torch.full((new_input_embeds.shape[1] - _input_embeds_length, ),
+                                                          False,
+                                                          dtype=attention_mask.dtype,
+                                                          device=attention_mask.device)
+                     cur_new_attention_mask = torch.cat(
+                         (new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0
+                     )
+                     new_attention_mask.append(cur_new_attention_mask)
+                 attention_mask = torch.stack(new_attention_mask, dim=0)
+                 assert attention_mask.shape == new_input_embeds.shape[:2]
+         else:
+             new_input_embeds = torch.stack(new_input_embeds, dim=0)
+
+             if attention_mask is not None:
+                 new_attn_mask_pad_left = torch.full(
+                     (attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]),
+                     True,
+                     dtype=attention_mask.dtype,
+                     device=attention_mask.device
+                 )
+                 attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
+                 assert attention_mask.shape == new_input_embeds.shape[:2]
+
+         return None, attention_mask, decoder_attention_mask, past_key_values, new_input_embeds, labels
+
+     def encode_images(self, images):
+         image_features = self.get_vision_tower()(images)
+         image_features = self.mm_projector(image_features)
+         return image_features
+
+     def initialize_vision_modules(self, model_args, fsdp=None):
+         vision_tower = model_args.vision_tower
+         mm_vision_select_layer = model_args.mm_vision_select_layer
+         mm_vision_select_feature = model_args.mm_vision_select_feature
+         pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+         self.config.mm_vision_tower = vision_tower
+         self.config.pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter
+
+         if self.get_vision_tower() is None:
+             vision_tower = build_vision_tower(model_args)
+
+             if fsdp is not None and len(fsdp) > 0:
+                 self.vision_tower = [vision_tower]
+             else:
+                 self.vision_tower = vision_tower
+         else:
+             if fsdp is not None and len(fsdp) > 0:
+                 vision_tower = self.vision_tower[0]
+             else:
+                 vision_tower = self.vision_tower
+             if not vision_tower.is_loaded:
+                 vision_tower.load_model()
+
+         self.config.use_mm_proj = True
+         self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'mlp2x_gelu')
+         self.config.mm_hidden_size = vision_tower.hidden_size
+         self.config.mm_vision_select_layer = mm_vision_select_layer
+         self.config.mm_vision_select_feature = mm_vision_select_feature
+
+         if getattr(self, 'mm_projector', None) is None:
+             self.mm_projector = build_vision_projector(self.config)
+
+         if pretrain_mm_mlp_adapter is not None:
+             mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+
+             def get_w(weights, keyword):
+                 return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+             self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         decoder_attention_mask: Optional[torch.Tensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs,
+     ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if inputs_embeds is None:
+             _, attention_mask, decoder_attention_mask, past_key_values, inputs_embeds, labels = \
+                 self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, decoder_attention_mask, past_key_values, labels, images)
+
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = super(CLIPT5ForConditionalGeneration, self).forward(
+             input_ids=None,  # will be None if inputs_embeds is not None
+             attention_mask=attention_mask,
+             decoder_attention_mask=decoder_attention_mask,
+             labels=labels,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             **kwargs,
+         )
+
+         return outputs
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         **kwargs,
+     ):
+         assert images is not None, 'images must be provided'
+         assert inputs is not None, 'inputs must be provided'
+         assert attention_mask is not None, 'attention_mask must be provided'
+         _, attention_mask, _, _, inputs_embeds, _ = \
+             self.prepare_inputs_labels_for_multimodal(inputs, attention_mask, None, None, None, images)
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = super(CLIPT5ForConditionalGeneration, self).generate(
+             input_ids=None,  # will be None if inputs_embeds is not None
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+         )
+         return outputs
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         head_mask=None,
+         decoder_head_mask=None,
+         decoder_attention_mask=None,
+         cross_attn_head_mask=None,
+         use_cache=None,
+         encoder_outputs=None,
+         inputs_embeds=None,
+         **kwargs,
+     ):
+         # cut decoder_input_ids if past_key_values is used
+         if past_key_values is not None:
+             past_length = past_key_values[0][0].shape[2]
+
+             # Some generation methods already pass only the last input ID
+             if input_ids.shape[1] > past_length:
+                 remove_prefix_length = past_length
+             else:
+                 # Default to old behavior: keep only final ID
+                 remove_prefix_length = input_ids.shape[1] - 1
+
+             input_ids = input_ids[:, remove_prefix_length:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {'inputs_embeds': inputs_embeds}
+         else:
+             model_inputs = {'input_ids': input_ids}
+
+         model_inputs.update({
+             'decoder_input_ids': input_ids,
+             'past_key_values': past_key_values,
+             'encoder_outputs': encoder_outputs,
+             'attention_mask': attention_mask,
+             'head_mask': head_mask,
+             'decoder_head_mask': decoder_head_mask,
+             'decoder_attention_mask': decoder_attention_mask,
+             'cross_attn_head_mask': cross_attn_head_mask,
+             'use_cache': use_cache,
+         })
+         return model_inputs
+
+
+ AutoConfig.register('clip_t5', CLIPT5Config)
+ AutoModelForSeq2SeqLM.register(CLIPT5Config, CLIPT5ForConditionalGeneration)
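The two `register` calls at the end of this file hook the custom class into the Hugging Face auto classes, so a config with `model_type='clip_t5'` resolves to `CLIPT5Config` and `CLIPT5ForConditionalGeneration` without importing either class directly. A minimal sketch of what that enables, assuming the module above has already been imported so the registrations have run; the checkpoint path in the last line is a hypothetical placeholder, not a path shipped by the package:

from transformers import AutoConfig, AutoModelForSeq2SeqLM

config = AutoConfig.for_model('clip_t5')            # resolves to CLIPT5Config via the registry
model = AutoModelForSeq2SeqLM.from_config(config)   # builds a randomly initialized CLIPT5ForConditionalGeneration
# model = AutoModelForSeq2SeqLM.from_pretrained('/path/to/clip-flant5-checkpoint')  # hypothetical local checkpoint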
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py
@@ -0,0 +1,12 @@
+ import os
+
+ from .clip_encoder import CLIPVisionTower
+
+
+ def build_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+     is_absolute_path_exists = os.path.exists(vision_tower)
+     if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith('laion'):
+         return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+
+     raise ValueError(f'Unknown vision tower: {vision_tower}')
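`build_vision_tower` only needs an object exposing `mm_vision_tower` (or `vision_tower`) plus the attributes read by `CLIPVisionTower` in the next file. A hedged sketch with a plain namespace standing in for the real config object; the import path follows the file listing above and is assumed here to resolve as a module:

from types import SimpleNamespace

from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower='openai/clip-vit-large-patch14-336',
    mm_vision_select_layer=-2,
    mm_vision_select_feature='patch',
)
tower = build_vision_tower(cfg, delay_load=True)  # delay_load=True fetches only the CLIP config, not the weights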
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py
@@ -0,0 +1,84 @@
+ import torch
+ import torch.nn as nn
+ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
+
+
+ class CLIPVisionTower(nn.Module):
+
+     def __init__(self, vision_tower, args, delay_load=False):
+         super().__init__()
+
+         self.is_loaded = False
+
+         self.vision_tower_name = vision_tower
+         self.select_layer = args.mm_vision_select_layer
+         self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+         if not delay_load:
+             self.load_model()
+         else:
+             self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+     def load_model(self):
+         from .....utils import download_file
+         model_path = download_file(self.vision_tower_name.replace('openai', 'openai-mirror'))
+         self.image_processor = CLIPImageProcessor.from_pretrained(model_path)
+         self.vision_tower = CLIPVisionModel.from_pretrained(model_path)
+         self.vision_tower.requires_grad_(False)
+
+         self.is_loaded = True
+
+     def feature_select(self, image_forward_outs):
+         image_features = image_forward_outs.hidden_states[self.select_layer]
+         if self.select_feature == 'patch':
+             image_features = image_features[:, 1:]
+         elif self.select_feature == 'cls_patch':
+             image_features = image_features
+         else:
+             raise ValueError(f'Unexpected select feature: {self.select_feature}')
+         return image_features
+
+     @torch.no_grad()
+     def forward(self, images):
+         if type(images) is list:
+             image_features = []
+             for image in images:
+                 image_forward_out = self.vision_tower(
+                     image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True
+                 )
+                 image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                 image_features.append(image_feature)
+         else:
+             image_forward_outs = self.vision_tower(
+                 images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
+             )
+             image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+         return image_features
+
+     @property
+     def dummy_feature(self):
+         return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+     @property
+     def dtype(self):
+         return self.vision_tower.dtype
+
+     @property
+     def device(self):
+         return self.vision_tower.device
+
+     @property
+     def config(self):
+         if self.is_loaded:
+             return self.vision_tower.config
+         else:
+             return self.cfg_only
+
+     @property
+     def hidden_size(self):
+         return self.config.hidden_size
+
+     @property
+     def num_patches(self):
+         return (self.config.image_size // self.config.patch_size)**2
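For the default tower `openai/clip-vit-large-patch14-336`, the numbers behind `feature_select` and `num_patches` work out as in the short sketch below; the sizes are properties of that public CLIP checkpoint, not values defined in this diff:

image_size, patch_size, hidden_size = 336, 14, 1024   # CLIP ViT-L/14-336 vision encoder
num_patches = (image_size // patch_size) ** 2          # 24 ** 2 = 576
# the ViT emits 1 CLS token + 576 patch tokens per image;
# select_feature='patch' keeps [:, 1:], so feature_select returns tensors of shape (batch, 576, 1024)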
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py
@@ -0,0 +1,50 @@
+ import re
+ import torch
+ import torch.nn as nn
+
+
+ class IdentityMap(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
+     @property
+     def config(self):
+         return {'mm_projector_type': 'identity'}
+
+
+ class SimpleResBlock(nn.Module):
+
+     def __init__(self, channels):
+         super().__init__()
+         self.pre_norm = nn.LayerNorm(channels)
+
+         self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels))
+
+     def forward(self, x):
+         x = self.pre_norm(x)
+         return x + self.proj(x)
+
+
+ def build_vision_projector(config, delay_load=False, **kwargs):
+     projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+     if projector_type == 'linear':
+         return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+     mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+     if mlp_gelu_match:
+         mlp_depth = int(mlp_gelu_match.group(1))
+         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+         for _ in range(1, mlp_depth):
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+         return nn.Sequential(*modules)
+
+     if projector_type == 'identity':
+         return IdentityMap()
+
+     raise ValueError(f'Unknown projector type: {projector_type}')
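`mlp2x_gelu`, the default set in `ModelArguments` earlier in this diff, therefore expands to a two-layer GELU MLP. A sketch of the module it produces, using the CLIP ViT-L feature width from above and an assumed language-model hidden size as example dimensions; the import path follows the file listing and is an assumption of this sketch:

from types import SimpleNamespace

from evalscope.metrics.t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)
# equivalent to nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 4096))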
@@ -0,0 +1,223 @@
+import torch
+from typing import List
+
+from ...constants import CACHE_DIR, CONTEXT_LEN, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, SYSTEM_MSG
+from .clip_t5.model import CLIPT5ForConditionalGeneration, ModelArguments
+from .mm_utils import expand2square, load_pretrained_model, t5_tokenizer_image_token
+from .vqa_model import VQAScoreModel
+
+default_question_template = 'Does this figure show "{}"? Please answer yes or no.'
+default_answer_template = 'Yes'
+
+
+def format_question(question, conversation_style='plain'):
+    if conversation_style == 't5_plain':  # for 1st stage t5 model
+        question = DEFAULT_IMAGE_TOKEN + question
+    elif conversation_style == 't5_chat':  # for 2nd stage t5 model
+        question = SYSTEM_MSG + ' USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+    elif conversation_style == 't5_chat_no_system':  # for 2nd stage t5 model
+        question = 'USER: ' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' ASSISTANT: '
+    elif conversation_style == 't5_chat_no_system_no_user':  # for 2nd stage t5 model
+        question = '' + DEFAULT_IMAGE_TOKEN + '\n' + question + ' : '
+    # elif conversation_style == 't5_chat_ood_system':  # for 2nd stage t5 model
+    #     question = SYSTEM_MSG + " HUMAN: " + DEFAULT_IMAGE_TOKEN + "\n" + question + " GPT: "
+    else:
+        raise NotImplementedError()
+    return question
+
+
+def format_answer(answer, conversation_style='plain'):
+    return answer
+
+
+CLIP_T5_MODELS = {
+    # We recommend using 'clip-flant5-xxl' for maximal performance.
+    # If you want to use a smaller model, we recommend using 'clip-flant5-xl'.
+    'clip-flant5-xxl': {
+        'tokenizer': {
+            'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+            'model_max_length': CONTEXT_LEN,
+        },
+        'model': {
+            'path': 'AI-ModelScope/clip-flant5-xxl',  # zhiqiulin/clip-flant5-xxl
+            'conversation': 't5_chat',
+            'image_aspect_ratio': 'pad',
+        },
+    },
+    'clip-flant5-xl': {
+        'tokenizer': {
+            'path': 'zhiqiulin/clip-flant5-xl',
+            'model_max_length': CONTEXT_LEN,
+        },
+        'model': {
+            'path': 'zhiqiulin/clip-flant5-xl',
+            'conversation': 't5_chat',
+            'image_aspect_ratio': 'pad',
+        },
+    },
+}
+
+
+class CLIPT5Model(VQAScoreModel):
+    """A wrapper for the CLIP-FlanT5 or CLIP-T5 models"""
+
+    def __init__(self, model_name='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR):
+        assert model_name in CLIP_T5_MODELS
+        super().__init__(model_name=model_name, device=device, cache_dir=cache_dir)
+
+    def load_model(self):
+        """Load the model, tokenizer, image transform"""
+        model_args = ModelArguments()
+        model_max_length = CLIP_T5_MODELS[self.model_name]['tokenizer'].get('model_max_length')
+        padding_side = CLIP_T5_MODELS[self.model_name]['tokenizer'].get('padding_side')
+        mmprojector_repo = CLIP_T5_MODELS[self.model_name]['model'].get('mmprojector_repo')
+        mmprojector_name = CLIP_T5_MODELS[self.model_name]['model'].get('mmprojector_name')
+
+        # default is 'pad'
+        # stage-1 models use 'square'
+        self.image_aspect_ratio = CLIP_T5_MODELS[self.model_name]['model'].get('image_aspect_ratio', 'pad')
+
+        self.conversational_style = CLIP_T5_MODELS[self.model_name]['model']['conversation']
+
+        self.context_len = CONTEXT_LEN
+
+        self.tokenizer, self.model, self.image_processor = load_pretrained_model(
+            CLIPT5ForConditionalGeneration,
+            model_args,
+            model_path=CLIP_T5_MODELS[self.model_name]['model']['path'],
+            tokenizer_path=CLIP_T5_MODELS[self.model_name]['tokenizer']['path'],
+            model_max_length=model_max_length,
+            padding_side=padding_side,
+            image_aspect_ratio=self.image_aspect_ratio,
+            mmprojector_repo=mmprojector_repo,
+            mmprojector_name=mmprojector_name,
+            device=self.device,
+            cache_dir=self.cache_dir
+        )
+
+    def load_images(self, image: List[str]) -> torch.Tensor:
+        """Load the image(s), and return a tensor (after preprocessing) put on self.device"""
+        image = [self.image_loader(x) for x in image]
+        if self.image_aspect_ratio == 'pad':
+            image = [
+                expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean)) for image in image
+            ]
+        image = [self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] for image in image]
+        assert all(x.shape == image[0].shape for x in image)
+        image = torch.stack(image, dim=0).to(self.device)
+        return image
+
+    @torch.no_grad()
+    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+    def forward(
+        self,
+        images: List[str],
+        texts: List[str],
+        question_template: str = default_question_template,
+        answer_template: str = default_answer_template
+    ) -> torch.Tensor:
+        """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)"""
+        assert len(images) == len(texts), 'Number of images and texts must match'
+        # Turn "a photo of a dog" into
+        # Q: "Does this figure show "a photo of a dog"? Please answer yes or no."
+        # A: "Yes"
+        questions = [question_template.format(text) for text in texts]
+        answers = [answer_template.format(text) for text in texts]
+
+        # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+        questions = [format_question(question, conversation_style=self.conversational_style) for question in questions]
+        answers = [format_answer(answer, conversation_style=self.conversational_style) for answer in answers]
+
+        images = self.load_images(images)
+
+        input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+        labels = [t5_tokenizer_image_token(ans, self.tokenizer, return_tensors='pt') for ans in answers]
+
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+        input_ids = input_ids[:, :self.tokenizer.model_max_length]
+        labels = labels[:, :self.tokenizer.model_max_length]
+
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+        decoder_attention_mask = labels.ne(IGNORE_INDEX)
+
+        input_ids, attention_mask, decoder_attention_mask, labels = input_ids.to(self.device), \
+            attention_mask.to(self.device), decoder_attention_mask.to(self.device), labels.to(self.device)
+        model_input_kwargs = {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'decoder_attention_mask': decoder_attention_mask,
+            'labels': labels,
+            'images': images,
+            'past_key_values': None,
+            'inputs_embeds': None,
+            'use_cache': None,
+            'output_attentions': None,
+            'output_hidden_states': None,
+            'return_dict': True,
+        }
+
+        outputs = self.model(**model_input_kwargs)
+
+        logits = outputs.logits
+        lm_prob = torch.zeros(logits.shape[0])
+        loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
+        for k in range(lm_prob.shape[0]):
+            lm_prob[k] = (-loss_fct(logits[k], labels[k])).exp()  # exp to cancel the log and get raw prob between 0 and 1
+        return lm_prob
+
+    @torch.no_grad()
+    @torch.autocast(device_type='cuda', dtype=torch.bfloat16)
+    def generate(
+        self,
+        images: List[str],
+        prompts: List[str],
+        temperature: float = 0.2,
+    ):
+        """Generate one output string for each (image, prompt) pair"""
+        assert len(images) == len(prompts), 'Number of images and prompts must match'
+
+        # Formatting for CLIP-FlanT5 desired input including system message and image tokens
+        questions = [format_question(prompt, conversation_style=self.conversational_style) for prompt in prompts]
+        images = self.load_images(images)
+
+        input_ids = [t5_tokenizer_image_token(qs, self.tokenizer, return_tensors='pt') for qs in questions]
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        input_ids = input_ids[:, :self.tokenizer.model_max_length]
+
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+
+        input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
+        model_input_kwargs = {
+            'inputs': input_ids,
+            'images': images,
+            'attention_mask': attention_mask,
+            'do_sample': True if temperature > 0 else False,
+            'temperature': temperature,
+            'top_p': None,
+            'num_beams': 1,
+            'max_new_tokens': 1024,
+            'use_cache': True,
+        }
+
+        outputs = self.model.generate(**model_input_kwargs)
+        outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        for i in range(len(outputs)):
+            if outputs[i].endswith(' '):
+                outputs[i] = outputs[i][:-1]
+            outputs[i] = outputs[i].strip()
+        return outputs
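For reference, a hedged usage sketch of the CLIPT5Model wrapper defined in this file. It assumes the VQAScoreModel base class calls load_model() on construction (an assumption not shown in this diff), a CUDA device, and placeholder image paths:

model = CLIPT5Model(model_name='clip-flant5-xxl', device='cuda')
scores = model.forward(
    images=['image_0.jpg', 'image_1.jpg'],           # placeholder paths
    texts=['a photo of a cat', 'a photo of a cat'],
)
# Each score is exp(-mean token cross-entropy) of the answer "Yes" to
# 'Does this figure show "<text>"? Please answer yes or no.', so it lies in
# (0, 1]; higher means the image is judged more consistent with the text.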